arm_compute v19.05
diff --git a/src/core/AccessWindowAutoPadding.cpp b/src/core/AccessWindowAutoPadding.cpp
index 74af99b..cfb36e1 100644
--- a/src/core/AccessWindowAutoPadding.cpp
+++ b/src/core/AccessWindowAutoPadding.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -47,10 +47,10 @@
 {
     if(_info == nullptr)
     {
-        return ValidRegion();
+        return ValidRegion{};
     }
 
-    return ValidRegion(Coordinates(), _info->tensor_shape());
+    return ValidRegion{ Coordinates(), _info->tensor_shape() };
 }
 
 void AccessWindowAutoPadding::set_valid_region()
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
index 18ef185..f4ceca8 100644
--- a/src/core/CL/CLHelpers.cpp
+++ b/src/core/CL/CLHelpers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -214,7 +214,10 @@
         WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(3, 3)),
         WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)),
         WinogradConfiguration(std::pair<int, int>(4, 1), std::pair<int, int>(5, 1)),
-        WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 5))
+        WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 5)),
+        WinogradConfiguration(std::pair<int, int>(1, 2), std::pair<int, int>(1, 7)),
+        WinogradConfiguration(std::pair<int, int>(2, 1), std::pair<int, int>(7, 1)),
+        WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(7, 7)),
     };
 
     auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height),
@@ -255,4 +258,11 @@
             return 1;
     }
 }
+
+bool preferred_dummy_work_items_support(const cl::Device &device)
+{
+    ARM_COMPUTE_UNUSED(device);
+    // TODO (COMPMID-2044)
+    return true;
+}
 } // namespace arm_compute
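
[Editor's note] The new hook pairs with the extended enqueue() overload introduced in ICLKernel.cpp below. A minimal sketch of how a kernel's run() might wire the two together (the surrounding plumbing — queue, window, lws_hint() — is the usual ICLKernel machinery, shown only for illustration):

    // Sketch, not part of this patch: query the device preference once,
    // then forward it to the new enqueue() overload so the GWS can be padded.
    const bool use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
    enqueue(queue, *this, window, lws_hint(), use_dummy_work_items);
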
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 4ecb885..df60001 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -190,6 +190,7 @@
     { "compare_lessequal_quantized", "comparisons.cl" },
     { "concatenate_depth", "concatenate.cl" },
     { "concatenate_width", "concatenate.cl" },
+    { "concatenate_height", "concatenate.cl" },
     { "concatenate_width_x2", "concatenate.cl" },
     { "concatenate_width_x4", "concatenate.cl" },
     { "convolution_rectangle", "convolution_rectangle.cl" },
@@ -212,16 +213,18 @@
     { "copy_plane", "channel_extract.cl" },
     { "copy_planes_3p", "channel_combine.cl" },
     { "copy_to_keypoint", "fast_corners.cl" },
+    { "crop_tensor", "crop_tensor.cl" },
+    { "deconvolution_reshape", "deconvolution_layer.cl" },
     { "deconvolution_upsample", "deconvolution_layer.cl" },
     { "depthwise_convolution_3x3", "depthwise_convolution.cl" },
     { "depthwise_convolution_3x3_f16", "depthwise_convolution.cl" },
     { "depthwise_convolution_3x3_nhwc", "depthwise_convolution.cl" },
     { "depthwise_convolution_3x3_nhwc_stride1", "depthwise_convolution.cl" },
-    { "depthwise_convolution_3x3_quantized_nchw", "depthwise_convolution_quantized.cl" },
-    { "depthwise_convolution_3x3_quantized_nhwc", "depthwise_convolution_quantized.cl" },
-    { "depthwise_convolution_3x3_quantized_nhwc_stride1", "depthwise_convolution_quantized.cl" },
-    { "depthwise_convolution_3x3_quantized_dot8_nchw", "depthwise_convolution_quantized.cl" },
-    { "depthwise_convolution_3x3_quantized_dot8_nhwc_stride1", "depthwise_convolution_quantized.cl" },
+    { "dwc_3x3_native_qasymm8_nchw", "depthwise_convolution_quantized.cl" },
+    { "dwc_3x3_native_qasymm8_dot8_nchw", "depthwise_convolution_quantized.cl" },
+    { "dwc_3x3_reshaped_qasymm8_nhwc", "depthwise_convolution_quantized.cl" },
+    { "dwc_3x3_reshaped_qasymm8_stride1_nhwc", "depthwise_convolution_quantized.cl" },
+    { "dwc_3x3_reshaped_qasymm8_dot8_stride1_nhwc", "depthwise_convolution_quantized.cl" },
     { "depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16", "depthwise_convolution.cl" },
     { "depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16", "depthwise_convolution.cl" },
     { "depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32", "depthwise_convolution.cl" },
@@ -258,12 +261,39 @@
     { "elementwise_unary", "elementwise_unary.cl" },
     { "erode", "erode.cl" },
     { "fast_corners", "fast_corners.cl" },
-    { "flatten", "flatten.cl" },
+    { "fft_digit_reverse_axis_0", "fft_digit_reverse.cl" },
+    { "fft_digit_reverse_axis_1", "fft_digit_reverse.cl" },
+    { "fft_radix_2_first_stage_axis_0", "fft.cl" },
+    { "fft_radix_2_first_stage_axis_1", "fft.cl" },
+    { "fft_radix_2_axis_0", "fft.cl" },
+    { "fft_radix_2_axis_1", "fft.cl" },
+    { "fft_radix_3_first_stage_axis_0", "fft.cl" },
+    { "fft_radix_3_first_stage_axis_1", "fft.cl" },
+    { "fft_radix_3_axis_0", "fft.cl" },
+    { "fft_radix_3_axis_1", "fft.cl" },
+    { "fft_radix_4_first_stage_axis_0", "fft.cl" },
+    { "fft_radix_4_first_stage_axis_1", "fft.cl" },
+    { "fft_radix_4_axis_0", "fft.cl" },
+    { "fft_radix_4_axis_1", "fft.cl" },
+    { "fft_radix_5_first_stage_axis_0", "fft.cl" },
+    { "fft_radix_5_first_stage_axis_1", "fft.cl" },
+    { "fft_radix_5_axis_0", "fft.cl" },
+    { "fft_radix_5_axis_1", "fft.cl" },
+    { "fft_radix_7_first_stage_axis_0", "fft.cl" },
+    { "fft_radix_7_first_stage_axis_1", "fft.cl" },
+    { "fft_radix_7_axis_0", "fft.cl" },
+    { "fft_radix_7_axis_1", "fft.cl" },
+    { "fft_radix_8_first_stage_axis_0", "fft.cl" },
+    { "fft_radix_8_first_stage_axis_1", "fft.cl" },
+    { "fft_radix_8_axis_0", "fft.cl" },
+    { "fft_radix_8_axis_1", "fft.cl" },
+    { "fft_scale_conj", "fft_scale.cl" },
     { "fill_image_borders_constant", "fill_border.cl" },
     { "fill_image_borders_replicate", "fill_border.cl" },
     { "finalize", "optical_flow_pyramid_lk.cl" },
-    { "fuse_batchnormalization_layer", "batchnormalization_layer.cl" },
+    { "flatten", "flatten.cl" },
     { "floor_layer", "floor.cl" },
+    { "fuse_batchnormalization_layer", "batchnormalization_layer.cl" },
     { "gather", "gather.cl" },
     { "gaussian1x5_sub_x", "gaussian_pyramid.cl" },
     { "gaussian5x1_sub_y", "gaussian_pyramid.cl" },
@@ -284,6 +314,8 @@
     { "gemm_mm_floating_point_f32_bifrost", "gemm.cl" },
     { "gemm_mm_floating_point_f32_bifrost_1000", "gemm.cl" },
     { "gemm_mm_reshaped_lhs_nt_rhs_t", "gemm.cl" },
+    { "gemm_mm_reshaped_only_rhs_nt", "gemm.cl" },
+    { "gemm_mm_reshaped_only_rhs_t", "gemm.cl" },
     { "gemm_lc_vm_f32", "gemm.cl" },
     { "gemm_transpose1xW", "gemm.cl" },
     { "gemm_reshape_lhs_matrix_nt", "gemm.cl" },
@@ -301,6 +333,7 @@
     { "gemmlowp_mm_interleaved_transposed_midgard", "gemmlowp.cl" },
     { "gemmlowp_mm_reshaped_lhs_nt_rhs_t", "gemmlowp.cl" },
     { "gemmlowp_mm_reshaped_lhs_nt_rhs_t_dot8", "gemmlowp.cl" },
+    { "gemmlowp_mm_reshaped_only_rhs_t", "gemmlowp.cl" },
     { "gemmlowp_offset_contribution", "gemmlowp.cl" },
     { "gemmlowp_offset_contribution_quantize_down", "gemmlowp.cl" },
     { "gemmlowp_offset_contribution_quantize_down_fixedpoint", "gemmlowp.cl" },
@@ -373,6 +406,7 @@
     { "NV21_to_YUV444_bt709", "color_convert.cl" },
     { "output_stage_quantized", "direct_convolution_1x1_3x3_5x5_quantized.cl" },
     { "permute", "permute.cl" },
+    { "pixelwise_mul_complex", "pixelwise_mul_float.cl" },
     { "pixelwise_mul_float", "pixelwise_mul_float.cl" },
     { "pixelwise_mul_int", "pixelwise_mul_int.cl" },
     { "pixelwise_mul_quantized", "pixelwise_mul_int.cl" },
@@ -470,6 +504,9 @@
     { "winograd_filter_transform_4x4_5x5_nhwc", "winograd_filter_transform.cl" },
     { "winograd_filter_transform_4x1_5x1_nhwc", "winograd_filter_transform.cl" },
     { "winograd_filter_transform_1x4_1x5_nhwc", "winograd_filter_transform.cl" },
+    { "winograd_filter_transform_2x2_7x7_nhwc", "winograd_filter_transform.cl" },
+    { "winograd_filter_transform_2x1_7x1_nhwc", "winograd_filter_transform.cl" },
+    { "winograd_filter_transform_1x2_1x7_nhwc", "winograd_filter_transform.cl" },
     { "winograd_input_transform_2x2_3x3_stepz1_nchw", "winograd_input_transform.cl" },
     { "winograd_input_transform_2x2_3x3_stepz2_nchw", "winograd_input_transform.cl" },
     { "winograd_input_transform_2x1_3x1_stepz1_nchw", "winograd_input_transform.cl" },
@@ -488,6 +525,9 @@
     { "winograd_input_transform_4x4_5x5_stepz1_nhwc", "winograd_input_transform.cl" },
     { "winograd_input_transform_4x1_5x1_stepz1_nhwc", "winograd_input_transform.cl" },
     { "winograd_input_transform_1x4_1x5_stepz1_nhwc", "winograd_input_transform.cl" },
+    { "winograd_input_transform_2x2_7x7_stepz1_nhwc", "winograd_input_transform.cl" },
+    { "winograd_input_transform_2x1_7x1_stepz1_nhwc", "winograd_input_transform.cl" },
+    { "winograd_input_transform_1x2_1x7_stepz1_nhwc", "winograd_input_transform.cl" },
     { "winograd_output_transform_2x2_3x3_nchw", "winograd_output_transform.cl" },
     { "winograd_output_transform_2x1_3x1_nchw", "winograd_output_transform.cl" },
     { "winograd_output_transform_1x2_1x3_nchw", "winograd_output_transform.cl" },
@@ -503,6 +543,9 @@
     { "winograd_output_transform_4x4_5x5_nhwc", "winograd_output_transform.cl" },
     { "winograd_output_transform_4x1_5x1_nhwc", "winograd_output_transform.cl" },
     { "winograd_output_transform_1x4_1x5_nhwc", "winograd_output_transform.cl" },
+    { "winograd_output_transform_2x2_7x7_nhwc", "winograd_output_transform.cl" },
+    { "winograd_output_transform_2x1_7x1_nhwc", "winograd_output_transform.cl" },
+    { "winograd_output_transform_1x2_1x7_nhwc", "winograd_output_transform.cl" },
     { "yolo_layer_nchw", "yolo_layer.cl" },
     { "yolo_layer_nhwc", "yolo_layer.cl" },
     { "YUYV422_to_IYUV_bt709", "color_convert.cl" },
@@ -607,6 +650,10 @@
 #include "./cl_kernels/copy_tensor.clembed"
     },
     {
+        "crop_tensor.cl",
+#include "./cl_kernels/crop_tensor.clembed"
+    },
+    {
         "upsample_layer.cl",
 #include "./cl_kernels/upsample_layer.clembed"
     },
@@ -675,14 +722,26 @@
 #include "./cl_kernels/fast_corners.clembed"
     },
     {
-        "flatten.cl",
-#include "./cl_kernels/flatten.clembed"
+        "fft.cl",
+#include "./cl_kernels/fft.clembed"
+    },
+    {
+        "fft_digit_reverse.cl",
+#include "./cl_kernels/fft_digit_reverse.clembed"
+    },
+    {
+        "fft_scale.cl",
+#include "./cl_kernels/fft_scale.clembed"
     },
     {
         "fill_border.cl",
 #include "./cl_kernels/fill_border.clembed"
     },
     {
+        "flatten.cl",
+#include "./cl_kernels/flatten.clembed"
+    },
+    {
         "floor.cl",
 #include "./cl_kernels/floor.clembed"
     },
@@ -1035,7 +1094,7 @@
     return Kernel(kernel_name, cl_program);
 }
 
-void CLKernelLibrary::add_built_program(const std::string &built_program_name, cl::Program program)
+void CLKernelLibrary::add_built_program(const std::string &built_program_name, const cl::Program &program)
 {
     _built_programs_map.emplace(built_program_name, program);
 }
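
[Editor's note] The name-to-file table above is what the library consults when a kernel is requested. A hedged usage sketch for one of the newly registered kernels (StringSet as the build-options alias is an assumption about CLKernelLibrary; the options themselves are illustrative):

    // Assumed alias: CLKernelLibrary::StringSet == std::set<std::string>
    CLKernelLibrary::StringSet build_opts = { "-DDATA_TYPE=float", "-DVEC_SIZE=4", "-DHEIGHT_OFFSET=128", "-DDEPTH=16" };
    auto kernel = CLKernelLibrary::get().create_kernel("concatenate_height", build_opts);
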
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index 995fcb4..2d28a49 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -36,7 +36,7 @@
 
 using namespace arm_compute;
 
-void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint)
+void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint, bool use_dummy_work_items)
 {
     if(kernel.kernel()() == nullptr)
     {
@@ -58,6 +58,13 @@
         return;
     }
 
+    // Use dummy work-items
+    if(use_dummy_work_items)
+    {
+        gws.get()[0] = get_next_power_two(gws[0]);
+        gws.get()[1] = get_next_power_two(gws[1]);
+    }
+
     cl::NDRange valid_lws;
     if(lws_hint[0] * lws_hint[1] * lws_hint[2] > kernel.get_max_workgroup_size())
     {
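
[Editor's note] get_next_power_two() rounds each GWS dimension up, so the dispatched grid can contain work-items past the real data; kernels must bounds-check so these extra "dummy" work-items never touch memory. A minimal sketch of the rounding (the classic bit-smearing idiom; the real helper lives in the library's utility headers):

    #include <cstdint>

    // Round up to the next power of two; values that are already a
    // power of two are returned unchanged (e.g. 24 -> 32, 32 -> 32).
    uint32_t next_power_two(uint32_t x)
    {
        --x;
        x |= x >> 1;
        x |= x >> 2;
        x |= x >> 4;
        x |= x >> 8;
        x |= x >> 16;
        return ++x;
    }

A 24x17 GWS, for example, is padded to 32x32 before dispatch.
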
diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp
index 6725f36..ef03a5a 100644
--- a/src/core/CL/OpenCL.cpp
+++ b/src/core/CL/OpenCL.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -120,6 +120,9 @@
     LOAD_FUNCTION_PTR(clEnqueueMarker, handle);
     LOAD_FUNCTION_PTR(clWaitForEvents, handle);
 
+    // Third-party extensions
+    LOAD_FUNCTION_PTR(clImportMemoryARM, handle);
+
 #undef LOAD_FUNCTION_PTR
 
     //Don't call dlclose(handle) or all the symbols will be unloaded !
@@ -919,3 +922,27 @@
         return CL_OUT_OF_RESOURCES;
     }
 }
+
+cl_mem
+clImportMemoryARM(cl_context                      context,
+                  cl_mem_flags                    flags,
+                  const cl_import_properties_arm *properties,
+                  void                           *memory,
+                  size_t                          size,
+                  cl_int                         *errcode_ret)
+{
+    arm_compute::CLSymbols::get().load_default();
+    auto func = arm_compute::CLSymbols::get().clImportMemoryARM_ptr;
+    if(func != nullptr)
+    {
+        return func(context, flags, properties, memory, size, errcode_ret);
+    }
+    else
+    {
+        if(errcode_ret != nullptr)
+        {
+            *errcode_ret = CL_OUT_OF_RESOURCES;
+        }
+        return nullptr;
+    }
+}
\ No newline at end of file
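
[Editor's note] The new entry point comes from the cl_arm_import_memory extension. A hedged sketch of importing a host allocation through the wrapper above (property names are the extension's; error handling trimmed, and host_ptr/size_in_bytes are assumed to satisfy the extension's alignment requirements):

    const cl_import_properties_arm props[] = { CL_IMPORT_TYPE_ARM, CL_IMPORT_TYPE_HOST_ARM, 0 };
    cl_int err    = CL_SUCCESS;
    cl_mem buffer = clImportMemoryARM(context, CL_MEM_READ_WRITE, props, host_ptr, size_in_bytes, &err);

On a platform without the extension the loader leaves the function pointer null, and the wrapper returns nullptr with *errcode_ret set to CL_OUT_OF_RESOURCES.
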
diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl
index dfd16e0..60307bc 100644
--- a/src/core/CL/cl_kernels/batchnormalization_layer.cl
+++ b/src/core/CL/cl_kernels/batchnormalization_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -341,22 +341,10 @@
     Vector   bn_mean = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bn_mean);
     Vector   bn_var  = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bn_var);
 
-    // In-place ops
-#ifdef IN_PLACE_W
-    Tensor4D fused_w = conv_w;
-#else  /* IN_PLACE_W */
-    Tensor4D  fused_w                      = CONVERT_TO_TENSOR4D_STRUCT(fused_w, NUM_CHANNELS);
-#endif /* IN_PLACE */
-#ifdef IN_PLACE_B
-    Vector fused_b = conv_b;
-#else  /* IN_PLACE_W */
-    Vector    fused_b                      = CONVERT_TO_VECTOR_STRUCT_NO_STEP(fused_b);
-#endif /* IN_PLACE */
-
     // Conditional ops
 #ifdef HAS_BIAS
     Vector conv_b = CONVERT_TO_VECTOR_STRUCT_NO_STEP(conv_b);
-#endif /* USE_DEFAULT_BETA */
+#endif /* HAS_BIAS */
 #ifndef USE_DEFAULT_BETA
     Vector bn_beta = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bn_beta);
 #endif /* USE_DEFAULT_BETA */
@@ -364,6 +352,19 @@
     Vector bn_gamma = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bn_gamma);
 #endif /* USE_DEFAULT_GAMMA */
 
+    // In-place ops
+#ifdef IN_PLACE_W
+    Tensor4D fused_w          = conv_w;
+    uint     fused_w_stride_x = conv_w_stride_x;
+#else  /* IN_PLACE_W */
+    Tensor4D  fused_w                      = CONVERT_TO_TENSOR4D_STRUCT(fused_w, NUM_CHANNELS);
+#endif /* IN_PLACE_W */
+#ifdef IN_PLACE_B
+    Vector fused_b = conv_b;
+#else  /* IN_PLACE_B */
+    Vector    fused_b                      = CONVERT_TO_VECTOR_STRUCT_NO_STEP(fused_b);
+#endif /* IN_PLACE_B */
+
     const int current_slice = get_global_id(2) / NUM_CHANNELS;
 
 #if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
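
[Editor's note] For orientation, the fusion computed by this kernel folds the batch-normalization statistics into the convolution parameters; with mean $\mu$, variance $\sigma^2$, scale $\gamma$, offset $\beta$ and epsilon $\epsilon$, the fused weights and bias are the standard folding:

    $$w_{fused} = \frac{\gamma}{\sqrt{\sigma^2 + \epsilon}}\,w, \qquad
      b_{fused} = \frac{\gamma\,(b - \mu)}{\sqrt{\sigma^2 + \epsilon}} + \beta$$
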
diff --git a/src/core/CL/cl_kernels/concatenate.cl b/src/core/CL/cl_kernels/concatenate.cl
index c374769..e365683 100644
--- a/src/core/CL/cl_kernels/concatenate.cl
+++ b/src/core/CL/cl_kernels/concatenate.cl
@@ -132,10 +132,10 @@
     VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
     src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2_ptr);
 
-#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2)
+#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT)
     src1_values = requantize(src1_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
     src2_values = requantize(src2_values, OFFSET_IN2, OFFSET_OUT, SCALE_IN2, SCALE_OUT);
-#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2) */
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) */
     const VEC_DATA_TYPE(int, VEC_SIZE) x_coords        = SEQ + (VEC_DATA_TYPE(int, VEC_SIZE))(x);
     const VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE) cond = CONVERT(x_coords < (VEC_DATA_TYPE(int, VEC_SIZE))(INPUT1_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE));
     const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) values    = select(src2_values, src1_values, cond);
@@ -330,6 +330,59 @@
 
 #endif /* defined(WIDTH_OFFSET) && defined(DEPTH) */
 
+#if defined(HEIGHT_OFFSET) && defined(DEPTH) && defined(VEC_SIZE)
+/** This kernel concatenates the input tensor into the output tensor along the second dimension
+ *
+ * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
+ * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
+ * @note Vector sizes supported are 2, 4, 8 and 16.
+ * @note The offset for the second spatial dimension has to be passed at compile time using -DHEIGHT_OFFSET. i.e. -DHEIGHT_OFFSET=128
+ * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+
+__kernel void concatenate_height(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst))
+{
+    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, DEPTH);
+    Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT(dst, DEPTH);
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    source_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr);
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+    const VEC_UCHAR out = requantize(source_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
+    VSTORE(VEC_SIZE)
+    (out, 0, (__global DATA_TYPE *)(dst.ptr + HEIGHT_OFFSET * dst_stride_y));
+#else  /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+    VSTORE(VEC_SIZE)
+    (source_values, 0, (__global DATA_TYPE *)(dst.ptr + HEIGHT_OFFSET * dst_stride_y));
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+}
+
+#endif /* defined(HEIGHT_OFFSET) && defined(DEPTH) && defined(VEC_SIZE) */
+
 /** This kernel concatenates the input tensor into the output tensor along the third dimension
  *
  * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
@@ -356,20 +409,19 @@
 __kernel void concatenate_depth(
     TENSOR3D_DECLARATION(src),
     TENSOR3D_DECLARATION(dst),
-    int3 offsets)
+    int offset)
 {
     Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
     Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
 
     VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    source_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, -offsets.x, -offsets.y, 0));
+    source_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr);
 
 #if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
     source_values = requantize(source_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
 #endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
 
     VSTORE(VEC_SIZE)
-    (source_values, 0, (__global DATA_TYPE *)(dst.ptr + offsets.z));
-
+    (source_values, 0, (__global DATA_TYPE *)(dst.ptr + offset));
 }
 #endif /* defined(DATA_TYPE) && defined(VEC_SIZE) */
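
[Editor's note] The requantize() calls above translate values between the two QASYMM8 quantization spaces. A float-domain model of that mapping (a sketch; the kernel-side helper may round and saturate slightly differently):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Dequantize into the real domain, then quantize into the output space.
    uint8_t requantize_scalar(uint8_t q_in, int offset_in, float scale_in,
                              int offset_out, float scale_out)
    {
        const float real = (static_cast<float>(q_in) - offset_in) * scale_in;
        const float q    = std::round(real / scale_out + offset_out);
        return static_cast<uint8_t>(std::min(255.0f, std::max(0.0f, q)));
    }
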
diff --git a/src/core/CL/cl_kernels/copy_tensor.cl b/src/core/CL/cl_kernels/copy_tensor.cl
index 4bbbf11..f4366b8 100644
--- a/src/core/CL/cl_kernels/copy_tensor.cl
+++ b/src/core/CL/cl_kernels/copy_tensor.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -77,6 +77,7 @@
 }
 #endif // Compile time constants
 
+#if defined(DATA_TYPE)
 /** Performs a copy of input tensor to the output tensor.
  *
  * @param[in]  in_ptr                            Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
@@ -103,6 +104,16 @@
     Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(in);
     Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
 
+#if defined(VEC_SIZE)
+
+#if defined(LAST_ACCESSED_X)
+    // Check if access on width gets out of bounds
+    // If it does then shift access vector to access elements within bounds
+    const int shift = max((int)(get_global_id(0) * VEC_SIZE) - (int)LAST_ACCESSED_X, 0);
+    in.ptr -= shift * in.stride_x;
+    out.ptr -= shift * out.stride_x;
+#endif // defined(LAST_ACCESSED_X)
+
     // Load data
     VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
     data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
@@ -110,4 +121,8 @@
     // Store result
     VSTORE(VEC_SIZE)
     (data, 0, (__global DATA_TYPE *)out.ptr);
+#else  // defined(VEC_SIZE)
+    *((__global DATA_TYPE *)(out.ptr)) = *((__global DATA_TYPE *)(in.ptr));
+#endif // defined(VEC_SIZE)
 }
+#endif // defined(DATA_TYPE)
\ No newline at end of file
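
[Editor's note] The LAST_ACCESSED_X shift above keeps the final vector access inside the row. Worked numbers, assuming a row width of 30 and VEC_SIZE=8 (so LAST_ACCESSED_X = 30 - 8 = 22):

    work-item 0: starts at  0, shift = max(0  - 22, 0) = 0 -> elements  0..7
    work-item 2: starts at 16, shift = max(16 - 22, 0) = 0 -> elements 16..23
    work-item 3: starts at 24, shift = max(24 - 22, 0) = 2 -> elements 22..29

The last access overlaps the previous one instead of running off the end of the row.
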
diff --git a/src/core/CL/cl_kernels/crop_tensor.cl b/src/core/CL/cl_kernels/crop_tensor.cl
new file mode 100644
index 0000000..55f8544
--- /dev/null
+++ b/src/core/CL/cl_kernels/crop_tensor.cl
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) // Compile time constants
+
+/** Performs a crop: copies a window of the input tensor to the output tensor, converting the values to F32.
+ *
+ * @param[in]  in_ptr                            Pointer to the source tensor. Supported data types: U16/S16/F16/U32/S32/F32
+ * @param[in]  in_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  in_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  in_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: same as @p in_ptr
+ * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  out_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  out_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  in_offset_y                       The initial offset of the input address along Y.
+ * @param[in]  in_offset_z                       The initial offset of the input address along Z.
+ */
+__kernel void crop_tensor(
+    TENSOR3D_DECLARATION(in),
+    TENSOR3D_DECLARATION(out),
+    int in_offset_y,
+    int in_offset_z)
+{
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(in);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+    const int in_x = get_global_id(0) * (in_step_x / in_stride_x);
+
+#if defined(WIDTH_FLIPPED)
+    const int in_y = in_offset_y - get_global_id(1);
+#else  // defined(WIDTH_FLIPPED)
+    const int in_y                 = in_offset_y + get_global_id(1);
+#endif // defined(WIDTH_FLIPPED)
+
+#if defined(HEIGHT_FLIPPED)
+    const int in_z = in_offset_z - get_global_id(2);
+#else  // defined(HEIGHT_FLIPPED)
+    const int in_z                 = in_offset_z + get_global_id(2);
+#endif // defined(HEIGHT_FLIPPED)
+
+#if defined(VEC_SIZE)
+
+#if defined(LAST_ACCESSED_X)
+    // Check if access on width gets out of bounds
+    // If it does then shift access vector to access elements within bounds
+    const int shift = max((int)(get_global_id(0) * VEC_SIZE) - (int)LAST_ACCESSED_X, 0);
+    in.ptr -= shift * in.stride_x;
+    out.ptr -= shift * out.stride_x;
+#endif // defined(LAST_ACCESSED_X)
+
+    __global const uchar *input_addr = tensor3D_offset(&in, in_x, in_y, in_z);
+
+    // Load data
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);
+
+    // Store result
+    VSTORE(VEC_SIZE)
+    (CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE)), 0, (__global float *)out.ptr);
+#else  // defined(VEC_SIZE)
+    *((__global float *)(out.ptr)) = CONVERT(*((__global DATA_TYPE *)tensor3D_offset(&in, in_x, in_y, in_z)), float);
+#endif // defined(VEC_SIZE)
+}
+
+#endif // defined(DATA_TYPE)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/deconvolution_layer.cl b/src/core/CL/cl_kernels/deconvolution_layer.cl
index e5169f9..ea2455c 100644
--- a/src/core/CL/cl_kernels/deconvolution_layer.cl
+++ b/src/core/CL/cl_kernels/deconvolution_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -52,3 +52,79 @@
     // Store result
     *((__global DATA_TYPE *)dst.ptr) = *((__global DATA_TYPE *)src.ptr);
 }
+
+#if defined(FILTER_WIDTH) && defined(FILTER_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE)
+/** This kernel reshapes the deconvolution output tensor before returning the result of the Deconvolution. The deconvolution output tensor
+ * is the result of a @ref CLGEMM operation between the deconvolution input and the deconvolution filter
+ *
+ * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type, e.g., -DDATA_TYPE=F32
+ * @note The width of the filter should be given as a preprocessor argument using -DFILTER_WIDTH=width, e.g., -DFILTER_WIDTH=2
+ * @note The height of the filter should be given as a preprocessor argument using -DFILTER_HEIGHT=height, e.g., -DFILTER_HEIGHT=2
+ * @note The width of the input should be given as a preprocessor argument using -DSRC_WIDTH=width, e.g., -DSRC_WIDTH=10
+ * @note The height of the input should be given as a preprocessor argument using -DSRC_HEIGHT=height, e.g., -DSRC_HEIGHT=10
+ * @note The output data layout is NHWC if the preprocessor argument NUM_FILTERS is defined, NCHW if NUM_FILTERS is not defined
+ *
+ * @param[in]  src_ptr                            Pointer to the source image. Supported data types: QASYMM8/F16/F32
+ * @param[in]  src_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] dst_ptr                            Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination image
+ * @param[in]  bias_ptr                           (Optional) Pointer to the biases vector. Supported data types: F16/F32/S32
+ * @param[in]  bias_stride_x                      (Optional) Stride of the biases vector in X dimension (in bytes)
+ * @param[in]  bias_step_x                        (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
+ */
+__kernel void deconvolution_reshape(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst)
+#if defined(ADD_BIAS)
+    ,
+    VECTOR_DECLARATION(bias)
+#endif // defined(ADD_BIAS)
+)
+{
+#define FILTER_AREA ((FILTER_WIDTH) * (FILTER_HEIGHT))
+
+    Tensor3D        src  = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D        dst  = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(dst);
+    const DATA_TYPE data = *(__global DATA_TYPE *)src.ptr;
+
+    // Store result
+    const int x_in = get_global_id(0);
+    const int y_in = get_global_id(1);
+    const int z_in = get_global_id(2);
+
+#if defined(NUM_FILTERS)
+    const int bias_index = x_in / (FILTER_AREA);
+    const int z_out      = bias_index + (NUM_FILTERS) * (z_in / (SRC_HEIGHT));
+    const int x_out      = x_in % (FILTER_WIDTH) + y_in * (FILTER_WIDTH);
+    const int y_out      = (FILTER_HEIGHT) * (z_in % (SRC_HEIGHT)) + ((x_in % (FILTER_AREA)) / (FILTER_WIDTH));
+#else  // defined(NUM_FILTERS)
+    const int x_out      = x_in / (FILTER_AREA);
+    const int y_out      = x_in % (FILTER_WIDTH) + y_in * (FILTER_WIDTH);
+    const int z_out      = (FILTER_HEIGHT) * z_in + ((x_in % (FILTER_AREA)) / (FILTER_WIDTH));
+    const int bias_index = x_out;
+#endif // defined(NUM_FILTERS)
+
+#if defined(ADD_BIAS)
+    Vector          bias     = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+    const DATA_TYPE bias_val = *(__global DATA_TYPE *)vector_offset(&bias, bias_index);
+    *((__global DATA_TYPE *)tensor3D_offset(&dst, x_out, y_out, z_out)) = data + bias_val;
+#else  // defined(ADD_BIAS)
+    *((__global DATA_TYPE *)tensor3D_offset(&dst, x_out, y_out, z_out)) = data;
+#endif // defined(ADD_BIAS)
+
+#undef FILTER_AREA
+}
+#endif // defined(FILTER_WIDTH) && defined(FILTER_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE)
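
[Editor's note] A worked instance of the NCHW branch above (NUM_FILTERS undefined), assuming FILTER_WIDTH = FILTER_HEIGHT = 2, so FILTER_AREA = 4:

    GEMM output element (x_in = 6, y_in = 3, z_in = 1) is scattered to:
      x_out = 6 / 4               = 1   (output channel, also the bias index)
      y_out = 6 % 2 + 3 * 2       = 6
      z_out = 2 * 1 + (6 % 4) / 2 = 3
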
diff --git a/src/core/CL/cl_kernels/depthwise_convolution.cl b/src/core/CL/cl_kernels/depthwise_convolution.cl
index 4f6fdfa..a8611af 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution.cl
@@ -24,7 +24,141 @@
 
 #include "helpers.h"
 
-#if defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
+#if defined(FUSED_ACTIVATION)
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define SELECT_TYPE VEC_DATA_TYPE(SELECT_DATA_TYPE, VEC_SIZE)
+#include "activation_helpers.h"
+#define ACTIVATION_FUNC(x) ACTIVATION_OP(FUSED_ACTIVATION, x)
+#else /* defined(FUSED_ACTIVATION) */
+#define ACTIVATION_FUNC(x) (x)
+#endif /* defined(FUSED_ACTIVATION) */
+
+/** Get the pointer position at a certain offset in x and y direction.
+ *
+ * @param[in] ptr      Pointer to the starting position of the buffer
+ * @param[in] x        Relative X position
+ * @param[in] y        Relative Y position
+ * @param[in] stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] stride_y Stride of the source tensor in Y dimension (in bytes)
+ *
+ * @return A pointer to the element at offset (x, y)
+ */
+inline __global uchar *ptr_offset(__global uchar *ptr, const int x, const int y, const int stride_x, const int stride_y)
+{
+    return ptr + x * stride_x + y * stride_y;
+}
+
+#if(DILATION_X == 1 && DILATION_Y == 1)
+
+#define CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, src0, weights_row0) \
+    ({                                                             \
+        acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0);            \
+        acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0);            \
+        acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0);            \
+        acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1);            \
+        acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1);            \
+        acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1);            \
+    })
+
+#define CONVOLUTION1x3_BIFROST4X1_STRIDE1(acc, src0, weights_row0) \
+    ({                                                             \
+        acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0);            \
+        acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0);            \
+        acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0);            \
+        acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1);            \
+        acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1);            \
+        acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1);            \
+        acc.s2 = fma(src0.s2, weights_row0.s0, acc.s2);            \
+        acc.s2 = fma(src0.s3, weights_row0.s1, acc.s2);            \
+        acc.s2 = fma(src0.s4, weights_row0.s2, acc.s2);            \
+        acc.s3 = fma(src0.s3, weights_row0.s0, acc.s3);            \
+        acc.s3 = fma(src0.s4, weights_row0.s1, acc.s3);            \
+        acc.s3 = fma(src0.s5, weights_row0.s2, acc.s3);            \
+    })
+
+#define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0, src1, weights_row0) \
+    ({                                                                   \
+        acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0);                  \
+        acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0);                  \
+        acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0);                  \
+        acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1);                  \
+        acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1);                  \
+        acc.s1 = fma(src1.s0, weights_row0.s2, acc.s1);                  \
+    })
+
+#define CONVOLUTION1x3_BIFROST4X1_STRIDE2(acc, src0, src1, weights_row0) \
+    ({                                                                   \
+        acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0);                  \
+        acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0);                  \
+        acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0);                  \
+        acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1);                  \
+        acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1);                  \
+        acc.s1 = fma(src0.s4, weights_row0.s2, acc.s1);                  \
+        acc.s2 = fma(src0.s4, weights_row0.s0, acc.s2);                  \
+        acc.s2 = fma(src0.s5, weights_row0.s1, acc.s2);                  \
+        acc.s2 = fma(src0.s6, weights_row0.s2, acc.s2);                  \
+        acc.s3 = fma(src0.s6, weights_row0.s0, acc.s3);                  \
+        acc.s3 = fma(src0.s7, weights_row0.s1, acc.s3);                  \
+        acc.s3 = fma(src1.s0, weights_row0.s2, acc.s3);                  \
+    })
+
+#else /* DILATION_X==1 && DILATION_Y==1 */
+
+#define CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, src0_left, src0_mid, src0_right, weights_row0) \
+    ({                                                                                        \
+        acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0);                                  \
+        acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0);                                   \
+        acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0);                                 \
+        acc.s1 = fma(src0_left.s1, weights_row0.s0, acc.s1);                                  \
+        acc.s1 = fma(src0_mid.s1, weights_row0.s1, acc.s1);                                   \
+        acc.s1 = fma(src0_right.s1, weights_row0.s2, acc.s1);                                 \
+    })
+
+#define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0_left, src0_mid, src0_right, weights_row0) \
+    ({                                                                                        \
+        acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0);                                  \
+        acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0);                                   \
+        acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0);                                 \
+        acc.s1 = fma(src0_left.s2, weights_row0.s0, acc.s1);                                  \
+        acc.s1 = fma(src0_mid.s2, weights_row0.s1, acc.s1);                                   \
+        acc.s1 = fma(src0_right.s2, weights_row0.s2, acc.s1);                                 \
+    })
+
+#define CONVOLUTION1x3_BIFROST4X1_STRIDE1(acc, src0_left, src0_mid, src0_right, weights_row0) \
+    ({                                                                                        \
+        acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0);                                  \
+        acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0);                                   \
+        acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0);                                 \
+        acc.s1 = fma(src0_left.s1, weights_row0.s0, acc.s1);                                  \
+        acc.s1 = fma(src0_mid.s1, weights_row0.s1, acc.s1);                                   \
+        acc.s1 = fma(src0_right.s1, weights_row0.s2, acc.s1);                                 \
+        acc.s2 = fma(src0_left.s2, weights_row0.s0, acc.s2);                                  \
+        acc.s2 = fma(src0_mid.s2, weights_row0.s1, acc.s2);                                   \
+        acc.s2 = fma(src0_right.s2, weights_row0.s2, acc.s2);                                 \
+        acc.s3 = fma(src0_left.s3, weights_row0.s0, acc.s3);                                  \
+        acc.s3 = fma(src0_mid.s3, weights_row0.s1, acc.s3);                                   \
+        acc.s3 = fma(src0_right.s3, weights_row0.s2, acc.s3);                                 \
+    })
+
+#define CONVOLUTION1x3_BIFROST4X1_STRIDE2(acc, src0_left, src0_mid, src0_right, weights_row0) \
+    ({                                                                                        \
+        acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0);                                  \
+        acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0);                                   \
+        acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0);                                 \
+        acc.s1 = fma(src0_left.s2, weights_row0.s0, acc.s1);                                  \
+        acc.s1 = fma(src0_mid.s2, weights_row0.s1, acc.s1);                                   \
+        acc.s1 = fma(src0_right.s2, weights_row0.s2, acc.s1);                                 \
+        acc.s2 = fma(src0_left.s4, weights_row0.s0, acc.s2);                                  \
+        acc.s2 = fma(src0_mid.s4, weights_row0.s1, acc.s2);                                   \
+        acc.s2 = fma(src0_right.s4, weights_row0.s2, acc.s2);                                 \
+        acc.s3 = fma(src0_left.s6, weights_row0.s0, acc.s3);                                  \
+        acc.s3 = fma(src0_mid.s6, weights_row0.s1, acc.s3);                                   \
+        acc.s3 = fma(src0_right.s6, weights_row0.s2, acc.s3);                                 \
+    })
+
+#endif /* DILATION_X==1 && DILATION_Y==1 */
+
+#if defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F32)
 #if defined(CONV_STRIDE_X)
 
 #if CONV_STRIDE_X == 1
@@ -51,13 +185,18 @@
                                       const float           middle_coeff,
                                       const float           right_coeff)
 {
+#if(DILATION_X == 1 && DILATION_Y == 1)
     float4 temp = vload4(0, (__global float *)left_pixel);
 
     float2 left   = CONVERT(temp.s01, float2);
     float2 middle = CONVERT(temp.s12, float2);
     float2 right  = CONVERT(temp.s23, float2);
-
     return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
+#else  /* DILATION_X==1 && DILATION_Y==1 */
+    return vload2(0, (__global float *)left_pixel) * (float2)left_coeff
+           + vload2(0, (__global float *)(left_pixel) + DILATION_X) * (float2)middle_coeff
+           + vload2(0, (__global float *)(left_pixel) + 2 * DILATION_X) * (float2)right_coeff;
+#endif /* DILATION_X==1 && DILATION_Y==1 */
 }
 
 /** Compute a 1D horizontal convolution of size 3 and stride 2 for floating point type.
@@ -74,6 +213,7 @@
                                       const float           middle_coeff,
                                       const float           right_coeff)
 {
+#if(DILATION_X == 1 && DILATION_Y == 1)
     float4 temp0 = vload4(0, (__global float *)left_pixel);
     float  temp1 = *((__global float *)(left_pixel + 4 * sizeof(float)));
 
@@ -82,6 +222,14 @@
     float2 right  = CONVERT((float2)(temp0.s2, temp1), float2);
 
     return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
+#else /* DILATION_X==1 && DILATION_Y==1 */
+    __global float *left_pixel_float = (__global float *)left_pixel;
+
+    return vload4(0, left_pixel_float).s02 * (float2)left_coeff
+           + vload4(0, left_pixel_float + DILATION_X).s02 * (float2)middle_coeff
+           + vload4(0, left_pixel_float + DILATION_X * 2).s02 * (float2)right_coeff;
+
+#endif /* DILATION_X==1 && DILATION_Y==1 */
 }
 
 /** Compute a 1D horizontal convolution of size 3 and stride 3 for floating point type.
@@ -98,6 +246,7 @@
                                       const float           middle_coeff,
                                       const float           right_coeff)
 {
+#if(DILATION_X == 1 && DILATION_Y == 1)
     float4 temp0 = vload4(0, (__global float *)left_pixel);
     float2 temp1 = vload2(0, (__global float *)(left_pixel + 4 * sizeof(float)));
 
@@ -106,6 +255,13 @@
     float2 right  = CONVERT((float2)(temp0.s2, temp1.s1), float2);
 
     return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
+#else  /* DILATION_X==1 && DILATION_Y==1 */
+    __global float *left_pixel_float = (__global float *)left_pixel;
+
+    return (float2)(*left_pixel_float, *(left_pixel_float + 3)) * (float2)left_coeff
+           + (float2)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 3)) * (float2)middle_coeff
+           + (float2)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 3)) * (float2)right_coeff;
+#endif /* DILATION_X==1 && DILATION_Y==1 */
 }
 
 /** Apply a 3x3 convolution matrix to a single channel F32 input image and return the result.
@@ -139,8 +295,8 @@
     float2 pixels;
 
     pixels = convolution1x3(offset(src, 0, 0), mat0, mat1, mat2);
-    pixels += convolution1x3(offset(src, 0, 1), mat3, mat4, mat5);
-    pixels += convolution1x3(offset(src, 0, 2), mat6, mat7, mat8);
+    pixels += convolution1x3(offset(src, 0, DILATION_Y), mat3, mat4, mat5);
+    pixels += convolution1x3(offset(src, 0, DILATION_Y * 2), mat6, mat7, mat8);
 
     return pixels;
 }
@@ -212,65 +368,99 @@
     pixels += (float2)(*((__global float *)(biases.ptr + channel * biases_stride_x)));
 #endif //defined(HAS_BIAS)
 
-    vstore2(pixels, 0, (__global float *)dst.ptr);
+    vstore2(ACTIVATION_FUNC(pixels), 0, (__global float *)dst.ptr);
 }
 #endif //defined(CONV_STRIDE_X)
 
-#define CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, src0, weights_row0) \
-    ({                                                             \
-        acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0);            \
-        acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0);            \
-        acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0);            \
-        acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1);            \
-        acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1);            \
-        acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1);            \
-    })
+#if(DILATION_X > 1 || DILATION_Y > 1)
 
-#define CONVOLUTION1x3_BIFROST4X1_STRIDE1(acc, src0, weights_row0) \
-    ({                                                             \
-        acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0);            \
-        acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0);            \
-        acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0);            \
-        acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1);            \
-        acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1);            \
-        acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1);            \
-        acc.s2 = fma(src0.s2, weights_row0.s0, acc.s2);            \
-        acc.s2 = fma(src0.s3, weights_row0.s1, acc.s2);            \
-        acc.s2 = fma(src0.s4, weights_row0.s2, acc.s2);            \
-        acc.s3 = fma(src0.s3, weights_row0.s0, acc.s3);            \
-        acc.s3 = fma(src0.s4, weights_row0.s1, acc.s3);            \
-        acc.s3 = fma(src0.s5, weights_row0.s2, acc.s3);            \
-    })
+/** Perform 3x3 convolution for stride_x=1 and stride_y=1 when DILATION_X>1 or DILATION_Y>1 for F32
+ *
+ * @param[in] src_addr         Pointer to the starting position of where to perform the convolution
+ * @param[in] stride_x_bytes   Stride of the source tensor in X dimension (in bytes)
+ * @param[in] stride_y_bytes   Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] y_offset         Offset from the source tensor from which to start convolution
+ * @param[in] weights_addr     Pointer from where to get weights
+ * @param[in] weights_stride_y Stride of weights tensor in Y dimension
+ */
+inline float2 convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
+                                                                     const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
+{
+    // Load the weights
+    float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
+    float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
+    float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
 
-#define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0, src1, weights_row0) \
-    ({                                                                   \
-        acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0);                  \
-        acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0);                  \
-        acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0);                  \
-        acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1);                  \
-        acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1);                  \
-        acc.s1 = fma(src1.s0, weights_row0.s2, acc.s1);                  \
-    })
+    float2 pixels0 = 0.0f;
 
-#define CONVOLUTION1x3_BIFROST4X1_STRIDE2(acc, src0, src1, weights_row0) \
-    ({                                                                   \
-        acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0);                  \
-        acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0);                  \
-        acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0);                  \
-        acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1);                  \
-        acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1);                  \
-        acc.s1 = fma(src0.s4, weights_row0.s2, acc.s1);                  \
-        acc.s2 = fma(src0.s4, weights_row0.s0, acc.s2);                  \
-        acc.s2 = fma(src0.s5, weights_row0.s1, acc.s2);                  \
-        acc.s2 = fma(src0.s6, weights_row0.s2, acc.s2);                  \
-        acc.s3 = fma(src0.s6, weights_row0.s0, acc.s3);                  \
-        acc.s3 = fma(src0.s7, weights_row0.s1, acc.s3);                  \
-        acc.s3 = fma(src1.s0, weights_row0.s2, acc.s3);                  \
-    })
+    float2 src00_left  = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
+    float2 src00_mid   = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
+    float2 src00_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
+
+    float2 src10_left  = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
+    float2 src10_mid   = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
+    float2 src10_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
+
+    float2 src20_left  = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
+    float2 src20_mid   = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
+    float2 src20_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
+
+    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src00_left, src00_mid, src00_right, weights_row0);
+    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src10_left, src10_mid, src10_right, weights_row1);
+    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src20_left, src20_mid, src20_right, weights_row2);
+
+    return pixels0;
+}
+
+/** Perform 3x3 convolution for stride_x=2 and stride_y=2 when DILATION_X>1 or DILATION_Y>1 for F32
+ *
+ * @param[in] src_addr         Pointer to the starting position of where to perform the convolution
+ * @param[in] stride_x_bytes   Stride of the source tensor in X dimension (in bytes)
+ * @param[in] stride_y_bytes   Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] y_offset         Offset from the source tensor from which to start convolution
+ * @param[in] weights_addr     Pointer from where to get weights
+ * @param[in] weights_stride_y Stride of weights tensor in Y dimension
+ */
+inline float2 convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
+                                                                     const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
+{
+    // Load the weights
+    float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
+    float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
+    float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
+
+    float2 pixels0 = 0.0f;
+
+    float3 src00_left  = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
+    float3 src00_mid   = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
+    float3 src00_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
+
+    float3 src10_left  = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
+    float3 src10_mid   = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
+    float3 src10_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
+
+    float3 src20_left  = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
+    float3 src20_mid   = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
+    float3 src20_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
+
+    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src00_left, src00_mid, src00_right, weights_row0);
+    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src10_left, src10_mid, src10_right, weights_row1);
+    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src20_left, src20_mid, src20_right, weights_row2);
+
+    return pixels0;
+}
+
+#endif /* (DILATION_X > 1 || DILATION_Y > 1) */
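The dilation helpers above index the source through ptr_offset, with the tap positions scaled by DILATION_X along X and DILATION_Y along Y. A minimal host-side sketch of the assumed addressing (ptr_offset is defined earlier in this file; the names below are illustrative, not the library's):

```cpp
#include <cstdint>

// Assumed semantics of the kernel's ptr_offset helper: advance a byte
// pointer by x elements along X and y elements along Y, with both
// strides expressed in bytes.
inline const std::uint8_t *ptr_offset_ref(const std::uint8_t *base, int x, int y,
                                          int stride_x_bytes, int stride_y_bytes)
{
    return base + x * stride_x_bytes + y * stride_y_bytes;
}

// The three taps of one dilated 1x3 read then sit at x = 0, DILATION_X and
// 2 * DILATION_X, e.g. ptr_offset_ref(src, 2 * dilation_x, y, sx, sy).
```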
 
 /** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
  * stride_x and stride_y are equal to 1
  *
+ * @note It is possible to select the activation function to apply using -DFUSED_ACTIVATION e.g. -DFUSED_ACTIVATION=relu
+ * @note If an activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float.
+ * @note The A and B variables required by some activation functions are set using -DA_VAL= and -DB_VAL=, respectively
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
+ * @note The select data type must also be passed at compile time using -DSELECT_DATA_TYPE e.g. -DSELECT_DATA_TYPE=float
+ *
  * @param[in] src_ptr                               Pointer to the source tensor. Supported data types: F32
  * @param[in] src_stride_x                          Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -326,6 +516,7 @@
     __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
     __global uchar *src_addr     = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
 
+#if(DILATION_X == 1 && DILATION_Y == 1)
     // Load the weights
     float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
     float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
@@ -352,6 +543,19 @@
     CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src40, weights_row1);
     CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src50, weights_row2);
 
+#else /* DILATION_X==1 && DILATION_Y==1 */
+
+    //3x3 Convolution of elements starting in 0th row
+    pixels0 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
+    //3x3 Convolution of elements starting in 1st row
+    pixels1 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 1, weights_addr, weights_stride_y);
+    //3x3 Convolution of elements starting in 2nd row
+    pixels2 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
+    //3x3 Convolution of elements starting in 3rd row
+    pixels3 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 3, weights_addr, weights_stride_y);
+
+#endif /* DILATION_X==1 && DILATION_Y==1 */
+
 #ifdef HAS_BIAS
     Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
 
@@ -363,15 +567,21 @@
     pixels3 += (float2)bias;
 #endif /* defined(HAS_BIAS) */
 
-    vstore2(pixels0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
-    vstore2(pixels1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
-    vstore2(pixels2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
-    vstore2(pixels3, 0, (__global float *)(dst.ptr + 3 * dst_stride_y));
+    vstore2(ACTIVATION_FUNC(pixels0), 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
+    vstore2(ACTIVATION_FUNC(pixels1), 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
+    vstore2(ACTIVATION_FUNC(pixels2), 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
+    vstore2(ACTIVATION_FUNC(pixels3), 0, (__global float *)(dst.ptr + 3 * dst_stride_y));
 }
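The stores now run every accumulator through ACTIVATION_FUNC before vstore2, so the activation is fused into the convolution kernel instead of running as a separate pass. A reference sketch of what the store computes when -DFUSED_ACTIVATION=relu is assumed (the macro itself lives in the activation helpers, not in this diff):

```cpp
#include <algorithm>
#include <cstddef>

// Fused relu store: clamp each accumulator lane at zero, then write it.
// Activations parameterised by A_VAL/B_VAL (e.g. bounded relu) would
// clamp against those constants instead.
void store_row_relu(const float *acc, float *dst, std::size_t n)
{
    for (std::size_t i = 0; i < n; ++i)
        dst[i] = std::max(acc[i], 0.0f); // ACTIVATION_FUNC applied at store time
}
```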
 
 /** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
  * stride_x and stride_y are equal to 2
  *
+ * @note It is possible to select the activation function to apply using -DFUSED_ACTIVATION e.g. -DFUSED_ACTIVATION=relu
+ * @note If an activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float.
+ * @note The A and B variables required by some activation functions are set using -DA_VAL= and -DB_VAL=, respectively
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
+ * @note The select data type must also be passed at compile time using -DSELECT_DATA_TYPE e.g. -DSELECT_DATA_TYPE=float
+ *
  * @param[in] src_ptr                               Pointer to the source tensor. Supported data types: F32
  * @param[in] src_stride_x                          Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -425,6 +635,8 @@
     __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
     __global uchar *src_addr     = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
 
+#if(DILATION_X == 1 && DILATION_Y == 1)
+
     // Load the weights
     float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
     float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
@@ -449,6 +661,14 @@
     CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src30, src31, weights_row1);
     CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src40, src41, weights_row2);
 
+#else  /* DILATION_X==1 && DILATION_Y==1 */
+
+    //3x3 Convolution of elements starting in 0th row
+    pixels0 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
+    //3x3 Convolution of elements starting in 2nd row
+    pixels1 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
+#endif /* DILATION_X==1 && DILATION_Y==1 */
+
 #ifdef HAS_BIAS
     Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
 
@@ -458,11 +678,11 @@
     pixels1 += (float2)bias;
 #endif /* defined(HAS_BIAS) */
 
-    vstore2(pixels0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
-    vstore2(pixels1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
+    vstore2(ACTIVATION_FUNC(pixels0), 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
+    vstore2(ACTIVATION_FUNC(pixels1), 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
 }
 
-#endif // defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
+#endif // defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F32)
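With the extra IS_F32 guard, the F32 and F16 Bifrost paths are now selected purely by compile-time defines. A hypothetical host-side options string showing the switches this file keys on (the exact set the library passes is decided at kernel-configure time and is not part of this diff; all values below are examples only):

```cpp
#include <string>

// Illustrative build options mirroring the guards in this file: data-type
// selection (IS_F32), dilation, fused activation and vector size.
std::string example_build_options()
{
    return "-DIS_F32 "
           "-DDILATION_X=2 -DDILATION_Y=2 "
           "-DFUSED_ACTIVATION=relu -DA_VAL=0 -DB_VAL=0 "
           "-DVEC_SIZE=2 -DSELECT_DATA_TYPE=float "
           "-DDEPTH_MULTIPLIER=1 -DDST_CHANNELS=32"; // values are examples only
}
```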
 
 #if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DST_WIDTH)
 /** Reshape the weights for quantized depthwise convolution
@@ -632,11 +852,12 @@
 }
 #endif //defined(SRC_WIDTH) && defined(DATA_TYPE)
 
-#if defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE) && defined(PAD_VALUE) && defined(DEPTH_MULTIPLIER)
+#if defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE) && defined(PAD_VALUE) && defined(DEPTH_MULTIPLIER) && defined(DILATION_X) && defined(DILATION_Y)
 /** This kernel performs a reshaping of the input tensor to a tensor used to perform depthwise convolution using vector to matrix multiplication.
  *
  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The convolution information must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y, -DPAD_LEFT, -DPAD_TOP, -DPAD_RIGHT, -DPAD_BOTTOM, -DKERNEL_WIDTH, -DKERNEL_HEIGHT, -DSRC_WIDTH, -DSRC_HEIGHT, -DDEPTH_MULTIPLIER
+ * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
  *
  * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F16/F32
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
@@ -661,7 +882,7 @@
 
     const int src_pixel_linear = get_global_id(1) * STRIDE_X;
     const int full_length      = SRC_WIDTH + PAD_LEFT + PAD_RIGHT;
-    const int max_initial_x    = STRIDE_X * (((full_length - KERNEL_WIDTH) / STRIDE_X) + 1);
+    const int max_initial_x    = STRIDE_X * (((full_length - (KERNEL_WIDTH + (KERNEL_WIDTH - 1) * (DILATION_X - 1))) / STRIDE_X) + 1);
 
     const int src_x = -PAD_LEFT + src_pixel_linear % max_initial_x;
     const int src_y = -PAD_TOP + src_pixel_linear / max_initial_x * STRIDE_Y;
@@ -670,9 +891,9 @@
     __global uchar *input_ptr      = src_ptr + src_offset_first_element_in_bytes + src_z * in_stride_z;
     __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst.ptr));
 
-    for(int y = src_y; y < src_y + KERNEL_HEIGHT; ++y)
+    for(int y = src_y; y < src_y + KERNEL_HEIGHT + (KERNEL_HEIGHT - 1) * (DILATION_Y - 1); y += DILATION_Y)
     {
-        for(int x = src_x; x < src_x + KERNEL_WIDTH; ++x, ++output_ptr)
+        for(int x = src_x; x < src_x + KERNEL_WIDTH + (KERNEL_WIDTH - 1) * (DILATION_X - 1); x += DILATION_X, ++output_ptr)
         {
             if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
             {
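Both the new max_initial_x expression and the rewritten loop bounds use the effective extent of a dilated kernel, KERNEL_* + (KERNEL_* - 1) * (DILATION_* - 1): the taps plus the gaps between them. For a 3-tap kernel with DILATION_X=2 that is 3 + 2 * 1 = 5 input columns. A scalar sketch of the same bound and stepping:

```cpp
#include <cstdio>

// Effective extent of a dilated kernel along one axis.
inline int dilated_extent(int kernel, int dilation)
{
    return kernel + (kernel - 1) * (dilation - 1);
}

int main()
{
    const int kernel_w = 3, dilation_x = 2;
    std::printf("extent = %d\n", dilated_extent(kernel_w, dilation_x)); // 5

    // The im2col loop visits only the tap positions, stepping by the dilation.
    for (int x = 0; x < dilated_extent(kernel_w, dilation_x); x += dilation_x)
        std::printf("tap at column %d\n", x); // 0, 2, 4
    return 0;
}
```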
@@ -728,7 +949,7 @@
 
 #endif //defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
 
-#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F16)
 #if defined(CONV_STRIDE_X)
 #if CONV_STRIDE_X == 1
 #define convolution1x3_f16 convolution1x3_stride_1_f16
@@ -740,6 +961,86 @@
 #error "Stride not supported"
 #endif /* CONV_STRIDE_X */
 
+#if(DILATION_X > 1 || DILATION_Y > 1)
+
+/** Perform 3x3 convolution for stride_x=1 and stride_y=1 when DILATION_X>1 or DILATION_Y>1 for F16
+ *
+ * @param[in] src_addr         Pointer to the starting position of where to perform the convolution
+ * @param[in] stride_x_bytes   Stride of the source tensor in X dimension (in bytes)
+ * @param[in] stride_y_bytes   Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] y_offset         Offset from the source tensor from which to start convolution
+ * @param[in] weights_addr     Pointer from where to get weights
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ */
+inline half4 convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
+                                                                    const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
+{
+    // Load the weights
+    half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
+    half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
+    half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
+
+    half4 pixels0 = 0.0f;
+
+    half4 src00_left  = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
+    half4 src00_mid   = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
+    half4 src00_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
+
+    half4 src10_left  = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
+    half4 src10_mid   = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
+    half4 src10_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
+
+    half4 src20_left  = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
+    half4 src20_mid   = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
+    half4 src20_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
+
+    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src00_left, src00_mid, src00_right, weights_row0);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src10_left, src10_mid, src10_right, weights_row1);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src20_left, src20_mid, src20_right, weights_row2);
+
+    return pixels0;
+}
+
+/** Perform 3x3 convolution for stride_x=2 and stride_y=2 when DILATION_X>1 or DILATION_Y>1 for F16
+ *
+ * @param[in] src_addr         Pointer to the starting position of where to perform the convolution
+ * @param[in] stride_x_bytes   Stride of the source tensor in X dimension (in bytes)
+ * @param[in] stride_y_bytes   Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] y_offset         Offset from the source tensor from which to start convolution
+ * @param[in] weights_addr     Pointer from where to get weights
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ */
+inline half4 convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
+                                                                    const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
+{
+    // Load the weights
+    half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
+    half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
+    half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
+
+    half4 pixels0 = 0.0f;
+
+    half8 src00_left  = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
+    half8 src00_mid   = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
+    half8 src00_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
+
+    half8 src10_left  = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
+    half8 src10_mid   = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
+    half8 src10_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
+
+    half8 src20_left  = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
+    half8 src20_mid   = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
+    half8 src20_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
+
+    CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src00_left, src00_mid, src00_right, weights_row0);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src10_left, src10_mid, src10_right, weights_row1);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src20_left, src20_mid, src20_right, weights_row2);
+
+    return pixels0;
+}
+
+#endif // (DILATION_X > 1 || DILATION_Y > 1)
+
 /** Compute a 1D horizontal convolution of size 3 and stride 1 for 16bit floating point type.
  *
  * @param[in] left_pixel   Pointer to the left pixel.
@@ -754,6 +1055,8 @@
                                          const half            middle_coeff,
                                          const half            right_coeff)
 {
+#if(DILATION_X == 1 && DILATION_Y == 1)
+
     half8 temp = vload8(0, (__global half *)left_pixel);
 
     half4 left   = CONVERT(temp.s0123, half4);
@@ -761,6 +1064,12 @@
     half4 right  = CONVERT(temp.s2345, half4);
 
     return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
+#else /* DILATION_X==1 && DILATION_Y==1 */
+    return vload4(0, (__global half *)left_pixel) * (half4)left_coeff
+           + vload4(0, (__global half *)(left_pixel) + DILATION_X) * (half4)middle_coeff
+           + vload4(0, (__global half *)(left_pixel) + 2 * DILATION_X) * (half4)right_coeff;
+
+#endif /* DILATION_X==1 && DILATION_Y==1 */
 }
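The dilated branch above replaces the single half8 load with three half4 loads spaced DILATION_X elements apart. A scalar reference of the same stride-1 computation (a sketch in float for host-side checking, not the kernel's code):

```cpp
#include <cstddef>
#include <vector>

// out[i] = in[i] * c0 + in[i + d] * c1 + in[i + 2d] * c2 for a 1x3 kernel
// dilated by d at stride 1; the caller keeps i + 2 * d in range.
std::vector<float> conv1x3_dilated(const std::vector<float> &in,
                                   float c0, float c1, float c2, int d)
{
    std::vector<float> out(in.size() - 2 * d);
    for (std::size_t i = 0; i < out.size(); ++i)
        out[i] = in[i] * c0 + in[i + d] * c1 + in[i + 2 * d] * c2;
    return out;
}
```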
 
 /** Compute a 1D horizontal convolution of size 3 and stride 2 for 16bit floating point type.
@@ -777,6 +1086,8 @@
                                          const half            middle_coeff,
                                          const half            right_coeff)
 {
+#if(DILATION_X == 1 && DILATION_Y == 1)
+
     half8 temp0 = vload8(0, (__global half *)left_pixel);
     half temp1  = *((__global half *)(left_pixel + 8 * sizeof(half)));
 
@@ -785,6 +1096,15 @@
     half4 right  = CONVERT((half4)(temp0.s246, temp1), half4);
 
     return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
+#else /* DILATION_X==1 && DILATION_Y==1 */
+
+    __global half *left_pixel_half = (__global half *)left_pixel;
+
+    return (half4)(*left_pixel_half, *(left_pixel_half + 2), *(left_pixel_half + 4), *(left_pixel_half + 6)) * (half4)left_coeff
+           + (half4)(*(left_pixel_half + DILATION_X), *(left_pixel_half + DILATION_X + 2), *(left_pixel_half + DILATION_X + 4), *(left_pixel_half + DILATION_X + 6)) * (half4)middle_coeff
+           + (half4)(*(left_pixel_half + DILATION_X * 2), *(left_pixel_half + DILATION_X * 2 + 2), *(left_pixel_half + DILATION_X * 2 + 4), *(left_pixel_half + DILATION_X * 2 + 6)) * (half4)right_coeff;
+
+#endif /* DILATION_X==1 && DILATION_Y==1 */
 }
 
 /** Compute a 1D horizontal convolution of size 3 and stride 3 for 16bit floating point type.
@@ -801,6 +1121,8 @@
                                          const half            middle_coeff,
                                          const half            right_coeff)
 {
+#if(DILATION_X == 1 && DILATION_Y == 1)
+
     half16 temp0 = vload16(0, (__global half *)left_pixel);
 
     half4 left   = CONVERT(temp0.s0369, half4);
@@ -808,6 +1130,15 @@
     half4 right  = CONVERT(temp0.s258B, half4);
 
     return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
+#else /* DILATION_X==1 && DILATION_Y==1 */
+
+    __global half *left_pixel_half = (__global half *)left_pixel;
+
+    return (half4)(*left_pixel_half, *(left_pixel_half + 3), *(left_pixel_half + 6), *(left_pixel_half + 9)) * (half4)left_coeff
+           + (half4)(*(left_pixel_half + DILATION_X), *(left_pixel_half + DILATION_X + 3), *(left_pixel_half + DILATION_X + 6), *(left_pixel_half + DILATION_X + 9)) * (half4)middle_coeff
+           + (half4)(*(left_pixel_half + DILATION_X * 2), *(left_pixel_half + DILATION_X * 2 + 3), *(left_pixel_half + DILATION_X * 2 + 6), *(left_pixel_half + DILATION_X * 2 + 9)) * (half4)right_coeff;
+
+#endif /* DILATION_X==1 && DILATION_Y==1 */
 }
 
 /** Apply a 3x3 convolution matrix to a single channel F16 input image and return the result.
@@ -841,8 +1172,8 @@
     half4 pixels;
 
     pixels = convolution1x3_f16(offset(src, 0, 0), mat0, mat1, mat2);
-    pixels += convolution1x3_f16(offset(src, 0, 1), mat3, mat4, mat5);
-    pixels += convolution1x3_f16(offset(src, 0, 2), mat6, mat7, mat8);
+    pixels += convolution1x3_f16(offset(src, 0, DILATION_Y), mat3, mat4, mat5);
+    pixels += convolution1x3_f16(offset(src, 0, DILATION_Y * 2), mat6, mat7, mat8);
 
     return pixels;
 }
@@ -851,6 +1182,12 @@
 
 /** This OpenCL kernel computes the depthwise convolution 3x3
  *
+ * @note It is possible to select the activation function to apply using -DFUSED_ACTIVATION e.g. -DFUSED_ACTIVATION=relu
+ * @note If an activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.
+ * @note The A and B variables required by some activation functions are set using -DA_VAL= and -DB_VAL=, respectively
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
+ * @note The select data type must also be passed at compile time using -DSELECT_DATA_TYPE e.g. -DSELECT_DATA_TYPE=half
+ *
  * @param[in] src_ptr                               Pointer to the source tensor. Supported data types: F16
  * @param[in] src_stride_x                          Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -875,7 +1212,7 @@
  * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
  * @param[in] weights_step_z                        weights_stride_z * number of elements along Y processed per workitem(in bytes)
  * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
- * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: F16/F32
+ * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: F16
  * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
  * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
@@ -916,7 +1253,7 @@
     pixels += (half4)(*((__global half *)(biases.ptr + channel * biases_stride_x)));
 #endif //defined(HAS_BIAS)
 
-    vstore4(pixels, 0, (__global half *)dst.ptr);
+    vstore4(ACTIVATION_FUNC(pixels), 0, (__global half *)dst.ptr);
 }
 #endif // defined(DEPTH_MULTIPLIER)
 #endif // defined(CONV_STRIDE_X)
@@ -924,6 +1261,12 @@
 /** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3
  * when both stride_x and stride_y are equal to 1
  *
+ * @note It is possible to select the activation function to apply using -DFUSED_ACTIVATION e.g. -DFUSED_ACTIVATION=relu
+ * @note If an activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.
+ * @note The A and B variables required by some activation functions are set using -DA_VAL= and -DB_VAL=, respectively
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
+ * @note The select data type must also be passed at compile time using -DSELECT_DATA_TYPE e.g. -DSELECT_DATA_TYPE=half
+ *
  * @param[in] src_ptr                               Pointer to the source tensor. Supported data types: F16
  * @param[in] src_stride_x                          Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -986,6 +1329,7 @@
     __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
     __global uchar *src_addr     = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
 
+#if(DILATION_X == 1 && DILATION_Y == 1)
     // Load the weights
     half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
     half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
@@ -1012,6 +1356,19 @@
     CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src40, weights_row1);
     CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src50, weights_row2);
 
+#else /* DILATION_X==1 && DILATION_Y==1 */
+
+    //3x3 Convolution of elements starting in 0th row
+    pixels0 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
+    //3x3 Convolution of elements starting in 1st row
+    pixels1 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 1, weights_addr, weights_stride_y);
+    //3x3 Convolution of elements starting in 2nd row
+    pixels2 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
+    //3x3 Convolution of elements starting in 3rd row
+    pixels3 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 3, weights_addr, weights_stride_y);
+
+#endif /* DILATION_X==1 && DILATION_Y==1 */
+
 #ifdef HAS_BIAS
     pixels0 += (half4)bias;
     pixels1 += (half4)bias;
@@ -1019,15 +1376,21 @@
     pixels3 += (half4)bias;
 #endif /* defined(HAS_BIAS) */
 
-    vstore4(pixels0, 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
-    vstore4(pixels1, 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
-    vstore4(pixels2, 0, (__global half *)(dst.ptr + 2 * dst_stride_y));
-    vstore4(pixels3, 0, (__global half *)(dst.ptr + 3 * dst_stride_y));
+    vstore4(ACTIVATION_FUNC(pixels0), 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
+    vstore4(ACTIVATION_FUNC(pixels1), 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
+    vstore4(ACTIVATION_FUNC(pixels2), 0, (__global half *)(dst.ptr + 2 * dst_stride_y));
+    vstore4(ACTIVATION_FUNC(pixels3), 0, (__global half *)(dst.ptr + 3 * dst_stride_y));
 }
 
 /** This OpenCL kernel is optimized for Bifrost architectures and computes 16bit floating point the depthwise convolution 3x3
  * when both stride_x and stride_y are equal to 2
  *
+ * @note It is possible to select the activation function to apply using -DFUSED_ACTIVATION e.g. -DFUSED_ACTIVATION=relu
+ * @note If an activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.
+ * @note The A and B variables required by some activation functions are set using -DA_VAL= and -DB_VAL=, respectively
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
+ * @note The select data type must also be passed at compile time using -DSELECT_DATA_TYPE e.g. -DSELECT_DATA_TYPE=half
+ *
  * @param[in] src_ptr                               Pointer to the source tensor. Supported data types: F16
  * @param[in] src_stride_x                          Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -1088,6 +1451,8 @@
     __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
     __global uchar *src_addr     = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
 
+#if(DILATION_X == 1 && DILATION_Y == 1)
+
     // Load the weights
     half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
     half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
@@ -1112,15 +1477,22 @@
     CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src30, src31, weights_row1);
     CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src40, src41, weights_row2);
 
+#else  /* DILATION_X==1 && DILATION_Y==1 */
+    //3x3 Convolution of elements starting in 0th row
+    pixels0 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
+    //3x3 Convolution of elements starting in 2nd row
+    pixels1 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
+#endif /* DILATION_X==1 && DILATION_Y==1 */
+
 #ifdef HAS_BIAS
     pixels0 += (half4)bias;
     pixels1 += (half4)bias;
 #endif /* defined(HAS_BIAS) */
 
-    vstore4(pixels0, 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
-    vstore4(pixels1, 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
+    vstore4(ACTIVATION_FUNC(pixels0), 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
+    vstore4(ACTIVATION_FUNC(pixels1), 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
 }
-#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F16)
 
 #if defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)
 
@@ -1140,8 +1512,12 @@
 * @note The convolution pad left must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
 * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_X=1)
  * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
+ * @note It is possible to select the activation function to apply using -DFUSED_ACTIVATION e.g. -DFUSED_ACTIVATION=relu
+ * @note The A and B variables required by some activation functions are set using -DA_VAL= and -DB_VAL=, respectively
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
+ * @note The select data type must also be passed at compile time using -DSELECT_DATA_TYPE e.g. -DSELECT_DATA_TYPE=half
  *
- * @param[in] src_ptr                               Pointer to the source tensor. Supported data types: FP32
+ * @param[in] src_ptr                               Pointer to the source tensor. Supported data types: F16/F32
  * @param[in] src_stride_x                          Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
@@ -1161,7 +1537,7 @@
  * @param[in] dst_stride_w                          Stride of the destination tensor in W dimension (in bytes)
  * @param[in] dst_step_w                            dst_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
- * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: QASYMM8
+ * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: F16/F32
  * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
  * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
@@ -1189,9 +1565,9 @@
 #if defined(DST_DEPTH)
     int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
     int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else // defined(DST_DEPTH)
-    int z = get_global_id(2); // spatial coordinate y
-#endif // defined(DST_DEPTH)
+#else                                          // defined(DST_DEPTH)
+    int      z               = get_global_id(2); // spatial coordinate y
+#endif                                         // defined(DST_DEPTH)
 
     Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
 
@@ -1203,7 +1579,7 @@
 
     int  z_coord  = 0;
     int4 offset   = 0;
-    int4 y_offset = ((int4)(y * CONV_STRIDE_X) + (int4)(0, 1, 2, 3) - CONV_PAD_LEFT) * (int4)src_stride_y;
+    int4 y_offset = ((int4)(y * CONV_STRIDE_X) + (int4)(0, DILATION_X * 1, DILATION_X * 2, DILATION_X * 3) - CONV_PAD_LEFT) * (int4)src_stride_y;
 
     // We compute 2x1x1 [C,W,H] elements
     VEC_FLOAT acc = 0;
@@ -1236,16 +1612,16 @@
     // z == 1
     // z_coord can be only negative for z = 0 so we do not need to clamp it
     // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
-    z_coord           = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP + 1;
+    z_coord           = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y;
     offset            = y_offset + (int4)(z_coord * src_stride_z);
     VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
     VEC_FLOAT values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
     VEC_FLOAT values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
 
     // z == 2
-    // After z = 1 we can simply add src_stride_z to offset without updating z_coord
-    // However offset can be out-of-bound so we need to check if it is greater than max_offset
-    offset += (int4)src_stride_z;
+    // Offset can be out-of-bound so we need to check if it is greater than max_offset
+    z_coord           = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y * 2;
+    offset            = y_offset + (int4)(z_coord * src_stride_z);
     offset            = min(offset, (int4)max_offset);
     VEC_FLOAT values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
     VEC_FLOAT values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
@@ -1276,21 +1652,26 @@
 #endif /* defined(DST_DEPTH) */
 
     VSTORE(VEC_SIZE)
-    (acc, 0, (__global DATA_TYPE *)(dst_addr));
+    (ACTIVATION_FUNC(acc), 0, (__global DATA_TYPE *)(dst_addr));
 }
 #endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
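With dilation, the z == 2 row is no longer one src_stride_z past the z == 1 row, so z_coord is recomputed from DILATION_Y and the resulting offsets are clamped against max_offset. A sketch of that boundary handling, assuming max_offset is the last valid byte offset into the source (the z == 0 row additionally clamps negative coordinates, as the surrounding comments note):

```cpp
#include <algorithm>

// Byte offset of tap row `row` (0, 1 or 2) for output plane z, clamped so
// out-of-bound rows re-read the last valid data instead of reading past
// the tensor. Mirrors the z_coord/offset/min() sequence in the kernel.
inline int clamped_row_offset(int z, int conv_stride_y, int conv_pad_top,
                              int dilation_y, int row, int y_offset,
                              int src_stride_z, int max_offset)
{
    const int z_coord = z * conv_stride_y - conv_pad_top + dilation_y * row;
    return std::min(y_offset + z_coord * src_stride_z, max_offset);
}
```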
 
 #if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
 /** This function computes the depthwise convolution for NHWC data layout when the stride along the width and height is 1.
  *
+ * @note The data type must be passed as a preprocessor argument using -DDATA_TYPE=type e.g. -DDATA_TYPE=float
  * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
  * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)
  * @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (i.e. -DNUM_ROWS_PROCESSED=2)
  * @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (i.e. -DNUM_PLANES_PROCESSED=2)
  * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
 * @note The convolution pad left must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
+ * @note It is possible to select the activation function to apply using -DFUSED_ACTIVATION e.g. -DFUSED_ACTIVATION=relu
+ * @note The A and B variables required by some activation functions are set using -DA_VAL= and -DB_VAL=, respectively
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
+ * @note The select data type must also be passed at compile time using -DSELECT_DATA_TYPE e.g. -DSELECT_DATA_TYPE=half
  *
- * @param[in] src_ptr                               Pointer to the source tensor. Supported data types: FP32
+ * @param[in] src_ptr                               Pointer to the source tensor. Supported data types: F16/F32
  * @param[in] src_stride_x                          Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
@@ -1310,7 +1691,7 @@
  * @param[in] dst_stride_w                          Stride of the destination tensor in W dimension (in bytes)
  * @param[in] dst_step_w                            dst_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
- * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: QASYMM8
+ * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: F16/F32
  * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
  * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
@@ -1338,9 +1719,9 @@
 #if defined(DST_DEPTH)
     int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
     int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else // defined(DST_DEPTH)
-    int z = get_global_id(2); // spatial coordinate y
-#endif // defined(DST_DEPTH)
+#else                                          // defined(DST_DEPTH)
+    int             z        = get_global_id(2); // spatial coordinate y
+#endif                                         // defined(DST_DEPTH)
 
     Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
 
@@ -1476,18 +1857,18 @@
 #endif /* defined(DST_DEPTH) */
 
     VSTORE(VEC_SIZE)
-    (acc0, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
+    (ACTIVATION_FUNC(acc0), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
     VSTORE(VEC_SIZE)
-    (acc1, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
+    (ACTIVATION_FUNC(acc1), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
 
 #if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
     if((z * NUM_PLANES_PROCESSED + 1) < DST_DIM_2)
 #endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
     {
         VSTORE(VEC_SIZE)
-        (acc2, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y + 1 * dst_stride_z));
+        (ACTIVATION_FUNC(acc2), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y + 1 * dst_stride_z));
         VSTORE(VEC_SIZE)
-        (acc3, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y + 1 * dst_stride_z));
+        (ACTIVATION_FUNC(acc3), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y + 1 * dst_stride_z));
     }
 }
 
diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
index 606af2e..8d145a0 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
@@ -53,6 +53,8 @@
 
 #if !(defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8))
 
+#if DILATION_X == 1
+
 #if CONV_STRIDE_X == 1
 #define GET_VALUES(first_value, left, middle, right)                              \
     ({                                                                            \
@@ -85,6 +87,46 @@
     })
 #endif /* CONV_STRIDE_X */
 
+#else /* DILATION_X == 1 */
+
+#if CONV_STRIDE_X == 1
+#define GET_VALUES(first_value, left, middle, right)                                     \
+    ({                                                                                   \
+        left   = CONVERT(vload8(0, first_value), int8);                                  \
+        middle = CONVERT(vload8(0, first_value + DILATION_X * sizeof(uchar)), int8);     \
+        right  = CONVERT(vload8(0, first_value + 2 * DILATION_X * sizeof(uchar)), int8); \
+    })
+#elif CONV_STRIDE_X == 2
+#define GET_VALUES(first_value, left, middle, right)                                      \
+    ({                                                                                    \
+        int16 temp0 = CONVERT(vload16(0, first_value), int16);                            \
+        left        = CONVERT(temp0.s02468ace, int8);                                     \
+        \
+        temp0  = CONVERT(vload16(0, first_value + DILATION_X * sizeof(uchar)), int16);    \
+        middle = CONVERT(temp0.s02468ace, int8);                                          \
+        \
+        temp0 = CONVERT(vload16(0, first_value + 2 * DILATION_X * sizeof(uchar)), int16); \
+        right = CONVERT(temp0.s02468ace, int8);                                           \
+    })
+#else /* CONV_STRIDE_X */
+#define GET_VALUES(first_value, left, middle, right)                                             \
+    ({                                                                                           \
+        int16 temp0 = CONVERT(vload16(0, first_value), int16);                                   \
+        int8  temp1 = CONVERT(vload8(0, (first_value + 16 * sizeof(uchar))), int8);              \
+        left        = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8);                  \
+        \
+        temp0  = CONVERT(vload16(0, first_value + DILATION_X * sizeof(uchar)), int16);           \
+        temp1  = CONVERT(vload8(0, (first_value + (16 + DILATION_X) * sizeof(uchar))), int8);    \
+        middle = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8);                       \
+        \
+        temp0 = CONVERT(vload16(0, first_value + 2 * DILATION_X * sizeof(uchar)), int16);        \
+        temp1 = CONVERT(vload8(0, (first_value + (16 + 2 * DILATION_X) * sizeof(uchar))), int8); \
+        right = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8);                        \
+    })
+
+#endif /* CONV_STRIDE_X */
+#endif /* DILATION_X == 1 */
+
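For QASYMM8 the dilated GET_VALUES variants fetch the three taps from byte offsets 0, DILATION_X and 2 * DILATION_X and widen them to eight int lanes before accumulation. A scalar sketch of the stride-1 gather:

```cpp
#include <cstdint>

// Left/middle/right taps for 8 consecutive stride-1 outputs, widened from
// uchar to int32 as the CONVERT(..., int8) in the macro above does.
void get_values_dilated_ref(const std::uint8_t *row, int dilation_x,
                            std::int32_t left[8], std::int32_t middle[8],
                            std::int32_t right[8])
{
    for (int i = 0; i < 8; ++i)
    {
        left[i]   = row[i];
        middle[i] = row[i + dilation_x];
        right[i]  = row[i + 2 * dilation_x];
    }
}
```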
 /** This function computes the depthwise convolution quantized.
  *
  * @param[in] src_ptr                               Pointer to the source tensor. Supported data types: QASYMM8
@@ -117,7 +159,7 @@
  * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
  */
 
-__kernel void depthwise_convolution_3x3_quantized_nchw(
+__kernel void dwc_3x3_native_qasymm8_nchw(
     TENSOR3D_DECLARATION(src),
     TENSOR3D_DECLARATION(dst),
     TENSOR3D_DECLARATION(weights)
@@ -151,10 +193,10 @@
 
     int8 values0 = 0;
     int8 sum0    = 0;
-#if CONV_STRIDE_Y == 1
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
     int8 values1 = 0;
     int8 sum1    = 0;
-#endif /* CONV_STRIDE_Y */
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
 
     // Row0
     int8 left, middle, right;
@@ -168,44 +210,44 @@
 #endif /* WEIGHTS_OFFSET != 0 */
 
     // Row1
-    GET_VALUES(src.ptr + 1 * src_stride_y, left, middle, right);
+    GET_VALUES(src.ptr + DILATION_Y * src_stride_y, left, middle, right);
     values0 += left * (int8)(w1.s0);
     values0 += middle * (int8)(w1.s1);
     values0 += right * (int8)(w1.s2);
-#if CONV_STRIDE_Y == 1
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
     values1 += left * (int8)(w0.s0);
     values1 += middle * (int8)(w0.s1);
     values1 += right * (int8)(w0.s2);
-#endif /* CONV_STRIDE_Y == 1 */
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
 
 #if WEIGHTS_OFFSET != 0
     int8 tmp = left + middle + right;
     sum0 += tmp;
-#if CONV_STRIDE_Y == 1
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
     sum1 += tmp;
-#endif /* CONV_STRIDE_Y == 1 */
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
 #endif /* WEIGHTS_OFFSET != 0 */
 
     // Row2
-    GET_VALUES(src.ptr + 2 * src_stride_y, left, middle, right);
+    GET_VALUES(src.ptr + 2 * DILATION_Y * src_stride_y, left, middle, right);
     values0 += left * (int8)(w2.s0);
     values0 += middle * (int8)(w2.s1);
     values0 += right * (int8)(w2.s2);
-#if CONV_STRIDE_Y == 1
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
     values1 += left * (int8)(w1.s0);
     values1 += middle * (int8)(w1.s1);
     values1 += right * (int8)(w1.s2);
-#endif /* CONV_STRIDE_Y == 1 */
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
 
 #if WEIGHTS_OFFSET != 0
     tmp = left + middle + right;
     sum0 += tmp;
-#if CONV_STRIDE_Y == 1
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
     sum1 += tmp;
-#endif /* CONV_STRIDE_Y == 1 */
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
 #endif /* WEIGHTS_OFFSET != 0 */
 
-#if CONV_STRIDE_Y == 1
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
     // Row3
     GET_VALUES(src.ptr + 3 * src_stride_y, left, middle, right);
     values1 += left * (int8)(w2.s0);
@@ -215,20 +257,20 @@
 #if WEIGHTS_OFFSET != 0
     sum1 += left + middle + right;
 #endif /* WEIGHTS_OFFSET != 0 */
-#endif /* CONV_STRIDE_Y == 1 */
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
 
 #if defined(HAS_BIAS)
     values0 += (int8)(bias_value);
-#if CONV_STRIDE_Y == 1
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
     values1 += (int8)(bias_value);
-#endif /* CONV_STRIDE_Y == 1 */
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
 #endif //defined(HAS_BIAS)
 
 #if WEIGHTS_OFFSET != 0
     values0 += sum0 * (int8)(WEIGHTS_OFFSET);
-#if CONV_STRIDE_Y == 1
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
     values1 += sum1 * (int8)(WEIGHTS_OFFSET);
-#endif /* CONV_STRIDE_Y == 1 */
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
 #endif /* WEIGHTS_OFFSET != 0 */
 
 #if INPUT_OFFSET != 0
@@ -236,16 +278,16 @@
     ushort3 tmp_we      = convert_ushort3(w0) + convert_ushort3(w1) + convert_ushort3(w2);
     sum_weights += tmp_we.s0 + tmp_we.s1 + tmp_we.s2;
     values0 += sum_weights * (int8)(INPUT_OFFSET);
-#if CONV_STRIDE_Y == 1
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
     values1 += sum_weights * (int8)(INPUT_OFFSET);
-#endif /* CONV_STRIDE_Y == 1 */
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */
 #endif /* INPUT_OFFSET != 0 */
 
 #if K_OFFSET != 0
     values0 += (int8)(K_OFFSET);
-#if CONV_STRIDE_Y == 1
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
     values1 += (int8)(K_OFFSET);
-#endif /* CONV_STRIDE_Y == 1 */
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
 #endif /* K_OFFSET != 0 */
 
 #if defined(REAL_MULTIPLIER)
@@ -254,7 +296,7 @@
 
 #else // defined(REAL_MULTIPLIER)
 
-    values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+    values0                  = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
 
 #endif // defined(REAL_MULTIPLIER)
 
@@ -264,14 +306,14 @@
     res0        = min(res0, (uchar8)255);
 
     vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);
-#if CONV_STRIDE_Y == 1
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
 #if defined(REAL_MULTIPLIER)
 
     values1 = CONVERT(round(CONVERT(values1, float8) * (float8)REAL_MULTIPLIER), int8);
 
 #else // defined(REAL_MULTIPLIER)
 
-    values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+    values1                  = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
 
 #endif // defined(REAL_MULTIPLIER)
 
@@ -281,11 +323,11 @@
     res1        = min(res1, (uchar8)255);
 
     vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y);
-#endif /* CONV_STRIDE_Y == 1 */
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
 }
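The WEIGHTS_OFFSET, INPUT_OFFSET and K_OFFSET corrections applied before requantization come from expanding the quantized product: writing the dequantized values as (q_x + INPUT_OFFSET) and (q_w + WEIGHTS_OFFSET), with the offsets being the negated zero points and K_OFFSET equal to 9 * INPUT_OFFSET * WEIGHTS_OFFSET (the usual arrangement, assumed here), a 3x3 accumulation expands to the raw dot product plus three correction terms. A sketch for one lane:

```cpp
#include <cstdint>

// sum((q_x + in_off) * (q_w + w_off)) over a 3x3 window expands to:
//   raw_dot              // sum(q_x * q_w), the values0 accumulation
//   + w_off  * sum(q_x)  // the "sum0 * WEIGHTS_OFFSET" term
//   + in_off * sum(q_w)  // the "sum_weights * INPUT_OFFSET" term
//   + 9 * in_off * w_off // the constant "K_OFFSET" term
std::int32_t corrected_acc(std::int32_t raw_dot, std::int32_t sum_inputs,
                           std::int32_t sum_weights, std::int32_t in_off,
                           std::int32_t w_off)
{
    return raw_dot + w_off * sum_inputs + in_off * sum_weights
           + 9 * in_off * w_off;
}
```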
 
 #else // !(defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8))
-
+#if DILATION_X == 1
 #if CONV_STRIDE_X == 1
 #define GET_VALUES(first_value, left, middle, right)                 \
     ({                                                               \
@@ -317,6 +359,43 @@
         right  = (uchar8)(temp0.s258b, temp0.se, temp1.s147);          \
     })
 #endif /* CONV_STRIDE_X */
+#else /* DILATION_X == 1 */
+
+#if CONV_STRIDE_X == 1
+#define GET_VALUES(first_value, left, middle, right)                      \
+    ({                                                                    \
+        left   = vload8(0, first_value);                                  \
+        middle = vload8(0, first_value + DILATION_X * sizeof(uchar));     \
+        right  = vload8(0, first_value + 2 * DILATION_X * sizeof(uchar)); \
+    })
+#elif CONV_STRIDE_X == 2
+#define GET_VALUES(first_value, left, middle, right)                              \
+    ({                                                                            \
+        uchar16 temp0 = vload16(0, first_value);                                  \
+        left          = temp0.s02468ace;                                          \
+        temp0         = vload16(0, first_value + DILATION_X * sizeof(uchar));     \
+        middle        = temp0.s02468ace;                                          \
+        temp0         = vload16(0, first_value + 2 * DILATION_X * sizeof(uchar)); \
+        right         = temp0.s02468ace;                                          \
+    })
+#else /* CONV_STRIDE_X */
+#define GET_VALUES(first_value, left, middle, right)                              \
+    ({                                                                            \
+        uchar16 temp0 = vload16(0, first_value);                                  \
+        uchar8  temp1 = vload8(0, (first_value + 16 * sizeof(uchar)));            \
+        left          = (uchar8)(temp0.s0369, temp0.scf, temp1.s25);              \
+        \
+        temp0  = vload16(0, first_value + DILATION_X * sizeof(uchar));            \
+        temp1  = vload8(0, (first_value + (16 + DILATION_X) * sizeof(uchar)));    \
+        middle = (uchar8)(temp0.s0369, temp0.scf, temp1.s25);                     \
+        \
+        temp0 = vload16(0, first_value + 2 * DILATION_X * sizeof(uchar));         \
+        temp1 = vload8(0, (first_value + (16 + 2 * DILATION_X) * sizeof(uchar))); \
+        right = (uchar8)(temp0.s0369, temp0.scf, temp1.s25);                      \
+    })
+
+#endif /* CONV_STRIDE_X */
+#endif /* DILATION_X == 1 */
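The dot8 path below packs four input/weight pairs into uchar4 vectors and accumulates them with ARM_DOT. A scalar reference of what one ARM_DOT call contributes to an accumulator (a sketch of the semantics, not the hardware intrinsic):

```cpp
#include <cstdint>

// ARM_DOT(a, b, acc): acc += dot(a, b) over four unsigned 8-bit lanes.
inline void arm_dot_ref(const std::uint8_t a[4], const std::uint8_t b[4],
                        std::int32_t &acc)
{
    for (int i = 0; i < 4; ++i)
        acc += static_cast<std::int32_t>(a[i]) * static_cast<std::int32_t>(b[i]);
}
```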
 /** This function computes the depthwise convolution quantized using dot product when the data layout is NCHW.
  *
  * @param[in] src_ptr                               Pointer to the source tensor. Supported data types: QASYMM8
@@ -349,7 +428,7 @@
  * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
  */
 
-__kernel void depthwise_convolution_3x3_quantized_dot8_nchw(
+__kernel void dwc_3x3_native_qasymm8_dot8_nchw(
     TENSOR3D_DECLARATION(src),
     TENSOR3D_DECLARATION(dst),
     TENSOR3D_DECLARATION(weights)
@@ -389,8 +468,8 @@
     int8 sum0    = 0;
 
     GET_VALUES(src.ptr + 0 * src_stride_y, left0, middle0, right0);
-    GET_VALUES(src.ptr + 1 * src_stride_y, left1, middle1, right1);
-    GET_VALUES(src.ptr + 2 * src_stride_y, left2, middle2, right2);
+    GET_VALUES(src.ptr + DILATION_Y * src_stride_y, left1, middle1, right1);
+    GET_VALUES(src.ptr + 2 * DILATION_Y * src_stride_y, left2, middle2, right2);
 
 #if WEIGHTS_OFFSET != 0
     sum0 += convert_int8(left0) + convert_int8(middle0) + convert_int8(right0);
@@ -398,7 +477,7 @@
     sum0 += convert_int8(left2) + convert_int8(middle2) + convert_int8(right2);
 #endif /* WEIGHTS_OFFSET != 0 */
 
-#if CONV_STRIDE_Y == 1
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
     // If conv_stride_y is equals to 1, we compute two output rows
 
     uchar8 left3, middle3, right3;
@@ -412,7 +491,7 @@
     sum1 += convert_int8(left2) + convert_int8(middle2) + convert_int8(right2);
     sum1 += convert_int8(left3) + convert_int8(middle3) + convert_int8(right3);
 #endif /* WEIGHTS_OFFSET != 0 */
-#endif // CONV_STRIDE_Y == 1
+#endif // CONV_STRIDE_Y == 1 && DILATION_Y == 1
 
     ARM_DOT((uchar4)(left0.s0, middle0.s0, right0.s0, left1.s0), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s0);
     ARM_DOT((uchar4)(middle1.s0, right1.s0, left2.s0, middle2.s0), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s0);
@@ -446,7 +525,7 @@
     ARM_DOT((uchar4)(middle1.s7, right1.s7, left2.s7, middle2.s7), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s7);
     values0.s7 += right2.s7 * w2.s2;
 
-#if CONV_STRIDE_Y == 1
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
     ARM_DOT((uchar4)(left1.s0, middle1.s0, right1.s0, left2.s0), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s0);
     ARM_DOT((uchar4)(middle2.s0, right2.s0, left3.s0, middle3.s0), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s0);
     values1.s0 += right3.s0 * w2.s2;
@@ -478,20 +557,20 @@
     ARM_DOT((uchar4)(left1.s7, middle1.s7, right1.s7, left2.s7), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s7);
     ARM_DOT((uchar4)(middle2.s7, right2.s7, left3.s7, middle3.s7), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s7);
     values1.s7 += right3.s7 * w2.s2;
-#endif // CONV_STRIDE_Y == 1
+#endif // CONV_STRIDE_Y == 1 && DILATION_Y == 1
 
 #if defined(HAS_BIAS)
     values0 += (int8)(bias_value);
-#if CONV_STRIDE_Y == 1
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
     values1 += (int8)(bias_value);
-#endif /* CONV_STRIDE_Y == 1 */
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
 #endif //defined(HAS_BIAS)
 
 #if WEIGHTS_OFFSET != 0
     values0 += sum0 * (int8)(WEIGHTS_OFFSET);
-#if CONV_STRIDE_Y == 1
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
     values1 += sum1 * (int8)(WEIGHTS_OFFSET);
-#endif /* CONV_STRIDE_Y == 1 */
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
 #endif /* WEIGHTS_OFFSET != 0 */
 
 #if INPUT_OFFSET != 0
@@ -499,16 +578,16 @@
     ushort3 tmp_we      = convert_ushort3(w0) + convert_ushort3(w1) + convert_ushort3(w2);
     sum_weights += tmp_we.s0 + tmp_we.s1 + tmp_we.s2;
     values0 += sum_weights * (int8)(INPUT_OFFSET);
-#if CONV_STRIDE_Y == 1
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
     values1 += sum_weights * (int8)(INPUT_OFFSET);
-#endif /* CONV_STRIDE_Y == 1 */
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
 #endif /* INPUT_OFFSET != 0 */
 
 #if K_OFFSET != 0
     values0 += (int8)(K_OFFSET);
-#if CONV_STRIDE_Y == 1
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
     values1 += (int8)(K_OFFSET);
-#endif /* CONV_STRIDE_Y == 1 */
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
 #endif /* K_OFFSET != 0 */
 
 #if defined(REAL_MULTIPLIER)
@@ -527,7 +606,7 @@
     res0        = min(res0, (uchar8)255);
 
     vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);
-#if CONV_STRIDE_Y == 1
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
 
 #if defined(REAL_MULTIPLIER)
 
@@ -545,7 +624,7 @@
     res1        = min(res1, (uchar8)255);
 
     vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y);
-#endif /* CONV_STRIDE_Y == 1 */
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
 }
 
 #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
@@ -597,9 +676,10 @@
 
 #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
 
-#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
+#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && VEC_SIZE == 4
 /** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width or height is not 1.
  *
+ * @note This kernel assumes VEC_SIZE is 4.
  * @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel.
  * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)
@@ -640,7 +720,7 @@
  * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
  * @param[in] max_offset                            Max offset for the input tensor
  */
-__kernel void depthwise_convolution_3x3_quantized_nhwc(
+__kernel void dwc_3x3_reshaped_qasymm8_nhwc(
     TENSOR4D_DECLARATION(src),
     TENSOR4D_DECLARATION(dst),
     IMAGE_DECLARATION(weights),
@@ -654,9 +734,9 @@
 #if defined(DST_DEPTH)
     int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
     int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else // defined(DST_DEPTH)
-    int z = get_global_id(2); // spatial coordinate y
-#endif // defined(DST_DEPTH)
+#else                                          // defined(DST_DEPTH)
+    int      z               = get_global_id(2); // spatial coordinate y
+#endif                                         // defined(DST_DEPTH)
 
     __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;
 
@@ -668,7 +748,7 @@
 
     int  z_coord = 0;
     int4 offset  = 0;
-    int4 y_coord = ((int4)(y * CONV_STRIDE_X) + (int4)(0, 1, 2, 3)) - (int)CONV_PAD_LEFT;
+    int4 y_coord = ((int4)(y * CONV_STRIDE_X) + (int4)(0, DILATION_X * 1, DILATION_X * 2, DILATION_X * 3)) - (int)CONV_PAD_LEFT;
 
     // Only for y = 0 can we have a negative coordinate. If so, we clamp it to SRC_DIM_1
     y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1);
@@ -682,15 +762,19 @@
     VEC_INT acc = 0, sum = 0;
 
     // Load weights
-    VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights_addr + 0);
-    VEC_UCHAR w1 = VLOAD(VEC_SIZE)(0, weights_addr + VEC_SIZE);
-    VEC_UCHAR w2 = VLOAD(VEC_SIZE)(0, weights_addr + 2 * VEC_SIZE);
-    VEC_UCHAR w3 = VLOAD(VEC_SIZE)(0, weights_addr + 3 * VEC_SIZE);
-    VEC_UCHAR w4 = VLOAD(VEC_SIZE)(0, weights_addr + 4 * VEC_SIZE);
-    VEC_UCHAR w5 = VLOAD(VEC_SIZE)(0, weights_addr + 5 * VEC_SIZE);
-    VEC_UCHAR w6 = VLOAD(VEC_SIZE)(0, weights_addr + 6 * VEC_SIZE);
-    VEC_UCHAR w7 = VLOAD(VEC_SIZE)(0, weights_addr + 7 * VEC_SIZE);
-    VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights_addr + 8 * VEC_SIZE);
+    uchar16 w0_tmp = VLOAD(16)(0, weights_addr);
+    uchar16 w1_tmp = VLOAD(16)(0, weights_addr + 16);
+    uchar4  w8     = VLOAD(4)(0, weights_addr + 2 * 16);
+
+    uchar4 w0 = w0_tmp.s0123;
+    uchar4 w1 = w0_tmp.s4567;
+    uchar4 w2 = w0_tmp.s89AB;
+    uchar4 w3 = w0_tmp.sCDEF;
+
+    uchar4 w4 = w1_tmp.s0123;
+    uchar4 w5 = w1_tmp.s4567;
+    uchar4 w6 = w1_tmp.s89AB;
+    uchar4 w7 = w1_tmp.sCDEF;
 
 #if INPUT_OFFSET != 0
     VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)
@@ -715,16 +799,16 @@
     // z == 1
     // z_coord can only be negative for z = 0, so we do not need to clamp it
     // Moreover, z_coord cannot be out-of-bounds for z = 1, so we do not need to clamp the offset
-    z_coord           = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP + 1;
+    z_coord           = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y;
     offset            = y_offset + (int4)(z_coord * src_stride_z);
     VEC_UCHAR values3 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
     VEC_UCHAR values4 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
     VEC_UCHAR values5 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
 
     // z == 2
-    // After z = 1 we can simply add src_stride_z to offset without updating z_coord
-    // However offset can be out-of-bound so we need to check if it is greater than max_offset
-    offset += (int4)src_stride_z;
+    // The offset can be out-of-bounds, so check that it does not exceed max_offset
+    z_coord           = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y * 2;
+    offset            = y_offset + (int4)(z_coord * src_stride_z);
     offset            = min(offset, (int4)max_offset);
     VEC_UCHAR values6 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
     VEC_UCHAR values7 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
@@ -766,7 +850,7 @@
 
 #else  // defined(REAL_MULTIPLIER)
 
-    acc = asymm_mult_by_quant_multiplier_less_than_one(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+    acc                      = asymm_mult_by_quant_multiplier_less_than_one(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
 #endif // defined(REAL_MULTIPLIER)
 
     acc += (VEC_INT)OUTPUT_OFFSET;
@@ -785,9 +869,10 @@
 }
 #endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
 
-#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
+#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED) && VEC_SIZE == 4
 /** This function computes the quantized depthwise convolution for the NHWC data layout when the stride along the width and height is 1.
  *
+ * @note This kernel assumes VEC_SIZE is 4.
  * @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel.
  * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=4)
  * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)
@@ -829,7 +914,7 @@
  * @param[in] max_offset                            Max offset for the input tensor
  */
 
-__kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
+__kernel void dwc_3x3_reshaped_qasymm8_stride1_nhwc(
     TENSOR4D_DECLARATION(src),
     TENSOR4D_DECLARATION(dst),
     IMAGE_DECLARATION(weights),
@@ -843,9 +928,9 @@
 #if defined(DST_DEPTH)
     int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
     int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else // defined(DST_DEPTH)
-    int z = get_global_id(2); // spatial coordinate y
-#endif // defined(DST_DEPTH)
+#else                                          // defined(DST_DEPTH)
+    int             z        = get_global_id(2); // spatial coordinate y
+#endif                                         // defined(DST_DEPTH)
 
     __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;
 
@@ -874,15 +959,19 @@
     VEC_INT acc3 = 0, sum3 = 0;
 
     // Load weights
-    VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights_addr + 0);
-    VEC_UCHAR w1 = VLOAD(VEC_SIZE)(0, weights_addr + VEC_SIZE);
-    VEC_UCHAR w2 = VLOAD(VEC_SIZE)(0, weights_addr + 2 * VEC_SIZE);
-    VEC_UCHAR w3 = VLOAD(VEC_SIZE)(0, weights_addr + 3 * VEC_SIZE);
-    VEC_UCHAR w4 = VLOAD(VEC_SIZE)(0, weights_addr + 4 * VEC_SIZE);
-    VEC_UCHAR w5 = VLOAD(VEC_SIZE)(0, weights_addr + 5 * VEC_SIZE);
-    VEC_UCHAR w6 = VLOAD(VEC_SIZE)(0, weights_addr + 6 * VEC_SIZE);
-    VEC_UCHAR w7 = VLOAD(VEC_SIZE)(0, weights_addr + 7 * VEC_SIZE);
-    VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights_addr + 8 * VEC_SIZE);
+    uchar16 w0_tmp = VLOAD(16)(0, weights_addr);
+    uchar16 w1_tmp = VLOAD(16)(0, weights_addr + 16);
+    uchar4  w8     = VLOAD(4)(0, weights_addr + 2 * 16);
+
+    uchar4 w0 = w0_tmp.s0123;
+    uchar4 w1 = w0_tmp.s4567;
+    uchar4 w2 = w0_tmp.s89AB;
+    uchar4 w3 = w0_tmp.sCDEF;
+
+    uchar4 w4 = w1_tmp.s0123;
+    uchar4 w5 = w1_tmp.s4567;
+    uchar4 w6 = w1_tmp.s89AB;
+    uchar4 w7 = w1_tmp.sCDEF;
 
 #if INPUT_OFFSET != 0
     VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)
@@ -1020,10 +1109,10 @@
 
 #else // defined(REAL_MULTIPLIER)
 
-    acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
-    acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
-    acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
-    acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+    acc0                     = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+    acc1                     = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+    acc2                     = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+    acc3                     = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
 
 #endif // defined(REAL_MULTIPLIER)
 
@@ -1110,7 +1199,7 @@
  * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
  * @param[in] max_offset                            The maximum allowed offset for the input tensor
  */
-__kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1(
+__kernel void dwc_3x3_reshaped_qasymm8_dot8_stride1_nhwc(
     TENSOR4D_DECLARATION(src),
     TENSOR4D_DECLARATION(dst),
     IMAGE_DECLARATION(weights),
@@ -1124,9 +1213,9 @@
 #if defined(DST_DEPTH)
     int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
     int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else // defined(DST_DEPTH)
-    int z = get_global_id(2); // spatial coordinate y
-#endif // defined(DST_DEPTH)
+#else                                          // defined(DST_DEPTH)
+    int      z               = get_global_id(2); // spatial coordinate y
+#endif                                         // defined(DST_DEPTH)
 
     __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;
 
@@ -1255,8 +1344,8 @@
 
 #else // defined(REAL_MULTIPLIER)
 
-    acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
-    acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+    acc0                     = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+    acc1                     = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
 
 #endif // defined(REAL_MULTIPLIER)
     acc0 += (VEC_INT)OUTPUT_OFFSET;
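
Every DILATION_X / DILATION_Y change in this file implements the standard dilated sampling rule: for an output coordinate and a kernel tap, the input coordinate read is out * stride + tap * dilation - pad. A minimal sketch, with hypothetical names:

    /* Input coordinate sampled by a dilated convolution. This is the kernel's
     * z_coord = z * CONV_STRIDE_Y - CONV_PAD_TOP + DILATION_Y * tap pattern;
     * the result may be out of bounds and is clamped via max_offset.        */
    static inline int dilated_in_coord(int out, int stride, int tap, int dilation, int pad)
    {
        return out * stride + tap * dilation - pad;
    }
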
diff --git a/src/core/CL/cl_kernels/dequantization_layer.cl b/src/core/CL/cl_kernels/dequantization_layer.cl
index 4908bb0..7307700 100644
--- a/src/core/CL/cl_kernels/dequantization_layer.cl
+++ b/src/core/CL/cl_kernels/dequantization_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,51 +23,68 @@
  */
 #include "helpers.h"
 
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(SCALE) && defined(OFFSET)
+
 /** This kernel performs the dequantization of 8-bit unsigned integers to floating point.
  *
- * @param[in]  input_ptr                             Pointer to the source image. Supported data types: F16/F32
- * @param[in]  input_stride_x                        Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                        Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source image
- * @param[out] output_ptr                            Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                       Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                       Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the destination image
- * @param[in]  min_max_ptr                           Pointer to the min/max vector. Minimum value in position 0, maximum value in position 1. Suppported data types: F32.
- * @param[in]  min_max_stride_x                      Stride of the min/max vector in X dimension (in bytes)
- * @param[in]  min_max_step_x                        min_max_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  min_max_offset_first_element_in_bytes The offset of the first element in the min/max vector
+ * @note The data type should be given as a preprocessor argument using -DDATA_TYPE=type, e.g. -DDATA_TYPE=float
+ * @note The vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
+ * @note The quantization scale of the input tensor is passed in with -DSCALE=scale.
+ * @note The quantization offset of the input tensor is passed in with -DOFFSET=offset.
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: F16/F32
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
 __kernel void dequantization_layer(
     TENSOR3D_DECLARATION(input),
-    TENSOR3D_DECLARATION(output),
-    VECTOR_DECLARATION(min_max))
+    TENSOR3D_DECLARATION(output))
 {
     // Get pixels pointer
-    Tensor3D input   = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor3D output  = CONVERT_TO_TENSOR3D_STRUCT(output);
-    Vector   min_max = CONVERT_TO_VECTOR_STRUCT(min_max);
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
 
-    // min_max_value.s0 = min, min_max_value.s1 = max
-    const float2 min_max_value = vload2(0, (__global float *)min_max.ptr);
-
-    const float4 vmin  = (float4)min_max_value.s0;
-    const float4 scale = (float4)((min_max_value.s1 - min_max_value.s0) / 255.0f);
+#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+    // Check if the access on the width gets out of bounds
+    // If it does, shift the access vector back so that all accessed elements are within bounds
+    const int xi = (int)(get_global_id(0) * VEC_SIZE);
+    input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
+    output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
 
     // Load data
-    const uchar4 data = vload4(0, (__global uchar *)input.ptr);
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    val = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE));
+
+    // Create scale and offset vectors
+    const VEC_DATA_TYPE(float, VEC_SIZE)
+    vscale = SCALE;
+
+    const VEC_DATA_TYPE(int, VEC_SIZE)
+    voffset = OFFSET;
 
     // Dequantize
-    const float4 res = convert_float4(data) * scale + vmin;
+    VEC_DATA_TYPE(float, VEC_SIZE)
+    res = vscale * CONVERT((val - voffset), VEC_DATA_TYPE(float, VEC_SIZE));
 
     // Store result
-    vstore4(res, 0, (__global float *)output.ptr);
+    VSTORE(VEC_SIZE)
+    (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr);
+#else  // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
+    *((__global DATA_TYPE *)(output.ptr)) = (DATA_TYPE)((float)((int)(*((__global uchar *)(input.ptr))) - (int)(OFFSET)) * (float)(SCALE));
+#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
 }
+
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE) && defined(SCALE) && defined(OFFSET)
\ No newline at end of file
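
The rewritten kernel applies the standard affine dequantization, real = SCALE * (q - OFFSET), and uses LAST_ACCESSED_X to shift the last vector access back into bounds instead of processing a scalar tail. A scalar sketch of both ideas, with hypothetical names:

    #include <stdint.h>

    /* Affine dequantization; the same arithmetic as the kernel's scalar branch. */
    static float dequantize_qasymm8(uint8_t q, float scale, int32_t offset)
    {
        return scale * (float)((int32_t)q - offset);
    }

    /* Start of the VEC_SIZE-wide access for one work item: the final vector is
     * shifted left so it never reads past the buffer; the overlapping elements
     * are simply recomputed and re-stored.                                     */
    static int vector_start(int gid, int vec_size, int last_accessed_x)
    {
        const int xi = gid * vec_size;
        return xi > last_accessed_x ? last_accessed_x : xi;
    }
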
diff --git a/src/core/CL/cl_kernels/fft.cl b/src/core/CL/cl_kernels/fft.cl
new file mode 100644
index 0000000..0027fd5
--- /dev/null
+++ b/src/core/CL/cl_kernels/fft.cl
@@ -0,0 +1,1771 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Calculates and applies the twiddle factor to a given input.
+ *
+ * @param[in]     phi   The angle.
+ * @param[in,out] input The input on which the factor should be applied.
+ */
+#define TWIDDLE_FACTOR_MULTIPLICATION(phi, input)  \
+    {                                              \
+        float2 w, tmp;                             \
+        w.x   = native_cos(phi);                   \
+        w.y   = native_sin(phi);                   \
+        tmp.x = (w.x * input.x) - (w.y * input.y); \
+        tmp.y = (w.x * input.y) + (w.y * input.x); \
+        input = tmp;                               \
+    }
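+
+// The block above is a complex multiplication by w = cos(phi) + i*sin(phi):
+//   (a + i*b) * w = (a*cos(phi) - b*sin(phi)) + i*(a*sin(phi) + b*cos(phi))
+// native_cos/native_sin trade precision for speed (their accuracy is
+// implementation-defined in OpenCL).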
+
+/** Computes radix-2 butterfly unit.
+ *
+ * @param[in,out] c0 Complex input 0.
+ * @param[in,out] c1 Complex input 1.
+ */
+#define DFT_2(c0, c1) \
+    {                 \
+        float2 v0;    \
+        v0 = c0;      \
+        c0 = v0 + c1; \
+        c1 = v0 - c1; \
+    }
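+// (i.e. the two-point DFT: c0' = c0 + c1, c1' = c0 - c1)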
+
+// radix-3 butterfly unit factors
+#define SQRT3DIV2 0.86602540378443f
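+// SQRT3DIV2 = sqrt(3)/2 = sin(2*pi/3)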
+
+/** Computes radix-3 butterfly unit.
+ *
+ * @param[in,out] c0 Complex input 0.
+ * @param[in,out] c1 Complex input 1.
+ * @param[in,out] c2 Complex input 2.
+ */
+#define DFT_3(c0, c1, c2)                                  \
+    {                                                      \
+        float2 v0 = c1 + c2;                               \
+        float2 v1 = c1 - c2;                               \
+        c1.x      = c0.x - 0.5f * v0.x + v1.y * SQRT3DIV2; \
+        c1.y      = c0.y - 0.5f * v0.y - v1.x * SQRT3DIV2; \
+        c2.x      = c0.x - 0.5f * v0.x - v1.y * SQRT3DIV2; \
+        c2.y      = c0.y - 0.5f * v0.y + v1.x * SQRT3DIV2; \
+        c0        = c0 + v0;                               \
+    }
+
+/** Computes radix-4 butterfly unit.
+ *
+ * @param[in,out] c0 Complex input 0.
+ * @param[in,out] c1 Complex input 1.
+ * @param[in,out] c2 Complex input 2.
+ * @param[in,out] c3 Complex input 3.
+ */
+#define DFT_4(c0, c1, c2, c3)  \
+    {                          \
+        float2 v0, v1, v2, v3; \
+        v0   = c0 + c2;        \
+        v1   = c1 + c3;        \
+        v2   = c0 - c2;        \
+        v3.x = c1.y - c3.y;    \
+        v3.y = c3.x - c1.x;    \
+        c0   = v0 + v1;        \
+        c2   = v0 - v1;        \
+        c1   = v2 + v3;        \
+        c3   = v2 - v3;        \
+    }
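+// Note: v3 above is (c1 - c3) multiplied by -i, using (x + i*y) * -i = y - i*x,
+// so the radix-4 butterfly needs no real multiplications.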
+
+// radix-5 butterfly unit factors
+#define W5_A 0.30901699437494f
+#define W5_B 0.95105651629515f
+#define W5_C 0.80901699437494f
+#define W5_D 0.58778525229247f
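+// W5_A = cos(2*pi/5), W5_B = sin(2*pi/5), W5_C = cos(pi/5), W5_D = sin(pi/5)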
+
+/** Computes radix-5 butterfly unit.
+ *
+ * @param[in,out] c0 Complex input 0.
+ * @param[in,out] c1 Complex input 1.
+ * @param[in,out] c2 Complex input 2.
+ * @param[in,out] c3 Complex input 3.
+ * @param[in,out] c4 Complex input 4.
+ */
+#define DFT_5(c0, c1, c2, c3, c4)                 \
+    {                                             \
+        float2 v0, v1, v2, v3, v4;                \
+        v0 = c0;                                  \
+        v1 = W5_A * (c1 + c4) - W5_C * (c2 + c3); \
+        v2 = W5_C * (c1 + c4) - W5_A * (c2 + c3); \
+        v3 = W5_D * (c1 - c4) - W5_B * (c2 - c3); \
+        v4 = W5_B * (c1 - c4) + W5_D * (c2 - c3); \
+        c0 = v0 + c1 + c2 + c3 + c4;              \
+        c1 = v0 + v1 + (float2)(v4.y, -v4.x);     \
+        c2 = v0 - v2 + (float2)(v3.y, -v3.x);     \
+        c3 = v0 - v2 + (float2)(-v3.y, v3.x);     \
+        c4 = v0 + v1 + (float2)(-v4.y, v4.x);     \
+    }
+
+// radix-7 butterfly unit factors
+#define W7_A 0.62348980185873f
+#define W7_B 0.78183148246802f
+#define W7_C 0.22252093395631f
+#define W7_D 0.97492791218182f
+#define W7_E 0.90096886790241f
+#define W7_F 0.43388373911755f
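+// W7_A = cos(2*pi/7), W7_B = sin(2*pi/7), W7_C = cos(3*pi/7),
+// W7_D = sin(3*pi/7), W7_E = cos(pi/7),   W7_F = sin(pi/7)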
+
+/** Computes radix-7 butterfly unit.
+ *
+ * @param[in,out] c0 Complex input 0.
+ * @param[in,out] c1 Complex input 1.
+ * @param[in,out] c2 Complex input 2.
+ * @param[in,out] c3 Complex input 3.
+ * @param[in,out] c4 Complex input 4.
+ * @param[in,out] c5 Complex input 5.
+ * @param[in,out] c6 Complex input 6.
+ */
+#define DFT_7(c0, c1, c2, c3, c4, c5, c6)                            \
+    {                                                                \
+        float2 v0, v1, v2, v3, v4, v5, v6;                           \
+        v0 = c0;                                                     \
+        v1 = W7_A * (c1 + c6) - W7_C * (c2 + c5) - W7_E * (c3 + c4); \
+        v2 = W7_C * (c1 + c6) + W7_E * (c2 + c5) - W7_A * (c3 + c4); \
+        v3 = W7_E * (c1 + c6) - W7_A * (c2 + c5) + W7_C * (c3 + c4); \
+        v4 = W7_B * (c1 - c6) + W7_D * (c2 - c5) + W7_F * (c3 - c4); \
+        v5 = W7_D * (c1 - c6) - W7_F * (c2 - c5) - W7_B * (c3 - c4); \
+        v6 = W7_F * (c1 - c6) - W7_B * (c2 - c5) + W7_D * (c3 - c4); \
+        c0 = v0 + c1 + c2 + c3 + c4 + c5 + c6;                       \
+        c1 = v0 + v1 + (float2)(v4.y, -v4.x);                        \
+        c2 = v0 - v2 + (float2)(v5.y, -v5.x);                        \
+        c3 = v0 - v3 + (float2)(v6.y, -v6.x);                        \
+        c4 = v0 - v3 + (float2)(-v6.y, v6.x);                        \
+        c5 = v0 - v2 + (float2)(-v5.y, v5.x);                        \
+        c6 = v0 + v1 + (float2)(-v4.y, v4.x);                        \
+    }
+
+/** Computes radix-8 butterfly unit.
+ *
+ * @param[in,out] c0 Complex input 0.
+ * @param[in,out] c1 Complex input 1.
+ * @param[in,out] c2 Complex input 2.
+ * @param[in,out] c3 Complex input 3.
+ * @param[in,out] c4 Complex input 4.
+ * @param[in,out] c5 Complex input 5.
+ * @param[in,out] c6 Complex input 6.
+ * @param[in,out] c7 Complex input 7.
+ */
+#define DFT_8(c0, c1, c2, c3, c4, c5, c6, c7)  \
+    {                                          \
+        float2 v0, v1, v2, v3, v4, v5, v6, v7; \
+        float2 s0, s1, s2, s3, s4, s5, s6, s7; \
+        float2 t0, t1, t2;                     \
+        v0   = c0 + c4;                        \
+        v1   = c1 + c5;                        \
+        v2   = c2 + c6;                        \
+        v3   = c3 + c7;                        \
+        v4   = c0 - c4;                        \
+        v5   = c1 - c5;                        \
+        v6   = c2 - c6;                        \
+        v7   = c3 - c7;                        \
+        s0   = v0 + v2;                        \
+        s1   = v1 + v3;                        \
+        s2   = v0 - v2;                        \
+        s3   = v1 - v3;                        \
+        s4.x = v4.x - v6.y;                    \
+        s4.y = v4.y + v6.x;                    \
+        s5.x = v5.x - v7.y;                    \
+        s5.y = v5.y + v7.x;                    \
+        s6.x = v4.x + v6.y;                    \
+        s6.y = v4.y - v6.x;                    \
+        s7.x = v5.x + v7.y;                    \
+        s7.y = v5.y - v7.x;                    \
+        t0.x = -s3.y;                          \
+        t0.y = s3.x;                           \
+        t1.x = M_SQRT1_2_F * (s5.x - s5.y);    \
+        t1.y = M_SQRT1_2_F * (s5.x + s5.y);    \
+        t2.x = -M_SQRT1_2_F * (s7.x + s7.y);   \
+        t2.y = M_SQRT1_2_F * (s7.x - s7.y);    \
+        c0   = s0 + s1;                        \
+        c1   = s6 - t2;                        \
+        c2   = s2 - t0;                        \
+        c3   = s4 - t1;                        \
+        c4   = s0 - s1;                        \
+        c5   = s6 + t2;                        \
+        c6   = s2 + t0;                        \
+        c7   = s4 + t1;                        \
+    }
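+// M_SQRT1_2_F = 1/sqrt(2): t1 rotates s5 by 45 degrees and t2 rotates s7 by
+// 135 degrees, the odd-output twiddles of the radix-8 butterfly.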
+
+/** Computes the first stage of a radix-2 DFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+kernel void fft_radix_2_first_stage_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load two complex input values
+    float4 data = vload4(0, (__global float *)input.ptr);
+
+    // Compute DFT N = 2
+    DFT_2(data.s01, data.s23);
+
+    // Store two complex output values
+    vstore4(data, 0, (__global float *)output.ptr);
+}
+
+/** Computes the first stage of a radix-2 DFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+kernel void fft_radix_2_first_stage_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load two complex input values
+    float2 data1 = vload2(0, (__global float *)input.ptr);
+    float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));
+
+    // Compute DFT N = 2
+    DFT_2(data1, data2);
+
+    // Store two complex output values
+    vstore2(data1, 0, (__global float *)output.ptr);
+    vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 0, 1, 0));
+}
+
+/** Computes the first stage of a radix-3 DFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+kernel void fft_radix_3_first_stage_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load three complex input values
+    float4 data0 = vload4(0, (__global float *)input.ptr);
+    float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 2, 0, 0));
+
+    // Compute DFT N = 3
+    DFT_3(data0.s01, data0.s23, data1.s01);
+
+    // Store three complex output values
+    vstore4(data0, 0, (__global float *)output.ptr);
+    vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 2, 0, 0));
+}
+
+/** Computes the first stage of a radix-3 DFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+kernel void fft_radix_3_first_stage_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load three complex input values
+    float2 data0 = vload2(0, (__global float *)input.ptr);
+    float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));
+    float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2, 0));
+
+    // Compute DFT N = 3
+    DFT_3(data0, data1, data2);
+
+    // Store three complex output values
+    vstore2(data0, 0, (__global float *)output.ptr);
+    vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 0, 1, 0));
+    vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 0, 2, 0));
+}
+
+/** Computes the first stage of a radix-4 DFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+kernel void fft_radix_4_first_stage_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load four complex input values
+    float8 data = vload8(0, (__global float *)input.ptr);
+
+    // Compute DFT N = 4
+    DFT_4(data.s01, data.s23, data.s45, data.s67);
+
+    // Store four complex output values
+    vstore8(data, 0, (__global float *)output.ptr);
+}
+
+/** Computes the first stage of a radix-4 DFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+kernel void fft_radix_4_first_stage_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load four complex input values
+    float2 data0 = vload2(0, (__global float *)input.ptr);
+    float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));
+    float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2, 0));
+    float2 data3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3, 0));
+
+    // Compute DFT N = 4
+    DFT_4(data0, data1, data2, data3);
+
+    // Store four complex output values
+    vstore2(data0, 0, (__global float *)output.ptr);
+    vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 0, 1, 0));
+    vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 0, 2, 0));
+    vstore2(data3, 0, (__global float *)tensor3D_offset(&output, 0, 3, 0));
+}
+
+/** Computes the first stage of a radix-5 DFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+kernel void fft_radix_5_first_stage_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load five complex input values
+    float8 data0 = vload8(0, (__global float *)input.ptr);
+    float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 4, 0, 0));
+
+    // Compute DFT N = 5
+    DFT_5(data0.s01, data0.s23, data0.s45, data0.s67, data1.s01);
+
+    // Store five complex output values
+    vstore8(data0, 0, (__global float *)output.ptr);
+    vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 4, 0, 0));
+}
+
+/** Computes the first stage of a radix-5 DFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+kernel void fft_radix_5_first_stage_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load five complex input values
+    float2 data0 = vload2(0, (__global float *)input.ptr);
+    float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));
+    float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2, 0));
+    float2 data3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3, 0));
+    float2 data4 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 4, 0));
+
+    // Compute DFT N = 5
+    DFT_5(data0, data1, data2, data3, data4);
+
+    // Store five complex output values
+    vstore2(data0, 0, (__global float *)output.ptr);
+    vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 0, 1, 0));
+    vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 0, 2, 0));
+    vstore2(data3, 0, (__global float *)tensor3D_offset(&output, 0, 3, 0));
+    vstore2(data4, 0, (__global float *)tensor3D_offset(&output, 0, 4, 0));
+}
+
+/** Computes the first stage of a radix-7 DFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+kernel void fft_radix_7_first_stage_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load seven complex input values
+    float8 data0 = vload8(0, (__global float *)input.ptr);
+    float4 data1 = vload4(0, (__global float *)tensor3D_offset(&input, 4, 0, 0));
+    float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 6, 0, 0));
+
+    // Compute DFT N = 7
+    DFT_7(data0.s01, data0.s23, data0.s45, data0.s67, data1.s01, data1.s23, data2.s01);
+
+    // Store seven complex output values
+    vstore8(data0, 0, (__global float *)output.ptr);
+    vstore4(data1, 0, (__global float *)tensor3D_offset(&output, 4, 0, 0));
+    vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 6, 0, 0));
+}
+
+/** Computes the first stage of a radix-7 DFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+kernel void fft_radix_7_first_stage_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load seven complex input values
+    float2 data0 = vload2(0, (__global float *)input.ptr);
+    float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));
+    float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2, 0));
+    float2 data3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3, 0));
+    float2 data4 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 4, 0));
+    float2 data5 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 5, 0));
+    float2 data6 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 6, 0));
+
+    // Compute DFT N = 7
+    DFT_7(data0, data1, data2, data3, data4, data5, data6);
+
+    // Store seven complex output values
+    vstore2(data0, 0, (__global float *)output.ptr);
+    vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 0, 1, 0));
+    vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 0, 2, 0));
+    vstore2(data3, 0, (__global float *)tensor3D_offset(&output, 0, 3, 0));
+    vstore2(data4, 0, (__global float *)tensor3D_offset(&output, 0, 4, 0));
+    vstore2(data5, 0, (__global float *)tensor3D_offset(&output, 0, 5, 0));
+    vstore2(data6, 0, (__global float *)tensor3D_offset(&output, 0, 6, 0));
+}
+
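+/* Note on the complex layout used throughout this file: each complex sample
+ * is stored as a float2 holding (real, imaginary). A first-stage kernel on
+ * axis 1 therefore issues one vload2 per row, stepping along y with
+ * tensor3D_offset, whereas an axis-0 kernel can fetch its whole butterfly
+ * with a single wide load (see the float16 load in the radix-8 axis-0 kernel
+ * below).
+ */
+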
+/** Computes the first stage of a radix-8 DFT on axis 0.
+ *
+ * @note In order to perform the FFT in-place, the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+kernel void fft_radix_8_first_stage_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load eight complex input values
+    float16 data = vload16(0, (__global float *)input.ptr);
+
+    // Compute DFT N = 8
+    DFT_8(data.s01, data.s23, data.s45, data.s67, data.s89, data.sAB, data.sCD, data.sEF);
+
+    // Store eight complex output values
+    vstore16(data, 0, (__global float *)output.ptr);
+}
+
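+/* In the radix-8 first stage on axis 0 above, the eight complex inputs are
+ * packed into a single float16: the pairs .s01, .s23, ..., .sEF are the eight
+ * (real, imaginary) samples, so one vload16/vstore16 pair replaces eight
+ * vload2/vstore2 pairs on contiguous data.
+ */
+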
+/** Computes the first stage of a radix-8 DFT on axis 1.
+ *
+ * @note In order to perform the FFT in-place, the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+kernel void fft_radix_8_first_stage_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load eight complex input values
+    float2 data0 = vload2(0, (__global float *)input.ptr);
+    float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));
+    float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2, 0));
+    float2 data3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3, 0));
+    float2 data4 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 4, 0));
+    float2 data5 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 5, 0));
+    float2 data6 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 6, 0));
+    float2 data7 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 7, 0));
+
+    // Compute DFT N = 8
+    DFT_8(data0, data1, data2, data3, data4, data5, data6, data7);
+
+    // Store eight complex output values
+    vstore2(data0, 0, (__global float *)output.ptr);
+    vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 0, 1, 0));
+    vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 0, 2, 0));
+    vstore2(data3, 0, (__global float *)tensor3D_offset(&output, 0, 3, 0));
+    vstore2(data4, 0, (__global float *)tensor3D_offset(&output, 0, 4, 0));
+    vstore2(data5, 0, (__global float *)tensor3D_offset(&output, 0, 5, 0));
+    vstore2(data6, 0, (__global float *)tensor3D_offset(&output, 0, 6, 0));
+    vstore2(data7, 0, (__global float *)tensor3D_offset(&output, 0, 7, 0));
+}
+
+/** Computes a stage of a radix-2 FFT on axis 0.
+ *
+ * @note In order to perform the FFT in-place, the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span, i.e. the product of the radix orders of all previous stages
+ * @param[in]     Ni                                   Ni = Nx * Ny, where Ny is the radix order of the current stage
+ * @param[in]     exp_const                            Exponent constant; the twiddle angle is computed as phi = nx * exp_const
+ */
+kernel void fft_radix_2_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-2
+    uint kx = get_global_id(0);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load two complex input values
+    float2 c0 = vload2(0, (__global float *)input.ptr);
+    float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));
+
+    // Compute phi
+    float phi = (float)nx * exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+
+    // Compute DFT N = 2
+    DFT_2(c0, c1);
+
+    // Store two complex output values
+    vstore2(c0, 0, (__global float *)output.ptr);
+    vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));
+}
+
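+/* Sketch of the radix-2 butterfly computed above, assuming DFT_2 is the
+ * standard two-point butterfly and TWIDDLE_FACTOR_MULTIPLICATION(phi, c)
+ * rotates the complex value c by the angle phi (both are defined earlier in
+ * this file):
+ *
+ *     c1   = c1 * (cos(phi) + i * sin(phi));   // twiddle rotation, phi = nx * exp_const
+ *     out0 = c0 + c1;
+ *     out1 = c0 - c1;
+ */
+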
+/** Computes a stage of a radix-2 FFT on axis 1.
+ *
+ * @note In order to perform the FFT in-place, the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span, i.e. the product of the radix orders of all previous stages
+ * @param[in]     Ni                                   Ni = Nx * Ny, where Ny is the radix order of the current stage
+ * @param[in]     exp_const                            Exponent constant; the twiddle angle is computed as phi = nx * exp_const
+ */
+kernel void fft_radix_2_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-2
+    uint kx = get_global_id(1);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load two complex input values
+    float2 c0 = vload2(0, (__global float *)input.ptr);
+    float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, Nx, 0));
+
+    // Compute phi
+    float phi = (float)nx * exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+
+    // Compute DFT N = 2
+    DFT_2(c0, c1);
+
+    // Store two complex output values
+    vstore2(c0, 0, (__global float *)output.ptr);
+    vstore2(c1, 0, (__global float *)tensor3D_offset(&output, 0, Nx, 0));
+}
+
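+/* Worked example of the index arithmetic shared by all staged kernels
+ * (values chosen purely for illustration): for a radix-2 stage with Nx = 4
+ * and Ni = 8, work-item kx = 5 computes
+ *
+ *     nx = 5 % 4 = 1
+ *     n  = 1 + (5 / 4) * 8 = 9
+ *
+ * so its butterfly spans the complex elements at positions 9 and 9 + Nx = 13
+ * along the transform axis.
+ */
+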
+/** Computes a stage of a radix-3 FFT on axis 0.
+ *
+ * @note In order to perform the FFT in-place, the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span, i.e. the product of the radix orders of all previous stages
+ * @param[in]     Ni                                   Ni = Nx * Ny, where Ny is the radix order of the current stage
+ * @param[in]     exp_const                            Exponent constant; the twiddle angle is computed as phi = nx * exp_const
+ */
+kernel void fft_radix_3_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-3
+    uint kx = get_global_id(0);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load three complex input values
+    float2 c0 = vload2(0, (__global float *)input.ptr);
+    float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));
+    float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+
+    // Compute phi
+    float phi = (float)nx * exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+
+    // Compute DFT N = 3
+    DFT_3(c0, c1, c2);
+
+    // Store three complex output values
+    vstore2(c0, 0, (__global float *)output.ptr);
+    vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));
+    vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+}
+
+/** Computes a stage of a radix-3 FFT on axis 1.
+ *
+ * @note In order to perform the FFT in-place, the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span, i.e. the product of the radix orders of all previous stages
+ * @param[in]     Ni                                   Ni = Nx * Ny, where Ny is the radix order of the current stage
+ * @param[in]     exp_const                            Exponent constant; the twiddle angle is computed as phi = nx * exp_const
+ */
+kernel void fft_radix_3_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-3
+    uint kx = get_global_id(1);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load three complex input values
+    float2 c0 = vload2(0, (__global float *)input.ptr);
+    float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, Nx, 0));
+    float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2 * Nx, 0));
+
+    // Compute phi
+    float phi = (float)nx * exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+
+    // Compute DFT N = 3
+    DFT_3(c0, c1, c2);
+
+    // Store three complex output values
+    vstore2(c0, 0, (__global float *)output.ptr);
+    vstore2(c1, 0, (__global float *)tensor3D_offset(&output, 0, Nx, 0));
+    vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 0, 2 * Nx, 0));
+}
+
+/** Computes a stage of a radix-4 FFT on axis 0.
+ *
+ * @note In order to perform the FFT in-place, the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span, i.e. the product of the radix orders of all previous stages
+ * @param[in]     Ni                                   Ni = Nx * Ny, where Ny is the radix order of the current stage
+ * @param[in]     exp_const                            Exponent constant; the twiddle angle is computed as phi = nx * exp_const
+ */
+kernel void fft_radix_4_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-4
+    uint kx = get_global_id(0);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load four complex input values
+    float2 c0 = vload2(0, (__global float *)input.ptr);
+    float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));
+    float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+    float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 3 * Nx, 0, 0));
+
+    // Compute phi
+    float phi = (float)nx * exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+
+    // Compute DFT N = 4
+    DFT_4(c0, c1, c2, c3);
+
+    // Store four complex output values
+    vstore2(c0, 0, (__global float *)output.ptr);
+    vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));
+    vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+    vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 3 * Nx, 0, 0));
+}
+
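+/* Every staged kernel applies the same pre-rotation for a radix R: input k
+ * (k = 1 .. R-1) is rotated by k * phi before the R-point DFT. A generic
+ * sketch of the unrolled TWIDDLE_FACTOR_MULTIPLICATION lines above
+ * (illustrative only; the real kernels are fully unrolled):
+ *
+ *     for(uint k = 1; k < R; ++k)
+ *     {
+ *         TWIDDLE_FACTOR_MULTIPLICATION(k * phi, c[k]); // c[k] *= e^(i * k * phi)
+ *     }
+ */
+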
+/** Computes a stage of a radix-4 FFT on axis 1.
+ *
+ * @note In order to perform the FFT in-place, the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span, i.e. the product of the radix orders of all previous stages
+ * @param[in]     Ni                                   Ni = Nx * Ny, where Ny is the radix order of the current stage
+ * @param[in]     exp_const                            Exponent constant; the twiddle angle is computed as phi = nx * exp_const
+ */
+kernel void fft_radix_4_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-4
+    uint kx = get_global_id(1);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load four complex input values
+    float2 c0 = vload2(0, (__global float *)input.ptr);
+    float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, Nx, 0));
+    float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2 * Nx, 0));
+    float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3 * Nx, 0));
+
+    // Compute phi
+    float phi = (float)nx * exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+
+    // Compute DFT N = 4
+    DFT_4(c0, c1, c2, c3);
+
+    // Store four complex output values
+    vstore2(c0, 0, (__global float *)output.ptr);
+    vstore2(c1, 0, (__global float *)tensor3D_offset(&output, 0, Nx, 0));
+    vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 0, 2 * Nx, 0));
+    vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 0, 3 * Nx, 0));
+}
+
+/** Computes a stage of a radix-5 FFT on axis 0.
+ *
+ * @note In order to perform the FFT in-place, the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span, i.e. the product of the radix orders of all previous stages
+ * @param[in]     Ni                                   Ni = Nx * Ny, where Ny is the radix order of the current stage
+ * @param[in]     exp_const                            Exponent constant; the twiddle angle is computed as phi = nx * exp_const
+ */
+kernel void fft_radix_5_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-5
+    uint kx = get_global_id(0);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load five complex input values
+    float2 c0 = vload2(0, (__global float *)input.ptr);
+    float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));
+    float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+    float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 3 * Nx, 0, 0));
+    float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 4 * Nx, 0, 0));
+
+    // Compute phi
+    float phi = (float)nx * exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+    TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+
+    // Compute DFT N = 5
+    DFT_5(c0, c1, c2, c3, c4);
+
+    // Store five complex output values
+    vstore2(c0, 0, (__global float *)output.ptr);
+    vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));
+    vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+    vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 3 * Nx, 0, 0));
+    vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 4 * Nx, 0, 0));
+}
+
+/** Computes a stage of a radix-5 FFT on axis 1.
+ *
+ * @note In order to perform the FFT in-place, the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span, i.e. the product of the radix orders of all previous stages
+ * @param[in]     Ni                                   Ni = Nx * Ny, where Ny is the radix order of the current stage
+ * @param[in]     exp_const                            Exponent constant; the twiddle angle is computed as phi = nx * exp_const
+ */
+kernel void fft_radix_5_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-5
+    uint kx = get_global_id(1);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load five complex input values
+    float2 c0 = vload2(0, (__global float *)input.ptr);
+    float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, Nx, 0));
+    float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2 * Nx, 0));
+    float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3 * Nx, 0));
+    float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 4 * Nx, 0));
+
+    // Compute phi
+    float phi = (float)nx * exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+    TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+
+    // Compute DFT N = 5
+    DFT_5(c0, c1, c2, c3, c4);
+
+    // Store five complex output values
+    vstore2(c0, 0, (__global float *)output.ptr);
+    vstore2(c1, 0, (__global float *)tensor3D_offset(&output, 0, Nx, 0));
+    vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 0, 2 * Nx, 0));
+    vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 0, 3 * Nx, 0));
+    vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 0, 4 * Nx, 0));
+}
+
+/** Computes a stage of a radix-7 FFT on axis 0.
+ *
+ * @note In order to perform the FFT in-place, the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span, i.e. the product of the radix orders of all previous stages
+ * @param[in]     Ni                                   Ni = Nx * Ny, where Ny is the radix order of the current stage
+ * @param[in]     exp_const                            Exponent constant; the twiddle angle is computed as phi = nx * exp_const
+ */
+kernel void fft_radix_7_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-7
+    uint kx = get_global_id(0);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load seven complex input values
+    float2 c0 = vload2(0, (__global float *)input.ptr);
+    float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));
+    float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+    float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 3 * Nx, 0, 0));
+    float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 4 * Nx, 0, 0));
+    float2 c5 = vload2(0, (__global float *)tensor3D_offset(&input, 5 * Nx, 0, 0));
+    float2 c6 = vload2(0, (__global float *)tensor3D_offset(&input, 6 * Nx, 0, 0));
+
+    // Compute phi
+    float phi = (float)nx * exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+    TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+    TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
+    TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
+
+    // Compute DFT N = 7
+    DFT_7(c0, c1, c2, c3, c4, c5, c6);
+
+    // Store seven complex output values
+    vstore2(c0, 0, (__global float *)output.ptr);
+    vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));
+    vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+    vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 3 * Nx, 0, 0));
+    vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 4 * Nx, 0, 0));
+    vstore2(c5, 0, (__global float *)tensor3D_offset(&output, 5 * Nx, 0, 0));
+    vstore2(c6, 0, (__global float *)tensor3D_offset(&output, 6 * Nx, 0, 0));
+}
+
+/** Computes a stage of a radix-7 FFT on axis 1.
+ *
+ * @note In order to perform the FFT in-place, the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span, i.e. the product of the radix orders of all previous stages
+ * @param[in]     Ni                                   Ni = Nx * Ny, where Ny is the radix order of the current stage
+ * @param[in]     exp_const                            Exponent constant; the twiddle angle is computed as phi = nx * exp_const
+ */
+kernel void fft_radix_7_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-7
+    uint kx = get_global_id(1);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load seven complex input values
+    float2 c0 = vload2(0, (__global float *)input.ptr);
+    float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, Nx, 0));
+    float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2 * Nx, 0));
+    float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3 * Nx, 0));
+    float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 4 * Nx, 0));
+    float2 c5 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 5 * Nx, 0));
+    float2 c6 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 6 * Nx, 0));
+
+    // Compute phi
+    float phi = (float)nx * exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+    TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+    TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
+    TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
+
+    // Compute DFT N = 7
+    DFT_7(c0, c1, c2, c3, c4, c5, c6);
+
+    // Store seven complex output values
+    vstore2(c0, 0, (__global float *)output.ptr);
+    vstore2(c1, 0, (__global float *)tensor3D_offset(&output, 0, Nx, 0));
+    vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 0, 2 * Nx, 0));
+    vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 0, 3 * Nx, 0));
+    vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 0, 4 * Nx, 0));
+    vstore2(c5, 0, (__global float *)tensor3D_offset(&output, 0, 5 * Nx, 0));
+    vstore2(c6, 0, (__global float *)tensor3D_offset(&output, 0, 6 * Nx, 0));
+}
+
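+/* When -DIN_PLACE is defined, `output` simply aliases `input` and the output
+ * TENSOR3D_DECLARATION is dropped from the argument list. This is race-free
+ * because every work-item writes back exactly the R complex samples it read,
+ * so no two work-items of a stage ever touch the same element.
+ */
+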
+/** Computes a stage of a radix-8 FFT on axis 0.
+ *
+ * @note In order to perform the FFT in-place, the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span, i.e. the product of the radix orders of all previous stages
+ * @param[in]     Ni                                   Ni = Nx * Ny, where Ny is the radix order of the current stage
+ * @param[in]     exp_const                            Exponent constant; the twiddle angle is computed as phi = nx * exp_const
+ */
+kernel void fft_radix_8_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-8
+    uint kx = get_global_id(0);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load eight complex input values
+    float2 c0 = vload2(0, (__global float *)input.ptr);
+    float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));
+    float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+    float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 3 * Nx, 0, 0));
+    float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 4 * Nx, 0, 0));
+    float2 c5 = vload2(0, (__global float *)tensor3D_offset(&input, 5 * Nx, 0, 0));
+    float2 c6 = vload2(0, (__global float *)tensor3D_offset(&input, 6 * Nx, 0, 0));
+    float2 c7 = vload2(0, (__global float *)tensor3D_offset(&input, 7 * Nx, 0, 0));
+
+    // Compute phi
+    float phi = (float)nx * exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+    TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+    TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
+    TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
+    TWIDDLE_FACTOR_MULTIPLICATION(7 * phi, c7);
+
+    // Compute DFT N = 8
+    DFT_8(c0, c1, c2, c3, c4, c5, c6, c7);
+
+    // Store eight complex output values
+    vstore2(c0, 0, (__global float *)output.ptr);
+    vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));
+    vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+    vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 3 * Nx, 0, 0));
+    vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 4 * Nx, 0, 0));
+    vstore2(c5, 0, (__global float *)tensor3D_offset(&output, 5 * Nx, 0, 0));
+    vstore2(c6, 0, (__global float *)tensor3D_offset(&output, 6 * Nx, 0, 0));
+    vstore2(c7, 0, (__global float *)tensor3D_offset(&output, 7 * Nx, 0, 0));
+}
+
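+/* Host-side sketch (illustrative; in arm_compute these kernels are driven by
+ * CLFFTRadixStageKernel rather than raw OpenCL calls like the ones below):
+ * a radix-8 stage on axis 0 of an N-point transform launches N / 8
+ * work-items along dimension 0, one butterfly each.
+ *
+ *     size_t gws[3] = { N / 8, height, depth };
+ *     clSetKernelArg(kernel, idx++, sizeof(cl_uint), &Nx);
+ *     clSetKernelArg(kernel, idx++, sizeof(cl_uint), &Ni);
+ *     clSetKernelArg(kernel, idx++, sizeof(cl_float), &exp_const);
+ *     clEnqueueNDRangeKernel(queue, kernel, 3, NULL, gws, NULL, 0, NULL, NULL);
+ */
+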
+/** Computes a stage of a radix-8 FFT on axis 1.
+ *
+ * @note In order to perform the FFT in-place, the preprocessor define -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in]     input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]     input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]     input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span. Product of the radix orders of the previous FFT stages
+ * @param[in]     Ni                                   Nx * Ny, where Ny is the radix order of this stage (8)
+ * @param[in]     exp_const                            Exponent constant
+ */
+kernel void fft_radix_8_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-8 butterfly
+    uint kx = get_global_id(1);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load eight complex input values
+    float2 c0 = vload2(0, (__global float *)input.ptr);
+    float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, Nx, 0));
+    float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2 * Nx, 0));
+    float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3 * Nx, 0));
+    float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 4 * Nx, 0));
+    float2 c5 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 5 * Nx, 0));
+    float2 c6 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 6 * Nx, 0));
+    float2 c7 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 7 * Nx, 0));
+
+    // Compute phi
+    float phi = (float)nx * exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+    TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+    TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
+    TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
+    TWIDDLE_FACTOR_MULTIPLICATION(7 * phi, c7);
+
+    // Compute DFT N = 8
+    DFT_8(c0, c1, c2, c3, c4, c5, c6, c7);
+
+    // Store eight complex output values
+    vstore2(c0, 0, (__global float *)output.ptr);
+    vstore2(c1, 0, (__global float *)tensor3D_offset(&output, 0, Nx, 0));
+    vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 0, 2 * Nx, 0));
+    vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 0, 3 * Nx, 0));
+    vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 0, 4 * Nx, 0));
+    vstore2(c5, 0, (__global float *)tensor3D_offset(&output, 0, 5 * Nx, 0));
+    vstore2(c6, 0, (__global float *)tensor3D_offset(&output, 0, 6 * Nx, 0));
+    vstore2(c7, 0, (__global float *)tensor3D_offset(&output, 0, 7 * Nx, 0));
+}
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/fft_digit_reverse.cl b/src/core/CL/cl_kernels/fft_digit_reverse.cl
new file mode 100644
index 0000000..040c284
--- /dev/null
+++ b/src/core/CL/cl_kernels/fft_digit_reverse.cl
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE)
+/** Computes the digit reverse stage on axis X
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  idx_ptr                           Pointer to the index tensor. Supported data types: U32
+ * @param[in]  idx_stride_x                      Stride of the index tensor in X dimension (in bytes)
+ * @param[in]  idx_step_x                        idx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  idx_offset_first_element_in_bytes The offset of the first element in the index tensor
+ */
+__kernel void fft_digit_reverse_axis_0(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    VECTOR_DECLARATION(idx))
+{
+    // Get tensor pointers
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+    Vector   idx = CONVERT_TO_VECTOR_STRUCT(idx);
+
+    const unsigned int iidx = *((__global uint *)(idx.ptr));
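+    // The idx vector is assumed to hold host-precomputed digit-reversed
+    // positions, which makes this kernel a gather along axis 0:
+    // dst[x] = src[idx[x]].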
+
+    // Load data
+#if VEC_SIZE == 1
+    float data = *((__global float *)tensor3D_offset(&src, iidx, get_global_id(1), get_global_id(2)));
+#elif VEC_SIZE == 2
+    float2 data = vload2(0, (__global float *)tensor3D_offset(&src, iidx, get_global_id(1), get_global_id(2)));
+#else // VEC_SIZE == 1
+#error "vec_size of 1 and 2 are supported"
+#endif // VEC_SIZE == 1
+
+    // Create result
+#if VEC_SIZE == 1
+    float2 res = { data, 0 };
+#elif VEC_SIZE == 2
+    float2 res  = data;
+#else // VEC_SIZE == 1
+#error "vec_size of 1 and 2 are supported"
+#endif // VEC_SIZE == 1
+
+    // Store result
+#if defined(CONJ)
+    vstore2((float2)(res.s0, -res.s1), 0, (__global float *)dst.ptr);
+#else  // defined(CONJ)
+    vstore2(res, 0, (__global float *)dst.ptr);
+#endif // defined(CONJ)
+}
+
+/** Computes the digit reverse stage on axis Y
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  idx_ptr                           Pointer to the index tensor. Supported data types: U32
+ * @param[in]  idx_stride_x                      Stride of the index tensor in X dimension (in bytes)
+ * @param[in]  idx_step_x                        idx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  idx_offset_first_element_in_bytes The offset of the first element in the index tensor
+ */
+__kernel void fft_digit_reverse_axis_1(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    VECTOR_DECLARATION(idx))
+{
+    // Get tensor pointers
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+    Vector   idx = CONVERT_TO_VECTOR_STRUCT_NO_STEP(idx);
+
+    const unsigned int iidx = *((__global uint *)vector_offset(&idx, (int)(get_global_id(1))));
+
+    // Load data
+#if VEC_SIZE == 1
+    float data = *((__global float *)tensor3D_offset(&src, get_global_id(0), iidx, get_global_id(2)));
+#elif VEC_SIZE == 2
+    float2 data = vload2(0, (__global float *)tensor3D_offset(&src, get_global_id(0), iidx, get_global_id(2)));
+#else // VEC_SIZE == 1
+#error "vec_size of 1 and 2 are supported"
+#endif // VEC_SIZE == 1
+
+    // Create result
+#if VEC_SIZE == 1
+    float2 res = { data, 0 };
+#elif VEC_SIZE == 2
+    float2 res  = data;
+#else // VEC_SIZE == 1
+#error "vec_size of 1 and 2 are supported"
+#endif // VEC_SIZE == 1
+
+    // Store result
+#if defined(CONJ)
+    vstore2((float2)(res.s0, -res.s1), 0, (__global float *)dst.ptr);
+#else  // defined(CONJ)
+    vstore2(res, 0, (__global float *)dst.ptr);
+#endif // defined(CONJ)
+}
+#endif // defined(VEC_SIZE)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/fft_scale.cl b/src/core/CL/cl_kernels/fft_scale.cl
new file mode 100644
index 0000000..bf78a26
--- /dev/null
+++ b/src/core/CL/cl_kernels/fft_scale.cl
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Computes the FFT scale stage
+ *
+ * @note In order to perform the scale stage "in-place", the pre-processor option -DIN_PLACE must be passed at compile time
+ * @note The vector size to process must be passed at compile time using -DVEC_SIZE (i.e. -DVEC_SIZE=2); only vec_size of 1 and 2 are supported
+ * @note To store the complex conjugate of the scaled value (VEC_SIZE == 2 only), the pre-processor option -DCONJ must be passed at compile time
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           (Optional) Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      (Optional) Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        (Optional) dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      (Optional) Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        (Optional) dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        (Optional) dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
+ * @param[in]  scale                             Scale to apply to the complex value
+ */
+__kernel void fft_scale_conj(
+    TENSOR3D_DECLARATION(src)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(dst)
+#endif /* not IN_PLACE */
+    ,
+    float scale)
+{
+    // Get tensor pointers
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+#if defined(IN_PLACE)
+    Tensor3D dst = src;
+#else  /* IN_PLACE */
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+#endif /* IN_PLACE */
+
+    // Store result
+#if VEC_SIZE == 1
+    *((__global float *)dst.ptr) = (*(__global float *)src.ptr) / scale;
+#elif VEC_SIZE == 2
+    // Load data
+    float2 data = vload2(0, (__global float *)src.ptr);
+    data /= scale;
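+    // Editor's note: for an inverse FFT the host typically passes scale = N so
+    // that this stage applies the 1/N normalisation; with -DCONJ the store
+    // below also conjugates, matching the common IFFT(x) = conj(FFT(conj(x))) / N trick.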
+#if defined(CONJ)
+    vstore2((float2)(data.s0, -data.s1), 0, (__global float *)dst.ptr);
+#else  // defined(CONJ)
+    vstore2(data, 0, (__global float *)dst.ptr);
+#endif // defined(CONJ)
+#else  // VEC_SIZE == 1
+#error "vec_size of 1 and 2 are supported"
+#endif // VEC_SIZE == 1
+}
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index 4736f80..da94008 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -1128,7 +1128,1149 @@
 #endif // defined(TRANSPOSE)
 #endif // defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)
 
-#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE)
+#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)
+
+#define CONCAT(a, b) a##b
+
+#define ARM_DOT1(a, b, c) \
+    ({                    \
+        c = fma(a, b, c); \
+    })
+#define ARM_DOT2(a, b, c)       \
+    ({                          \
+        c = fma(a.s0, b.s0, c); \
+        c = fma(a.s1, b.s1, c); \
+    })
+#define ARM_DOT3(a, b, c)           \
+    ({                              \
+        ARM_DOT2(a, b, c);          \
+        c = fma((a.s2), (b.s2), c); \
+    })
+#define ARM_DOT4(a, b, c)           \
+    ({                              \
+        ARM_DOT3(a, b, c);          \
+        c = fma((a.s3), (b.s3), c); \
+    })
+#define ARM_DOT8(a, b, c)            \
+    ({                               \
+        ARM_DOT4((a.lo), (b.lo), c); \
+        ARM_DOT4((a.hi), (b.hi), c); \
+    })
+#define ARM_DOT16(a, b, c)           \
+    ({                               \
+        ARM_DOT8((a.lo), (b.lo), c); \
+        ARM_DOT8((a.hi), (b.hi), c); \
+    })
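+
+// Illustrative expansion (editor's sketch, not used by the kernels): ARM_DOT4(a, b, c)
+// unrolls to a chain of fused multiply-adds computing a K0 = 4 dot product, i.e.
+// c += a.s0 * b.s0 + a.s1 * b.s1 + a.s2 * b.s2 + a.s3 * b.s3, accumulated into the scalar c.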
+
+#if N0 == 2
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+    ({                             \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##0), (c.s0));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##1), (c.s1));     \
+    })
+#elif N0 == 3 // N0 == 3
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+    ({                             \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##0), (c.s0));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##1), (c.s1));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##2), (c.s2));     \
+    })
+#elif N0 == 4 // N0 == 4
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+    ({                             \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##0), (c.s0));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##1), (c.s1));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##2), (c.s2));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##3), (c.s3));     \
+    })
+#elif N0 == 8 // N0 == 8
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+    ({                             \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##0), (c.s0));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##1), (c.s1));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##2), (c.s2));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##3), (c.s3));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##4), (c.s4));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##5), (c.s5));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##6), (c.s6));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##7), (c.s7));     \
+    })
+#elif N0 == 16 // N0 == 16
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+    ({                             \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##0), (c.s0));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##1), (c.s1));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##2), (c.s2));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##3), (c.s3));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##4), (c.s4));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##5), (c.s5));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##6), (c.s6));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##7), (c.s7));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##8), (c.s8));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##9), (c.s9));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##A), (c.sA));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##B), (c.sB));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##C), (c.sC));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##D), (c.sD));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##E), (c.sE));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##F), (c.sF));     \
+    })
+#else // N0 not supported
+#error "N0 value not supported"
+#endif // N0 conditions
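+
+// Editor's note: ARM_DOT_K0XN0(k0, a, b, c) accumulates one LHS row 'a' against
+// the N0 RHS blocks b0..b<N0-1>, adding one k0-wide dot product into each lane
+// of the N0-wide accumulator c, i.e. c.sj += dot(a, b_j).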
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ *  The LHS matrix is NOT reshaped
+ *  The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
+ *
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (i.e. -DM=52, -DN=30 and -DK=90)
+ * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64)
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (i.e. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ *  - N0 = 2, 3, 4, 8, 16
+ *  - K0 = 2, 3, 4, 8, 16
+ *  - H0 >= 1
+ *
+ * @note In case the input or output has to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = number of rows of the LHS matrix (M)
+ *
+ * @param[in]  lhs_ptr                           Pointer to the LHS matrix. Supported data type: F16/F32
+ * @param[in]  lhs_stride_x                      Stride of the LHS matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                        lhs_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  lhs_stride_y                      Stride of the LHS matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                        lhs_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
+ * @param[in]  rhs_ptr                           Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  rhs_stride_x                      Stride of the RHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  rhs_step_x                        rhs_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  rhs_stride_y                      Stride of the RHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  rhs_step_y                        rhs_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in]  lhs_stride_z                      Stride of the LHS matrix in Z dimension (in bytes)
+ * @param[in]  rhs_stride_z                      Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  lhs_cross_plane_pad               (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in]  dst_cross_plane_pad               (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
+                                          IMAGE_DECLARATION(rhs),
+                                          IMAGE_DECLARATION(dst),
+                                          uint lhs_stride_z,
+                                          uint rhs_stride_z,
+                                          uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+                                          ,
+                                          uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                          ,
+                                          uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                         )
+{
+    // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+    // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
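+    // Layout sketch (illustrative, assuming K0 = 4 and H0 = 2): with
+    // -DRHS_INTERLEAVE the K0-wide blocks of the H0 block-columns alternate on
+    // the same reshaped row, so a work-item starts at (x % H0) * K0 and strides
+    // K0 * H0 between its N0 blocks; without it each group of N0 blocks is
+    // stored back to back, K0 elements apart, and groups start RHS_BLOCK_SIZE apart.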
+
+    uint x = get_global_id(0);
+    uint y = get_global_id(1);
+    uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((x * N0 >= N) || (y * M0 >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
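+    // Editor's note: when the host pads the NDRange up to a multiple of the
+    // work-group size, the surplus "dummy" work-items would touch out-of-bounds
+    // rows/columns, so they must exit before any load or store.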
+
+    // Compute LHS matrix address
+    uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+
+    // Compute RHS matrix address
+    uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if matrix B has only 3 dimensions while matrix A has more than 3
+    rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); //uint zin0=0,zin1=0,zin2=0,... zin7=0;
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zin) is calculated by dividing the row index (y * M0) by HEIGHT_GEMM3D
+    zin0 = (0 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zin0 = min((uint)(DEPTH_GEMM3D - 1), zin0);
+    zin0 *= (lhs_cross_plane_pad * lhs_stride_y);
+#if M0 > 1
+    zin1 = (1 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zin1 = min((uint)(DEPTH_GEMM3D - 1), zin1);
+    zin1 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 1
+#if M0 > 2
+    zin2 = (2 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zin2 = min((uint)(DEPTH_GEMM3D - 1), zin2);
+    zin2 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 2
+#if M0 > 3
+    zin3 = (3 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zin3 = min((uint)(DEPTH_GEMM3D - 1), zin3);
+    zin3 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 3
+#if M0 > 4
+    zin4 = (4 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zin4 = min((uint)(DEPTH_GEMM3D - 1), zin4);
+    zin4 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 4
+#if M0 > 5
+    zin5 = (5 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zin5 = min((uint)(DEPTH_GEMM3D - 1), zin5);
+    zin5 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 5
+#if M0 > 6
+    zin6 = (6 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zin6 = min((uint)(DEPTH_GEMM3D - 1), zin6);
+    zin6 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 6
+#if M0 > 7
+    zin7 = (7 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zin7 = min((uint)(DEPTH_GEMM3D - 1), zin7);
+    zin7 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 7
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply lhs_stride_z by DEPTH_GEMM3D
+    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
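+    // Worked example (illustrative): with HEIGHT_GEMM3D = 4, DEPTH_GEMM3D = 2,
+    // M0 = 2 and y = 1, the tile rows 2 and 3 both map to plane 0, so
+    // zin0 = zin1 = 0 and no cross-plane padding is added to their loads; the
+    // first row of plane 1 (row 4) would instead be offset by one pad step.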
+
+    // Initialize the accumulators
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0)    c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+    int i = 0;
+    for(; i <= (K - K0); i += K0)
+    {
+        // Supported cases (M0, K0):
+        // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
+        // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
+        // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
+        // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
+        // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
+        // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
+        // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
+        // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
+        // Load values from LHS matrix
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        a0 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
+#if M0 > 1
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        a1 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
+#endif // M0 > 1
+#if M0 > 2
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        a2 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
+#endif // M0 > 2
+#if M0 > 3
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        a3 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
+#endif // M0 > 3
+#if M0 > 4
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        a4 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
+#endif // M0 > 4
+#if M0 > 5
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        a5 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
+#endif // M0 > 5
+#if M0 > 6
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        a6 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
+#endif // M0 > 6
+#if M0 > 7
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        a7 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
+#endif // M0 > 7
+
+        // Load values from RHS matrix
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        b0 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        b1 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 1 * RHS_STEP_X * sizeof(DATA_TYPE)));
+#if N0 > 2
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        b2 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 2 * RHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // N0 > 2
+#if N0 > 3
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        b3 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 3 * RHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // N0 > 3
+#if N0 > 4
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        b4 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 4 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        b5 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 5 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        b6 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 6 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        b7 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 7 * RHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // N0 > 4
+#if N0 > 8
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        b8 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 8 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        b9 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 9 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        bA = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 10 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        bB = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 11 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        bC = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 12 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        bD = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 13 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        bE = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 14 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        bF = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 15 * RHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // N0 > 8
+
+        // Accumulate
+        ARM_DOT_K0XN0(K0, a0, b, c0);
+#if M0 > 1
+        ARM_DOT_K0XN0(K0, a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+        ARM_DOT_K0XN0(K0, a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+        ARM_DOT_K0XN0(K0, a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+        ARM_DOT_K0XN0(K0, a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+        ARM_DOT_K0XN0(K0, a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+        ARM_DOT_K0XN0(K0, a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+        ARM_DOT_K0XN0(K0, a7, b, c7);
+#endif // M0 > 7
+
+        lhs_offset += K0 * sizeof(DATA_TYPE);
+        rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
+    }
+
+    // Left-over accumulations
+    for(; i < K; ++i)
+    {
+        // Load values from LHS matrix
+        DATA_TYPE a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
+#if M0 > 1
+        DATA_TYPE a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
+#endif // M0 > 1
+#if M0 > 2
+        DATA_TYPE a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
+#endif // M0 > 2
+#if M0 > 3
+        DATA_TYPE a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
+#endif // M0 > 3
+#if M0 > 4
+        DATA_TYPE a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
+#endif // M0 > 4
+#if M0 > 5
+        DATA_TYPE a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
+#endif // M0 > 5
+#if M0 > 6
+        DATA_TYPE a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
+#endif // M0 > 6
+#if M0 > 7
+        DATA_TYPE a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
+#endif // M0 > 7
+
+        // Load values from RHS matrix
+        DATA_TYPE b0 = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        DATA_TYPE b1 = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 1 * RHS_STEP_X * sizeof(DATA_TYPE)));
+#if N0 > 2
+        DATA_TYPE b2 = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 2 * RHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // N0 > 2
+#if N0 > 3
+        DATA_TYPE b3 = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 3 * RHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // N0 > 3
+#if N0 > 4
+        DATA_TYPE b4 = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 4 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        DATA_TYPE b5 = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 5 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        DATA_TYPE b6 = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 6 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        DATA_TYPE b7 = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 7 * RHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // N0 > 4
+#if N0 > 8
+        DATA_TYPE b8 = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 8 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        DATA_TYPE b9 = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 9 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        DATA_TYPE bA = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 10 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        DATA_TYPE bB = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 11 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        DATA_TYPE bC = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 12 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        DATA_TYPE bD = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 13 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        DATA_TYPE bE = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 14 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        DATA_TYPE bF = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 15 * RHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // N0 > 8
+
+        // Accumulate
+        ARM_DOT_K0XN0(1, a0, b, c0);
+#if M0 > 1
+        ARM_DOT_K0XN0(1, a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+        ARM_DOT_K0XN0(1, a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+        ARM_DOT_K0XN0(1, a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+        ARM_DOT_K0XN0(1, a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+        ARM_DOT_K0XN0(1, a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+        ARM_DOT_K0XN0(1, a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+        ARM_DOT_K0XN0(1, a7, b, c7);
+#endif // M0 > 7
+
+        lhs_offset += sizeof(DATA_TYPE);
+        rhs_offset += sizeof(DATA_TYPE);
+    }
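+    // The scalar tail above covers K not being a multiple of K0: each leftover
+    // iteration consumes one LHS column and one element per RHS block,
+    // reusing ARM_DOT_K0XN0 with k0 = 1.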
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
+
+    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zout) is calculated by dividing the row index (y * M0) by HEIGHT_GEMM3D
+    zout0 = (0 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout0 = min((uint)(DEPTH_GEMM3D - 1), zout0);
+    zout0 *= (dst_cross_plane_pad * dst_stride_y);
+#if M0 > 1
+    zout1 = (1 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout1 = min((uint)(DEPTH_GEMM3D - 1), zout1);
+    zout1 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 1
+#if M0 > 2
+    zout2 = (2 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout2 = min((uint)(DEPTH_GEMM3D - 1), zout2);
+    zout2 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 2
+#if M0 > 3
+    zout3 = (3 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout3 = min((uint)(DEPTH_GEMM3D - 1), zout3);
+    zout3 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 3
+#if M0 > 4
+    zout4 = (4 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout4 = min((uint)(DEPTH_GEMM3D - 1), zout4);
+    zout4 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 4
+#if M0 > 5
+    zout5 = (5 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout5 = min((uint)(DEPTH_GEMM3D - 1), zout5);
+    zout5 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 5
+#if M0 > 6
+    zout6 = (6 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout6 = min((uint)(DEPTH_GEMM3D - 1), zout6);
+    zout6 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 6
+#if M0 > 7
+    zout7 = (7 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout7 = min((uint)(DEPTH_GEMM3D - 1), zout7);
+    zout7 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 7
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply the result of the matrix-matrix product by ALPHA and store it
+#if defined(ALPHA)
+    c0 = c0 * (DATA_TYPE)ALPHA;
+#if M0 > 1
+    c1 = c1 * (DATA_TYPE)ALPHA;
+#endif // M0 > 1
+#if M0 > 2
+    c2 = c2 * (DATA_TYPE)ALPHA;
+#endif // M0 > 2
+#if M0 > 3
+    c3 = c3 * (DATA_TYPE)ALPHA;
+#endif // M0 > 3
+#if M0 > 4
+    c4 = c4 * (DATA_TYPE)ALPHA;
+#endif // M0 > 4
+#if M0 > 5
+    c5 = c5 * (DATA_TYPE)ALPHA;
+#endif // M0 > 5
+#if M0 > 6
+    c6 = c6 * (DATA_TYPE)ALPHA;
+#endif // M0 > 6
+#if M0 > 7
+    c7 = c7 * (DATA_TYPE)ALPHA;
+#endif // M0 > 7
+#endif // defined(ALPHA)
+
+    // Store output block
+    VSTORE(N0)
+    (c0, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y + zout0));
+#if M0 > 1
+    VSTORE(N0)
+    (c1, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y + zout1));
+#endif // M0 > 1
+#if M0 > 2
+    VSTORE(N0)
+    (c2, 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y + zout2));
+#endif // M0 > 2
+#if M0 > 3
+    VSTORE(N0)
+    (c3, 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y + zout3));
+#endif // M0 > 3
+#if M0 > 4
+    VSTORE(N0)
+    (c4, 0, (__global DATA_TYPE *)(dst_addr + 4 * dst_stride_y + zout4));
+#endif // M0 > 4
+#if M0 > 5
+    VSTORE(N0)
+    (c5, 0, (__global DATA_TYPE *)(dst_addr + 5 * dst_stride_y + zout5));
+#endif // M0 > 5
+#if M0 > 6
+    VSTORE(N0)
+    (c6, 0, (__global DATA_TYPE *)(dst_addr + 6 * dst_stride_y + zout6));
+#endif // M0 > 6
+#if M0 > 7
+    VSTORE(N0)
+    (c7, 0, (__global DATA_TYPE *)(dst_addr + 7 * dst_stride_y + zout7));
+#endif // M0 > 7
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+#undef RHS_STEP_LOOP
+}
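+
+// Example host-side build options for the kernel above (an illustrative sketch
+// only; the authoritative values are chosen by the library's host-side setup):
+//   "-DDATA_TYPE=float -DM=64 -DN=64 -DK=128 -DM0=4 -DN0=4 -DK0=4 -DH0=2 -DRHS_INTERLEAVE"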
+
+#define VFMA(a, b, c)     \
+    ({                    \
+        c = fma(a, b, c); \
+    })
+
+#if M0 == 1
+#define LD_RHS_VFMA_M0xN0(i, a, c)                                                                               \
+    ({                                                                                                           \
+        VEC_DATA_TYPE(DATA_TYPE, N0)                                                                             \
+        b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0));                                            \
+    })
+#elif M0 == 2 // M0 == 2
+#define LD_RHS_VFMA_M0xN0(i, a, c)                                                                               \
+    ({                                                                                                           \
+        VEC_DATA_TYPE(DATA_TYPE, N0)                                                                             \
+        b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1));                                            \
+    })
+#elif M0 == 3 // M0 == 3
+#define LD_RHS_VFMA_M0xN0(i, a, c)                                                                               \
+    ({                                                                                                           \
+        VEC_DATA_TYPE(DATA_TYPE, N0)                                                                             \
+        b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2));                                            \
+    })
+#elif M0 == 4 // M0 == 4
+#define LD_RHS_VFMA_M0xN0(i, a, c)                                                                               \
+    ({                                                                                                           \
+        VEC_DATA_TYPE(DATA_TYPE, N0)                                                                             \
+        b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3));                                            \
+    })
+#elif M0 == 5 // M0 == 5
+#define LD_RHS_VFMA_M0xN0(i, a, c)                                                                               \
+    ({                                                                                                           \
+        VEC_DATA_TYPE(DATA_TYPE, N0)                                                                             \
+        b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4));                                            \
+    })
+#elif M0 == 6 // M0 == 6
+#define LD_RHS_VFMA_M0xN0(i, a, c)                                                                               \
+    ({                                                                                                           \
+        VEC_DATA_TYPE(DATA_TYPE, N0)                                                                             \
+        b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5));                                            \
+    })
+#elif M0 == 7 // M0 == 7
+#define LD_RHS_VFMA_M0xN0(i, a, c)                                                                               \
+    ({                                                                                                           \
+        VEC_DATA_TYPE(DATA_TYPE, N0)                                                                             \
+        b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6));                                            \
+    })
+#elif M0 == 8 // M0 == 8
+#define LD_RHS_VFMA_M0xN0(i, a, c)                                                                               \
+    ({                                                                                                           \
+        VEC_DATA_TYPE(DATA_TYPE, N0)                                                                             \
+        b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6));                                            \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7));                                            \
+    })
+#else // M0 not supported
+#error "M0 not supported"
+#endif // M0 not supported
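+
+// Editor's note on the 0x##i token pasting: 'i' is passed as a single hex digit
+// (0-9, A-F), so 0x##i forms the row index into the reshaped RHS block while
+// (a##m).s##i selects the matching component of each LHS vector; one call thus
+// loads one N0-wide RHS row and FMAs it against column i of the M0 LHS rows.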
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ *  The LHS matrix is NOT reshaped
+ *  The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
+ *
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (i.e. -DM=52, -DN=30 and -DK=90).
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (i.e. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ *  - N0 = 2, 3, 4, 8, 16
+ *  - K0 = 2, 3, 4, 8, 16
+ *  - H0 >= 1
+ *
+ * @note In case the input or output has to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = number of rows of the LHS matrix (M)
+ *
+ * @param[in]  lhs_ptr                           Pointer to the LHS matrix. Supported data type: F16/F32
+ * @param[in]  lhs_stride_x                      Stride of the LHS matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                        lhs_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  lhs_stride_y                      Stride of the LHS matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                        lhs_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
+ * @param[in]  rhs_ptr                           Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  rhs_stride_x                      Stride of the RHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  rhs_step_x                        rhs_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  rhs_stride_y                      Stride of the RHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  rhs_step_y                        rhs_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in]  lhs_stride_z                      Stride of the LHS matrix in Z dimension (in bytes)
+ * @param[in]  rhs_stride_z                      Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  lhs_cross_plane_pad               (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in]  dst_cross_plane_pad               (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),
+                                           IMAGE_DECLARATION(rhs),
+                                           IMAGE_DECLARATION(dst),
+                                           uint lhs_stride_z,
+                                           uint rhs_stride_z,
+                                           uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+                                           ,
+                                           uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                           ,
+                                           uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                          )
+{
+    // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+    // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (N0)
+#define RHS_STEP_X ((N0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (N0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
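+    // Worked example (illustrative): with N0=8, K0=4 and H0=2, the interleaved layout gives
+    // RHS_OFFSET_X=8, RHS_STEP_X=16 and RHS_STEP_LOOP=1; the non-interleaved layout gives
+    // RHS_OFFSET_X=32 (one whole K0xN0 block), RHS_STEP_X=8 and RHS_STEP_LOOP=2.
+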
+    uint x = get_global_id(0);
+    uint y = get_global_id(1);
+    uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((x * N0 >= N) || (y * M0 >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
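+    // e.g. with N=30 and N0=8, only work-items x=0..3 carry real work; if the NDRange has
+    // been padded up for the preferred local size, any extra work-item along x has
+    // x * N0 >= 30 and returns here immediately.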
+
+    // Compute LHS matrix address
+    uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+
+    // Compute RHS matrix address
+    uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); //uint zin0=0,zin1=0,zin2=0,... zin7=0;
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+    // Since we load a 2D tile from a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zin) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+    zin0 = (0 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zin0 = min((uint)(DEPTH_GEMM3D - 1), zin0);
+    zin0 *= (lhs_cross_plane_pad * lhs_stride_y);
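+    // e.g. with M0=4, HEIGHT_GEMM3D=8, DEPTH_GEMM3D=2 and y=3, row 0 of this tile is
+    // absolute row 12, so zin0 = min(1, 12 / 8) = 1 and one cross-plane pad (in bytes,
+    // lhs_cross_plane_pad * lhs_stride_y) is added to the load address.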
+#if M0 > 1
+    zin1 = (1 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zin1 = min((uint)(DEPTH_GEMM3D - 1), zin1);
+    zin1 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 1
+#if M0 > 2
+    zin2 = (2 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zin2 = min((uint)(DEPTH_GEMM3D - 1), zin2);
+    zin2 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 2
+#if M0 > 3
+    zin3 = (3 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zin3 = min((uint)(DEPTH_GEMM3D - 1), zin3);
+    zin3 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 3
+#if M0 > 4
+    zin4 = (4 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zin4 = min((uint)(DEPTH_GEMM3D - 1), zin4);
+    zin4 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 4
+#if M0 > 5
+    zin5 = (5 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zin5 = min((uint)(DEPTH_GEMM3D - 1), zin5);
+    zin5 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 5
+#if M0 > 6
+    zin6 = (6 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zin6 = min((uint)(DEPTH_GEMM3D - 1), zin6);
+    zin6 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 6
+#if M0 > 7
+    zin7 = (7 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zin7 = min((uint)(DEPTH_GEMM3D - 1), zin7);
+    zin7 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 7
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply lhs_stride_z by DEPTH_GEMM3D
+    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Initialize the accumulators
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0)    c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+    int i = 0;
+    for(; i <= (K - K0); i += K0)
+    {
+        // Supported cases (M0, K0):
+        // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
+        // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
+        // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
+        // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
+        // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
+        // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
+        // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
+        // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
+        // Load values from LHS matrix
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        a0 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
+#if M0 > 1
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        a1 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
+#endif // M0 > 1
+#if M0 > 2
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        a2 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
+#endif // M0 > 2
+#if M0 > 3
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        a3 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
+#endif // M0 > 3
+#if M0 > 4
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        a4 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
+#endif // M0 > 4
+#if M0 > 5
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        a5 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
+#endif // M0 > 5
+#if M0 > 6
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        a6 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
+#endif // M0 > 6
+#if M0 > 7
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        a7 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
+#endif // M0 > 7
+
+        LD_RHS_VFMA_M0xN0(0, a, c);
+        LD_RHS_VFMA_M0xN0(1, a, c);
+#if K0 > 2
+        LD_RHS_VFMA_M0xN0(2, a, c);
+#endif // K0 > 2
+#if K0 > 3
+        LD_RHS_VFMA_M0xN0(3, a, c);
+#endif // K0 > 3
+#if K0 > 4
+        LD_RHS_VFMA_M0xN0(4, a, c);
+        LD_RHS_VFMA_M0xN0(5, a, c);
+        LD_RHS_VFMA_M0xN0(6, a, c);
+        LD_RHS_VFMA_M0xN0(7, a, c);
+#endif // K0 > 4
+#if K0 > 8
+        LD_RHS_VFMA_M0xN0(8, a, c);
+        LD_RHS_VFMA_M0xN0(9, a, c);
+        LD_RHS_VFMA_M0xN0(A, a, c);
+        LD_RHS_VFMA_M0xN0(B, a, c);
+        LD_RHS_VFMA_M0xN0(C, a, c);
+        LD_RHS_VFMA_M0xN0(D, a, c);
+        LD_RHS_VFMA_M0xN0(E, a, c);
+        LD_RHS_VFMA_M0xN0(F, a, c);
+#endif // K0 > 8
+
+        lhs_offset += K0 * sizeof(DATA_TYPE);
+        rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE);
+    }
+
+    // Left-over accumulations
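+    // e.g. with K=90 and K0=8 the blocked loop above covers k = 0..87, leaving k = 88 and
+    // k = 89 to these scalar iterations.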
+    for(; i < K; ++i)
+    {
+        // Load values from LHS matrix
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
+#if M0 > 1
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
+#endif // M0 > 1
+#if M0 > 2
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
+#endif // M0 > 2
+#if M0 > 3
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
+#endif // M0 > 3
+#if M0 > 4
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
+#endif // M0 > 4
+#if M0 > 5
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
+#endif // M0 > 5
+#if M0 > 6
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
+#endif // M0 > 6
+#if M0 > 7
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
+#endif // M0 > 7
+
+        LD_RHS_VFMA_M0xN0(0, a, c);
+
+        lhs_offset += sizeof(DATA_TYPE);
+        rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE);
+    }
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
+
+    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+    zout0 = (0 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout0 = min((uint)(DEPTH_GEMM3D - 1), zout0);
+    zout0 *= (dst_cross_plane_pad * dst_stride_y);
+#if M0 > 1
+    zout1 = (1 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout1 = min((uint)(DEPTH_GEMM3D - 1), zout1);
+    zout1 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 1
+#if M0 > 2
+    zout2 = (2 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout2 = min((uint)(DEPTH_GEMM3D - 1), zout2);
+    zout2 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 2
+#if M0 > 3
+    zout3 = (3 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout3 = min((uint)(DEPTH_GEMM3D - 1), zout3);
+    zout3 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 3
+#if M0 > 4
+    zout4 = (4 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout4 = min((uint)(DEPTH_GEMM3D - 1), zout4);
+    zout4 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 4
+#if M0 > 5
+    zout5 = (5 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout5 = min((uint)(DEPTH_GEMM3D - 1), zout5);
+    zout5 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 5
+#if M0 > 6
+    zout6 = (6 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout6 = min((uint)(DEPTH_GEMM3D - 1), zout6);
+    zout6 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 6
+#if M0 > 7
+    zout7 = (7 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout7 = min((uint)(DEPTH_GEMM3D - 1), zout7);
+    zout7 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 7
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    c0 = c0 * (DATA_TYPE)ALPHA;
+#if M0 > 1
+    c1 = c1 * (DATA_TYPE)ALPHA;
+#endif // M0 > 1
+#if M0 > 2
+    c2 = c2 * (DATA_TYPE)ALPHA;
+#endif // M0 > 2
+#if M0 > 3
+    c3 = c3 * (DATA_TYPE)ALPHA;
+#endif // M0 > 3
+#if M0 > 4
+    c4 = c4 * (DATA_TYPE)ALPHA;
+#endif // M0 > 4
+#if M0 > 5
+    c5 = c5 * (DATA_TYPE)ALPHA;
+#endif // M0 > 5
+#if M0 > 6
+    c6 = c6 * (DATA_TYPE)ALPHA;
+#endif // M0 > 6
+#if M0 > 7
+    c7 = c7 * (DATA_TYPE)ALPHA;
+#endif // M0 > 7
+#endif // defined(ALPHA)
+
+    // Store output block
+    VSTORE(N0)
+    (c0, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y + zout0));
+#if M0 > 1
+    VSTORE(N0)
+    (c1, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y + zout1));
+#endif // M0 > 1
+#if M0 > 2
+    VSTORE(N0)
+    (c2, 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y + zout2));
+#endif // M0 > 2
+#if M0 > 3
+    VSTORE(N0)
+    (c3, 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y + zout3));
+#endif // M0 > 3
+#if M0 > 4
+    VSTORE(N0)
+    (c4, 0, (__global DATA_TYPE *)(dst_addr + 4 * dst_stride_y + zout4));
+#endif // M0 > 4
+#if M0 > 5
+    VSTORE(N0)
+    (c5, 0, (__global DATA_TYPE *)(dst_addr + 5 * dst_stride_y + zout5));
+#endif // M0 > 5
+#if M0 > 6
+    VSTORE(N0)
+    (c6, 0, (__global DATA_TYPE *)(dst_addr + 6 * dst_stride_y + zout6));
+#endif // M0 > 6
+#if M0 > 7
+    VSTORE(N0)
+    (c7, 0, (__global DATA_TYPE *)(dst_addr + 7 * dst_stride_y + zout7));
+#endif // M0 > 7
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N)
 
 #if K0 == 2
 #define ARM_DOT_K0(a, b, c)     \
@@ -1248,15 +2390,19 @@
  *  The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 blocks must NOT be transposed
  *  The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 blocks must be transposed
  *
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (i.e. -DM=52 and -DN=90).
  * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (i.e. -DM0=4, -DN0=8, -DK0=4).
  * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (i.e. -DV0=2)
  * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
  * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.
  * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
  * @note Only the following configurations of M0, N0 and K0 are currently supported:
- *  - M0 = 2, 3, 4, 5, 6, 7, 8
+ *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
  *  - N0 = 2, 3, 4, 8, 16
  *  - K0 = 2, 3, 4, 8, 16
+ *  - V0 >= 1
+ *  - H0 >= 1
  *
  * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:
  *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
@@ -1328,6 +2474,13 @@
 #define RHS_STEP_LOOP (H0)
 #endif // defined(RHS_INTERLEAVE)
 
+#if defined(DUMMY_WORK_ITEMS)
+    if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
     // Compute LHS matrix address
     __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
                                (get_global_id(2) * lhs_stride_z);
@@ -1348,11 +2501,14 @@
     for(int i = 0; i < k; i += K0)
     {
         // Supported cases (M0, K0):
-        // 2,4 - 2,8 - 2,16
-        // 3,4 - 3,8 - 3,16
-        // 4,4 - 4,8 - 4,16
-        // 5,4 - 5,8 - 5,16
-        // 6,4 - 6,8 - 6,16
+        // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
+        // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
+        // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
+        // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
+        // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
+        // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
+        // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
+        // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
         // Load values from LHS matrix
         VEC_DATA_TYPE(DATA_TYPE, K0)
         a0 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_addr + 0 * LHS_STEP_X * sizeof(DATA_TYPE)));
diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl
index 277338b..033b4b4 100644
--- a/src/core/CL/cl_kernels/gemmlowp.cl
+++ b/src/core/CL/cl_kernels/gemmlowp.cl
@@ -1944,7 +1944,7 @@
 #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
 #endif // defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)
 
-#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0)
+#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(M) && defined(N)
 
 #if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
 
@@ -2099,10 +2099,12 @@
 #error "N0 value not supported"
 #endif // N0 conditions
 
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices with QASYMM8 data type.
  *  The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 blocks must NOT be transposed
  *  The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 blocks must be transposed
  *
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (i.e. -DM=52 and -DN=90).
  * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (i.e. -DM0=4, -DN0=8, -DK0=4).
  * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (i.e. -DV0=2)
  * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
@@ -2112,6 +2114,8 @@
  *  - M0 = 2, 3, 4, 5, 6, 7, 8
  *  - N0 = 2, 3, 4, 8, 16
  *  - K0 = 2, 3, 4, 8, 16
+ *  - V0 >= 1
+ *  - H0 >= 1
  *
  * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:
  *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
@@ -2183,6 +2187,13 @@
 #define RHS_STEP_LOOP (H0)
 #endif // defined(RHS_INTERLEAVE)
 
+#if defined(DUMMY_WORK_ITEMS)
+    if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
     // Compute LHS matrix address
     __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X + (get_global_id(1) / V0) * (uint)lhs_stride_y + (get_global_id(
                                    2)
@@ -2423,7 +2434,7 @@
 }
 
 #if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices unsing the dot8 instruction.
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices with QASYMM8 data type using the dot8 instruction.
  *  The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 blocks must NOT be transposed
  *  The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 blocks must be transposed
  *
@@ -2512,6 +2523,556 @@
 #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
 #endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K)
 
+#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(K)
+
+#define CONCAT(a, b) a##b
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#define ARM_DOT1(a, b, c)                                           \
+    ({                                                              \
+        ARM_DOT((uchar4)(a, (uchar3)0), (uchar4)(b, (uchar3)0), c); \
+    })
+#define ARM_DOT2(a, b, c)                                           \
+    ({                                                              \
+        ARM_DOT((uchar4)(a, (uchar2)0), (uchar4)(b, (uchar2)0), c); \
+    })
+#define ARM_DOT3(a, b, c)                                         \
+    ({                                                            \
+        ARM_DOT((uchar4)(a, (uchar)0), (uchar4)(b, (uchar)0), c); \
+    })
+#define ARM_DOT4(a, b, c) \
+    ({                    \
+        ARM_DOT(a, b, c); \
+    })
+#define ARM_DOT8(a, b, c)            \
+    ({                               \
+        ARM_DOT4((a.lo), (b.lo), c); \
+        ARM_DOT4((a.hi), (b.hi), c); \
+    })
+#define ARM_DOT16(a, b, c)           \
+    ({                               \
+        ARM_DOT8((a.lo), (b.lo), c); \
+        ARM_DOT8((a.hi), (b.hi), c); \
+    })
+
+#else // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#define ARM_DOT1(a, b, c)       \
+    ({                          \
+        c += (uint)a.s0 * b.s0; \
+    })
+#define ARM_DOT2(a, b, c)       \
+    ({                          \
+        ARM_DOT1(a, b, c);      \
+        c += (uint)a.s1 * b.s1; \
+    })
+#define ARM_DOT3(a, b, c)       \
+    ({                          \
+        ARM_DOT2(a, b, c);      \
+        c += (uint)a.s2 * b.s2; \
+    })
+#define ARM_DOT4(a, b, c)       \
+    ({                          \
+        ARM_DOT3(a, b, c);      \
+        c += (uint)a.s3 * b.s3; \
+    })
+#define ARM_DOT8(a, b, c)            \
+    ({                               \
+        ARM_DOT4((a.lo), (b.lo), c); \
+        ARM_DOT4((a.hi), (b.hi), c); \
+    })
+#define ARM_DOT16(a, b, c)           \
+    ({                               \
+        ARM_DOT8((a.lo), (b.lo), c); \
+        ARM_DOT8((a.hi), (b.hi), c); \
+    })
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
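+// Example (illustrative): ARM_DOT4((uchar4)(1, 2, 3, 4), (uchar4)(5, 6, 7, 8), c) adds
+// 1*5 + 2*6 + 3*7 + 4*8 = 70 to c, on both the dot8 path and the fallback path above.
+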
+#if N0 == 2
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+    ({                             \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##0), (c.s0));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##1), (c.s1));     \
+    })
+#elif N0 == 3 // N0 == 3
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+    ({                             \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##0), (c.s0));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##1), (c.s1));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##2), (c.s2));     \
+    })
+#elif N0 == 4 // N0 == 4
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+    ({                             \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##0), (c.s0));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##1), (c.s1));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##2), (c.s2));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##3), (c.s3));     \
+    })
+#elif N0 == 8 // N0 == 8
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+    ({                             \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##0), (c.s0));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##1), (c.s1));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##2), (c.s2));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##3), (c.s3));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##4), (c.s4));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##5), (c.s5));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##6), (c.s6));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##7), (c.s7));     \
+    })
+#elif N0 == 16 // N0 == 16
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+    ({                             \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##0), (c.s0));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##1), (c.s1));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##2), (c.s2));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##3), (c.s3));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##4), (c.s4));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##5), (c.s5));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##6), (c.s6));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##7), (c.s7));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##8), (c.s8));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##9), (c.s9));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##A), (c.sA));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##B), (c.sB));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##C), (c.sC));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##D), (c.sD));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##E), (c.sE));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##F), (c.sF));     \
+    })
+#else // N0 not supported
+#error "N0 value not supported"
+#endif // N0 conditions
+
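+// Example (illustrative): with N0=2, ARM_DOT_K0XN0(4, a0, b, c0) expands to
+// ARM_DOT4(a0, b0, c0.s0); ARM_DOT4(a0, b1, c0.s1); i.e. one K0-wide dot product per
+// accumulator lane.
+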
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ *  The LHS matrix is NOT reshaped
+ *  The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
+ *
+ * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64)
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (i.e. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ *  - N0 = 2, 3, 4, 8, 16
+ *  - K0 = 2, 3, 4, 8, 16
+ *  - H0 >= 1
+ *
+ * @note In case the input or output has to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
+ *
+ * @param[in]  lhs_ptr                           Pointer to the LHS matrix. Supported data type: QASYMM8
+ * @param[in]  lhs_stride_x                      Stride of the LHS matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  lhs_stride_y                      Stride of the LHS matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
+ * @param[in]  rhs_ptr                           Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  rhs_stride_x                      Stride of the RHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  rhs_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  rhs_stride_y                      Stride of the RHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  rhs_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data type: S32
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in]  lhs_stride_z                      Stride of the LHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  rhs_stride_z                      Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  lhs_cross_plane_pad               (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in]  dst_cross_plane_pad               (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemmlowp_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
+                                              IMAGE_DECLARATION(rhs),
+                                              IMAGE_DECLARATION(dst),
+                                              uint lhs_stride_z,
+                                              uint rhs_stride_z,
+                                              uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+                                              ,
+                                              uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                              ,
+                                              uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                             )
+{
+    // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+    // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
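+    // Worked example (illustrative): with K0=16, N0=4 and H0=2, the interleaved layout gives
+    // RHS_OFFSET_X=16, RHS_STEP_X=32 and RHS_STEP_LOOP=1; the non-interleaved layout gives
+    // RHS_OFFSET_X=64 (one whole K0xN0 block), RHS_STEP_X=16 and RHS_STEP_LOOP=2.
+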
+    uint x = get_global_id(0);
+    uint y = get_global_id(1);
+    uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((x * N0 >= N) || (y * M0 >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+    // Compute LHS matrix address
+    uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+
+    // Compute RHS matrix address
+    uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); //uint zin0=0,zin1=0,zin2=0,... zin7=0;
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+    // Since we load a 2D tile from a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zin) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+    zin0 = (0 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zin0 = min((uint)(DEPTH_GEMM3D - 1), zin0);
+    zin0 *= (lhs_cross_plane_pad * lhs_stride_y);
+#if M0 > 1
+    zin1 = (1 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zin1 = min((uint)(DEPTH_GEMM3D - 1), zin1);
+    zin1 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 1
+#if M0 > 2
+    zin2 = (2 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zin2 = min((uint)(DEPTH_GEMM3D - 1), zin2);
+    zin2 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 2
+#if M0 > 3
+    zin3 = (3 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zin3 = min((uint)(DEPTH_GEMM3D - 1), zin3);
+    zin3 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 3
+#if M0 > 4
+    zin4 = (4 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zin4 = min((uint)(DEPTH_GEMM3D - 1), zin4);
+    zin4 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 4
+#if M0 > 5
+    zin5 = (5 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zin5 = min((uint)(DEPTH_GEMM3D - 1), zin5);
+    zin5 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 5
+#if M0 > 6
+    zin6 = (6 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zin6 = min((uint)(DEPTH_GEMM3D - 1), zin6);
+    zin6 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 6
+#if M0 > 7
+    zin7 = (7 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zin7 = min((uint)(DEPTH_GEMM3D - 1), zin7);
+    zin7 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 7
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply lhs_stride_z by DEPTH_GEMM3D
+    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Initialize the accumulators
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(uint, N0), c, 0); //VEC_DATA_TYPE(uint, N0)    c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+    for(int i = 0; i < K; i += K0)
+    {
+        // Supported cases (M0, K0):
+        // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
+        // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
+        // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
+        // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
+        // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
+        // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
+        // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
+        // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
+        // Load values from LHS matrix
+        VEC_DATA_TYPE(uchar, K0)
+        a0 = VLOAD(K0)(0, lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0);
+#if M0 > 1
+        VEC_DATA_TYPE(uchar, K0)
+        a1 = VLOAD(K0)(0, lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1);
+#endif // M0 > 1
+#if M0 > 2
+        VEC_DATA_TYPE(uchar, K0)
+        a2 = VLOAD(K0)(0, lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2);
+#endif // M0 > 2
+#if M0 > 3
+        VEC_DATA_TYPE(uchar, K0)
+        a3 = VLOAD(K0)(0, lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3);
+#endif // M0 > 3
+#if M0 > 4
+        VEC_DATA_TYPE(uchar, K0)
+        a4 = VLOAD(K0)(0, lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4);
+#endif // M0 > 4
+#if M0 > 5
+        VEC_DATA_TYPE(uchar, K0)
+        a5 = VLOAD(K0)(0, lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5);
+#endif // M0 > 5
+#if M0 > 6
+        VEC_DATA_TYPE(uchar, K0)
+        a6 = VLOAD(K0)(0, lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6);
+#endif // M0 > 6
+#if M0 > 7
+        VEC_DATA_TYPE(uchar, K0)
+        a7 = VLOAD(K0)(0, lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7);
+#endif // M0 > 7
+
+        // Load values from RHS matrix
+        VEC_DATA_TYPE(uchar, K0)
+        b0 = VLOAD(K0)(0, rhs_ptr + rhs_offset + 0 * RHS_STEP_X);
+        VEC_DATA_TYPE(uchar, K0)
+        b1 = VLOAD(K0)(0, rhs_ptr + rhs_offset + 1 * RHS_STEP_X);
+#if N0 > 2
+        VEC_DATA_TYPE(uchar, K0)
+        b2 = VLOAD(K0)(0, rhs_ptr + rhs_offset + 2 * RHS_STEP_X);
+#endif // N0 > 2
+#if N0 > 3
+        VEC_DATA_TYPE(uchar, K0)
+        b3 = VLOAD(K0)(0, rhs_ptr + rhs_offset + 3 * RHS_STEP_X);
+#endif // N0 > 3
+#if N0 > 4
+        VEC_DATA_TYPE(uchar, K0)
+        b4 = VLOAD(K0)(0, rhs_ptr + rhs_offset + 4 * RHS_STEP_X);
+        VEC_DATA_TYPE(uchar, K0)
+        b5 = VLOAD(K0)(0, rhs_ptr + rhs_offset + 5 * RHS_STEP_X);
+        VEC_DATA_TYPE(uchar, K0)
+        b6 = VLOAD(K0)(0, rhs_ptr + rhs_offset + 6 * RHS_STEP_X);
+        VEC_DATA_TYPE(uchar, K0)
+        b7 = VLOAD(K0)(0, rhs_ptr + rhs_offset + 7 * RHS_STEP_X);
+#endif // N0 > 4
+#if N0 > 8
+        VEC_DATA_TYPE(uchar, K0)
+        b8 = VLOAD(K0)(0, rhs_ptr + rhs_offset + 8 * RHS_STEP_X);
+        VEC_DATA_TYPE(uchar, K0)
+        b9 = VLOAD(K0)(0, rhs_ptr + rhs_offset + 9 * RHS_STEP_X);
+        VEC_DATA_TYPE(uchar, K0)
+        bA = VLOAD(K0)(0, rhs_ptr + rhs_offset + 10 * RHS_STEP_X);
+        VEC_DATA_TYPE(uchar, K0)
+        bB = VLOAD(K0)(0, rhs_ptr + rhs_offset + 11 * RHS_STEP_X);
+        VEC_DATA_TYPE(uchar, K0)
+        bC = VLOAD(K0)(0, rhs_ptr + rhs_offset + 12 * RHS_STEP_X);
+        VEC_DATA_TYPE(uchar, K0)
+        bD = VLOAD(K0)(0, rhs_ptr + rhs_offset + 13 * RHS_STEP_X);
+        VEC_DATA_TYPE(uchar, K0)
+        bE = VLOAD(K0)(0, rhs_ptr + rhs_offset + 14 * RHS_STEP_X);
+        VEC_DATA_TYPE(uchar, K0)
+        bF = VLOAD(K0)(0, rhs_ptr + rhs_offset + 15 * RHS_STEP_X);
+#endif // N0 > 8
+
+        // Accumulate
+        ARM_DOT_K0XN0(K0, a0, b, c0);
+#if M0 > 1
+        ARM_DOT_K0XN0(K0, a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+        ARM_DOT_K0XN0(K0, a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+        ARM_DOT_K0XN0(K0, a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+        ARM_DOT_K0XN0(K0, a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+        ARM_DOT_K0XN0(K0, a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+        ARM_DOT_K0XN0(K0, a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+        ARM_DOT_K0XN0(K0, a7, b, c7);
+#endif // M0 > 7
+
+        lhs_offset += K0;
+        rhs_offset += N0 * RHS_STEP_X * RHS_STEP_LOOP;
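+        // Note: QASYMM8 elements are one byte wide, so these offsets advance by elements
+        // and bytes at once; no sizeof(DATA_TYPE) scaling is needed here.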
+    }
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int) + (y * (uint)M0 * dst_stride_y);
+
+    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+    zout0 = (0 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout0 = min((uint)(DEPTH_GEMM3D - 1), zout0);
+    zout0 *= (dst_cross_plane_pad * dst_stride_y);
+#if M0 > 1
+    zout1 = (1 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout1 = min((uint)(DEPTH_GEMM3D - 1), zout1);
+    zout1 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 1
+#if M0 > 2
+    zout2 = (2 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout2 = min((uint)(DEPTH_GEMM3D - 1), zout2);
+    zout2 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 2
+#if M0 > 3
+    zout3 = (3 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout3 = min((uint)(DEPTH_GEMM3D - 1), zout3);
+    zout3 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 3
+#if M0 > 4
+    zout4 = (4 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout4 = min((uint)(DEPTH_GEMM3D - 1), zout4);
+    zout4 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 4
+#if M0 > 5
+    zout5 = (5 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout5 = min((uint)(DEPTH_GEMM3D - 1), zout5);
+    zout5 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 5
+#if M0 > 6
+    zout6 = (6 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout6 = min((uint)(DEPTH_GEMM3D - 1), zout6);
+    zout6 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 6
+#if M0 > 7
+    zout7 = (7 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout7 = min((uint)(DEPTH_GEMM3D - 1), zout7);
+    zout7 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 7
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Store output block
+    VSTORE(N0)
+    (CONVERT_SAT(c0, VEC_DATA_TYPE(int, N0)), 0, (__global int *)(dst_addr + 0 * dst_stride_y + zout0));
+#if M0 > 1
+    VSTORE(N0)
+    (CONVERT_SAT(c1, VEC_DATA_TYPE(int, N0)), 0, (__global int *)(dst_addr + 1 * dst_stride_y + zout1));
+#endif // M0 > 1
+#if M0 > 2
+    VSTORE(N0)
+    (CONVERT_SAT(c2, VEC_DATA_TYPE(int, N0)), 0, (__global int *)(dst_addr + 2 * dst_stride_y + zout2));
+#endif // M0 > 2
+#if M0 > 3
+    VSTORE(N0)
+    (CONVERT_SAT(c3, VEC_DATA_TYPE(int, N0)), 0, (__global int *)(dst_addr + 3 * dst_stride_y + zout3));
+#endif // M0 > 3
+#if M0 > 4
+    VSTORE(N0)
+    (CONVERT_SAT(c4, VEC_DATA_TYPE(int, N0)), 0, (__global int *)(dst_addr + 4 * dst_stride_y + zout4));
+#endif // M0 > 4
+#if M0 > 5
+    VSTORE(N0)
+    (CONVERT_SAT(c5, VEC_DATA_TYPE(int, N0)), 0, (__global int *)(dst_addr + 5 * dst_stride_y + zout5));
+#endif // M0 > 5
+#if M0 > 6
+    VSTORE(N0)
+    (CONVERT_SAT(c6, VEC_DATA_TYPE(int, N0)), 0, (__global int *)(dst_addr + 6 * dst_stride_y + zout6));
+#endif // M0 > 6
+#if M0 > 7
+    VSTORE(N0)
+    (CONVERT_SAT(c7, VEC_DATA_TYPE(int, N0)), 0, (__global int *)(dst_addr + 7 * dst_stride_y + zout7));
+#endif // M0 > 7
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(K)
+
 #if defined(COLS_A)
 /** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
  *
@@ -3445,4 +4006,4 @@
     // Store the result
     vstore4(res, 0, dst_addr);
 }
-#endif // defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)
+#endif // defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/l2_normalize.cl b/src/core/CL/cl_kernels/l2_normalize.cl
index 5f66efb..70b8b36 100644
--- a/src/core/CL/cl_kernels/l2_normalize.cl
+++ b/src/core/CL/cl_kernels/l2_normalize.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,26 +31,32 @@
  * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F16/F32
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
  * @param[in]  sum_ptr                           Pointer to the source tensor. Supported data types: F16/F32
  * @param[in]  sum_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  sum_step_x                        sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  sum_step_y                        sum_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the source tensor
  * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
  * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  * @param[in]  epsilon                           Epsilon value
  */
 __kernel void l2_normalize_x(
-    VECTOR_DECLARATION(src),
-    VECTOR_DECLARATION(sum),
-    VECTOR_DECLARATION(dst),
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(sum),
+    IMAGE_DECLARATION(dst),
     DATA_TYPE epsilon)
 {
-    Vector src = CONVERT_TO_VECTOR_STRUCT(src);
-    Vector sum = CONVERT_TO_VECTOR_STRUCT(sum);
-    Vector dst = CONVERT_TO_VECTOR_STRUCT(dst);
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image sum = CONVERT_TO_IMAGE_STRUCT(sum);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 
     VEC_DATA_TYPE(DATA_TYPE, 16)
     in = vload16(0, (__global DATA_TYPE *)src.ptr);
diff --git a/src/core/CL/cl_kernels/memset.cl b/src/core/CL/cl_kernels/memset.cl
index 80b34eb..7d8e0ef 100644
--- a/src/core/CL/cl_kernels/memset.cl
+++ b/src/core/CL/cl_kernels/memset.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -41,24 +41,27 @@
  * @param[in] value                                The value used to fill the pages of the tensor
  */
 __kernel void memset(
-    IMAGE_DECLARATION(tensor))
+    TENSOR3D_DECLARATION(tensor))
 {
-    Image tensor = CONVERT_TO_IMAGE_STRUCT(tensor);
+    Tensor3D tensor = CONVERT_TO_TENSOR3D_STRUCT(tensor);
 
-#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+#if defined(VEC_SIZE)
+
+#if defined(LAST_ACCESSED_X)
     // Check if access on width gets out of bounds
     // If it does shift access vector to access elements within bounds
     const int xi = (int)(get_global_id(0) * VEC_SIZE);
     tensor.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * tensor_stride_x;
+#endif // defined(LAST_ACCESSED_X)
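+    // e.g. assuming a 20-element row with VEC_SIZE=16 and LAST_ACCESSED_X=4: the work-item
+    // starting at xi=16 is shifted back by 12 elements, so its 16-wide store covers
+    // elements 4..19 and stays in bounds.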
 
     VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
     data = (DATA_TYPE)(CONSTANT_VALUE);
 
     VSTORE(VEC_SIZE)
     (data, 0, (__global DATA_TYPE *)tensor.ptr);
-#else  // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
+#else  // !defined(VEC_SIZE)
     *((__global DATA_TYPE *)(tensor.ptr)) = (DATA_TYPE)(CONSTANT_VALUE);
-#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+#endif // defined(VEC_SIZE)
 }
 
 #endif // Check for compile time constants
diff --git a/src/core/CL/cl_kernels/pixelwise_mul_float.cl b/src/core/CL/cl_kernels/pixelwise_mul_float.cl
index 9fa540e..d0e04b2 100644
--- a/src/core/CL/cl_kernels/pixelwise_mul_float.cl
+++ b/src/core/CL/cl_kernels/pixelwise_mul_float.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -94,4 +94,52 @@
     // Store result
     vstore16(res, 0, (__global DATA_TYPE_OUT *)out.ptr);
 }
-#endif /* defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_RES) && defined(DATA_TYPE_OUT) */
\ No newline at end of file
+#endif /* defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_RES) && defined(DATA_TYPE_OUT) */
+
+/** Performs a pixelwise multiplication of complex float values
+ *
+ * @param[in]  in1_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in]  in1_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in1_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_stride_z                      Stride of the source image in Z dimension (in bytes)
+ * @param[in]  in1_step_z                        in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]  in2_ptr                           Pointer to the source image. Supported data types: same as @p in1_ptr
+ * @param[in]  in2_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in2_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_stride_z                      Stride of the source image in Z dimension (in bytes)
+ * @param[in]  in2_step_z                        in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: same as @p in1_ptr
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the destination image in Z dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void pixelwise_mul_complex(
+    TENSOR3D_DECLARATION(in1),
+    TENSOR3D_DECLARATION(in2),
+    TENSOR3D_DECLARATION(out))
+{
+    // Get pixels pointer
+    Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+    Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+    // Load data
+    float2 vin1 = vload2(0, (__global float *)in1.ptr);
+    float2 vin2 = vload2(0, (__global float *)in2.ptr);
+
+    // Perform complex multiplication
+    float2 res = { vin1.x * vin2.x - vin1.y * vin2.y, vin1.x * vin2.y + vin2.x * vin1.y };
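+    // e.g. (1 + 2i) * (3 + 4i) = (1*3 - 2*4) + (1*4 + 2*3)i = -5 + 10i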
+
+    // Store result
+    vstore2(res, 0, (__global float *)out.ptr);
+}
diff --git a/src/core/CL/cl_kernels/pooling_layer_quantized.cl b/src/core/CL/cl_kernels/pooling_layer_quantized.cl
index 198250b..2df22d7 100644
--- a/src/core/CL/cl_kernels/pooling_layer_quantized.cl
+++ b/src/core/CL/cl_kernels/pooling_layer_quantized.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,6 +23,21 @@
  */
 #include "helpers.h"
 
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+#define VEC_FLOAT(VEC_SIZE) \
+    VEC_DATA_TYPE(float, VEC_SIZE)
+#define VEC_INT(VEC_SIZE) VEC_DATA_TYPE(int, VEC_SIZE)
+#define VEC_UCHAR(VEC_SIZE) VEC_DATA_TYPE(uchar, VEC_SIZE)
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+#define REQUANTIZE(VEC_SIZE, input, in_offset, out_offset, in_scale, out_scale, res)                                                                                  \
+    {                                                                                                                                                                 \
+        const VEC_FLOAT(VEC_SIZE) in_f32  = (CONVERT(input, VEC_FLOAT(VEC_SIZE)) - (VEC_FLOAT(VEC_SIZE))((float)in_offset)) * (VEC_FLOAT(VEC_SIZE))((float)in_scale); \
+        const VEC_FLOAT(VEC_SIZE) out_f32 = in_f32 / ((VEC_FLOAT(VEC_SIZE))(float)out_scale) + ((VEC_FLOAT(VEC_SIZE))((float)out_offset));                            \
+        res                               = CONVERT_SAT(CONVERT_DOWN(out_f32, VEC_INT(VEC_SIZE)), VEC_UCHAR(VEC_SIZE));                                               \
+    }
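+// Worked example (illustrative): with in_offset=0, in_scale=0.5f, out_scale=0.25f and
+// out_offset=10, an input value of 8 dequantizes to (8 - 0) * 0.5 = 4.0 and requantizes
+// to 4.0 / 0.25 + 10 = 26.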
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+
 #if defined(POOL_AVG)
 #define POOL_OP(x, y) ((x) + (y))
 #else /* defined(POOL_AVG) */
@@ -118,8 +133,22 @@
     res = round(DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)));
 #endif /* defined(POOL_AVG) */
 
-    // Store result
-    *(__global uchar *)output.ptr = convert_uchar(res);
+    uchar result_u8 = convert_uchar(res);
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+
+    const float result_f32   = convert_float(result_u8);
+    const float input_offset = (float)OFFSET_IN1;
+    const float input_scale  = (float)SCALE_IN1;
+    const float scale_out    = (float)SCALE_OUT;
+    const float offset_out   = (float)OFFSET_OUT;
+    const float in_f32       = (result_f32 - input_offset) * input_scale;
+    const float out_f32      = in_f32 / scale_out + offset_out;
+    result_u8                = convert_uchar_sat(convert_int_rte(out_f32));
+
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+
+    *(__global uchar *)output.ptr = result_u8;
 }
 
 int calculate_avg_scale_nhwc(const int pool_size_x, const int pool_size_y, int upper_bound_w, int upper_bound_h,
@@ -217,6 +246,11 @@
     vdata = convert_int8(round(DIV_OP_NHWC(vdata, calculate_avg_scale_nhwc(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y))));
 #endif /* defined(POOL_AVG) */
 
+    uchar8 out_u8 = convert_uchar8(vdata);
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+    REQUANTIZE(8, out_u8, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT, out_u8);
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+
     // Store result
-    vstore8(convert_uchar8(vdata), 0, (__global uchar *)output.ptr);
-}
\ No newline at end of file
+    vstore8(out_u8, 0, (__global uchar *)output.ptr);
+}
diff --git a/src/core/CL/cl_kernels/quantization_layer.cl b/src/core/CL/cl_kernels/quantization_layer.cl
index 80ea540..7ae34ef 100644
--- a/src/core/CL/cl_kernels/quantization_layer.cl
+++ b/src/core/CL/cl_kernels/quantization_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,53 +23,63 @@
  */
 #include "helpers.h"
 
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_RTE_VEC_STR(x, type, size) (convert_##type##size##_rte((x)))
+#define CONVERT_RTE_VEC(x, type, size) CONVERT_RTE_VEC_STR(x, type, size)
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(SCALE) && defined(OFFSET)
+
 /** This performs the quantization of floating point inputs to 8-bit unsigned integers.
  *
- * @param[in]  input_ptr                             Pointer to the source image. Supported data types: F32
- * @param[in]  input_stride_x                        Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                        Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source image
- * @param[out] output_ptr                            Pointer to the destination image. Supported data types: U8
- * @param[in]  output_stride_x                       Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                       Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the destination image
- * @param[in]  min_max_ptr                           Pointer to the min/max vector. Minimum value in position 0, maximum value in position 1. Supported data types: F32.
- * @param[in]  min_max_stride_x                      Stride of the min/max vector in X dimension (in bytes)
- * @param[in]  min_max_step_x                        min_max_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  min_max_offset_first_element_in_bytes The offset of the first element in the min/max vector
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: U8
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
 __kernel void quantization_layer(
     TENSOR3D_DECLARATION(input),
-    TENSOR3D_DECLARATION(output),
-    VECTOR_DECLARATION(min_max))
+    TENSOR3D_DECLARATION(output))
 {
     // Get pixels pointer
     Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
     Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
 
-    // min_max_value.s0 = min, min_max_value.s1 = max
-    const float2 min_max_value = vload2(0, (__global float *)(min_max_ptr + min_max_offset_first_element_in_bytes));
-
-    const float4 vmin   = (float4)min_max_value.s0;
-    const float4 vrange = (float4)(min_max_value.s1 - min_max_value.s0);
+#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+    // Check if the access on the width gets out of bounds
+    // If it does, shift the access vector back so that all elements are read within bounds
+    const int xi = (int)(get_global_id(0) * VEC_SIZE);
+    input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
+    output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
 
     // Load data
-    float4 data = vload4(0, (__global float *)input.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    val = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
 
-    // Map float values to range [0.0, 1.0]
-    data = (data - vmin) / vrange;
+    // Create scale and offset vectors
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) vscale = SCALE;
+    const VEC_DATA_TYPE(int, VEC_SIZE) voffset      = OFFSET;
 
-    // Quantize and saturate
-    uchar4 res = convert_uchar4_sat(data * 256.0f);
+    // Quantize
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    res = CLAMP(CONVERT_RTE_VEC(val / vscale, int, VEC_SIZE) + voffset, 0, 255);
 
-    // Store result
-    vstore4(res, 0, (__global uchar *)output.ptr);
+    // Store result
+    VSTORE(VEC_SIZE)
+    (CONVERT(res, VEC_DATA_TYPE(uchar, VEC_SIZE)), 0, (__global uchar *)output.ptr);
+#else  //!defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
+    *((__global uchar *)(output.ptr)) = (uchar)CLAMP(CONVERT_RTE(((float) * (__global DATA_TYPE *)input.ptr) / ((float)SCALE), int) + (int)OFFSET, 0, 255);
+#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
 }
+#endif //defined(VEC_SIZE) && defined(DATA_TYPE) && defined(SCALE) && defined(OFFSET)
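
The rewritten quantization_layer drops the runtime min/max vector: scale and offset are now compile-time constants, and each element maps through q = clamp(round_rte(x / SCALE) + OFFSET, 0, 255). The LAST_ACCESSED_X shift above handles tails: when the last vector would read past the tensor width, both pointers are moved back so the final (overlapping) vector stays in bounds. A small C++ sketch of the per-element rule, assuming illustrative SCALE/OFFSET values:

    #include <algorithm>
    #include <cfenv>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // q = clamp(round_to_nearest_even(x / scale) + offset, 0, 255)
    static uint8_t quantize(float x, float scale, int offset)
    {
        std::fesetround(FE_TONEAREST); // mirror convert_int_rte
        const int q = int(std::nearbyint(x / scale)) + offset;
        return uint8_t(std::min(255, std::max(0, q)));
    }

    int main()
    {
        // e.g. SCALE = 0.1f, OFFSET = 128: 1.0f -> 10 + 128 = 138
        std::printf("%u\n", quantize(1.0f, 0.1f, 128)); // prints 138
    }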
diff --git a/src/core/CL/cl_kernels/reduction_operation.cl b/src/core/CL/cl_kernels/reduction_operation.cl
index b4ede25..2651123 100644
--- a/src/core/CL/cl_kernels/reduction_operation.cl
+++ b/src/core/CL/cl_kernels/reduction_operation.cl
@@ -307,6 +307,10 @@
     VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
     res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
 
+#if defined(COMPLEX)
+    VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
+    res1 = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 8, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+#endif // defined(COMPLEX)
 #if defined(SUM_SQUARE)
     res *= res;
 #endif // defined(SUM_SQUARE)
@@ -320,6 +324,11 @@
         VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
         in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
 
+#if defined(COMPLEX)
+        VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
+        in1 = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 8, 0, z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+#endif // defined(COMPLEX)
+
 #if defined(ARG_MAX)
         uint16 cond_conv = CONVERT(isgreater(in, res), uint16);
         indx             = select(indx, z, cond_conv);
@@ -334,8 +343,11 @@
 #endif // defined(SUM_SQUARE)
 #if defined(PROD)
         res *= in;
-#else  //!defined(PROD)
+#else //!defined(PROD)
         res += in;
+#if defined(COMPLEX)
+        res1 += in1;
+#endif // defined(COMPLEX)
 #endif //defined(PROD)
 #endif // defined(ARG_MAX) || defined(ARG_MIN)
     }
@@ -348,6 +360,9 @@
     res /= DEPTH;
 #endif // defined(MEAN)
     vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
+#if defined(COMPLEX)
+    vstore16(CONVERT(res1, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)tensor3D_offset(&output, 8, 0, 0));
+#endif // defined(COMPLEX)
 #endif // defined(ARG_MAX) || defined(ARG_MIN)
 }
 #endif /* defined(DEPTH) */
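
The new COMPLEX path keeps a second float16 accumulator (res1/in1) for the next eight complex values, loaded eight elements further along x, and participates only in the additive reduction: complex addition is component-wise, so summing the interleaved real/imaginary lanes independently is exactly a complex sum. A small illustrative C++ check:

    #include <complex>
    #include <cstdio>
    #include <vector>

    int main()
    {
        // Summing complex values is two independent real sums, which is why
        // the kernel can simply do res += in / res1 += in1 on interleaved lanes.
        const std::vector<std::complex<float>> column = {{1.f, 2.f}, {3.f, -1.f}};
        std::complex<float> acc{0.f, 0.f};
        for(const auto &v : column)
        {
            acc += v; // component-wise: re += v.re, im += v.im
        }
        std::printf("(%g, %g)\n", acc.real(), acc.imag()); // prints (4, 1)
    }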
diff --git a/src/core/CL/cl_kernels/winograd_filter_transform.cl b/src/core/CL/cl_kernels/winograd_filter_transform.cl
index 3b9b1e9..3f203b8 100644
--- a/src/core/CL/cl_kernels/winograd_filter_transform.cl
+++ b/src/core/CL/cl_kernels/winograd_filter_transform.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,18 @@
 
 #if defined(SRC_DIM_Z)
 
+#define OUTPUT_ROW_2x2_7x7(out, tmp)                                                                                               \
+    ({                                                                                                                             \
+        out.s0 = -tmp.s0 / 36.f;                                                                                                   \
+        out.s1 = (tmp.s0 - tmp.s1 + tmp.s2 - tmp.s3 + tmp.s4 - tmp.s5 + tmp.s6) / 48.f;                                            \
+        out.s2 = (tmp.s0 + tmp.s1 + tmp.s2 + tmp.s3 + tmp.s4 + tmp.s5 + tmp.s6) / 48.f;                                            \
+        out.s3 = (-tmp.s0 + 2.f * tmp.s1 - 4.f * tmp.s2 + 8.f * tmp.s3 - 16.f * tmp.s4 + 32.f * tmp.s5 - 64.f * tmp.s6) / 120.f;   \
+        out.s4 = (-tmp.s0 - 2.f * tmp.s1 - 4.f * tmp.s2 - 8.f * tmp.s3 - 16.f * tmp.s4 - 32.f * tmp.s5 - 64.f * tmp.s6) / 120.f;   \
+        out.s5 = (tmp.s0 - 3.f * tmp.s1 + 9.f * tmp.s2 - 27.f * tmp.s3 + 81.f * tmp.s4 - 243.f * tmp.s5 + 729.f * tmp.s6) / 720.f; \
+        out.s6 = (tmp.s0 + 3.f * tmp.s1 + 9.f * tmp.s2 + 27.f * tmp.s3 + 81.f * tmp.s4 + 243.f * tmp.s5 + 729.f * tmp.s6) / 720.f; \
+        out.s7 = tmp.s6;                                                                                                           \
+    })
+
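Read off the macro, each assignment applies one row of the F(2x2, 7x7) filter-transform matrix G to the seven filter taps held in tmp. A sketch of the matrix these coefficients encode, derived directly from the expressions above, with the per-row divisors factored out:

$$
G = \operatorname{diag}\!\left(-\tfrac{1}{36}, \tfrac{1}{48}, \tfrac{1}{48}, \tfrac{1}{120}, \tfrac{1}{120}, \tfrac{1}{720}, \tfrac{1}{720}, 1\right)
\begin{pmatrix}
1 & 0 & 0 & 0 & 0 & 0 & 0 \\
1 & -1 & 1 & -1 & 1 & -1 & 1 \\
1 & 1 & 1 & 1 & 1 & 1 & 1 \\
-1 & 2 & -4 & 8 & -16 & 32 & -64 \\
-1 & -2 & -4 & -8 & -16 & -32 & -64 \\
1 & -3 & 9 & -27 & 81 & -243 & 729 \\
1 & 3 & 9 & 27 & 81 & 243 & 729 \\
0 & 0 & 0 & 0 & 0 & 0 & 1
\end{pmatrix}
$$
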
 /** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NCHW and the output tile is 2x2/2x1/1x2
  *
  * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
@@ -1045,6 +1057,306 @@
     *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7;
 #endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
 }
+/** This OpenCL kernel performs Winograd filter transform 7x7/7x1/1x7 when the data layout is NHWC and the output tile is 2x2/2x1/1x2
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note If this kernel is used to perform Winograd filter transform 7x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd filter transform 1x7, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_2x2_7x7_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
+
+    const __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(0) * sizeof(DATA_TYPE) + get_global_id(1) * src_step_y + get_global_id(2) * src_step_w;
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    // Load the values from the input tensor
+    DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+    DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+    DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+    DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
+    DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
+    DATA_TYPE w05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
+    DATA_TYPE w06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
+#else  // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    // Load the values from the input tensor
+    DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+    DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+    DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+    DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+    DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+    DATA_TYPE w05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
+    DATA_TYPE w06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_y));
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    DATA_TYPE w10 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 0 * src_stride_y));
+    DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 1 * src_stride_y));
+    DATA_TYPE w12 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 2 * src_stride_y));
+    DATA_TYPE w13 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 3 * src_stride_y));
+    DATA_TYPE w14 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 4 * src_stride_y));
+    DATA_TYPE w15 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 5 * src_stride_y));
+    DATA_TYPE w16 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 6 * src_stride_y));
+
+    DATA_TYPE w20 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 0 * src_stride_y));
+    DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 1 * src_stride_y));
+    DATA_TYPE w22 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 2 * src_stride_y));
+    DATA_TYPE w23 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 3 * src_stride_y));
+    DATA_TYPE w24 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 4 * src_stride_y));
+    DATA_TYPE w25 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 5 * src_stride_y));
+    DATA_TYPE w26 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 6 * src_stride_y));
+
+    DATA_TYPE w30 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 0 * src_stride_y));
+    DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 1 * src_stride_y));
+    DATA_TYPE w32 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 2 * src_stride_y));
+    DATA_TYPE w33 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 3 * src_stride_y));
+    DATA_TYPE w34 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 4 * src_stride_y));
+    DATA_TYPE w35 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 5 * src_stride_y));
+    DATA_TYPE w36 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 6 * src_stride_y));
+
+    DATA_TYPE w40 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 0 * src_stride_y));
+    DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 1 * src_stride_y));
+    DATA_TYPE w42 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 2 * src_stride_y));
+    DATA_TYPE w43 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 3 * src_stride_y));
+    DATA_TYPE w44 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 4 * src_stride_y));
+    DATA_TYPE w45 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 5 * src_stride_y));
+    DATA_TYPE w46 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 6 * src_stride_y));
+
+    DATA_TYPE w50 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 0 * src_stride_y));
+    DATA_TYPE w51 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 1 * src_stride_y));
+    DATA_TYPE w52 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 2 * src_stride_y));
+    DATA_TYPE w53 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 3 * src_stride_y));
+    DATA_TYPE w54 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 4 * src_stride_y));
+    DATA_TYPE w55 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 5 * src_stride_y));
+    DATA_TYPE w56 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 6 * src_stride_y));
+
+    DATA_TYPE w60 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 0 * src_stride_y));
+    DATA_TYPE w61 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 1 * src_stride_y));
+    DATA_TYPE w62 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 2 * src_stride_y));
+    DATA_TYPE w63 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 3 * src_stride_y));
+    DATA_TYPE w64 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 4 * src_stride_y));
+    DATA_TYPE w65 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 5 * src_stride_y));
+    DATA_TYPE w66 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 6 * src_stride_y));
+
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    tmp = 0.0f;
+
+    // Row 0
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out0 = 0.0f;
+
+    out0.s0 = -w00 / 36.0f;
+    out0.s1 = (w00 - w01 + w02 - w03 + w04 - w05 + w06) / 48.f;
+    out0.s2 = (w00 + w01 + w02 + w03 + w04 + w05 + w06) / 48.f;
+    out0.s3 = (-w00 + 2.f * w01 - 4.f * w02 + 8.f * w03 - 16.f * w04 + 32.f * w05 - 64.f * w06) / 120.f;
+    out0.s4 = (-w00 - 2.f * w01 - 4.f * w02 - 8.f * w03 - 16.f * w04 - 32.f * w05 - 64.f * w06) / 120.f;
+    out0.s5 = (w00 - 3.f * w01 + 9.f * w02 - 27.f * w03 + 81.f * w04 - 243.f * w05 + 729.f * w06) / 720.f;
+    out0.s6 = (w00 + 3.f * w01 + 9.f * w02 + 27.f * w03 + 81.f * w04 + 243.f * w05 + 729.f * w06) / 720.f;
+    out0.s7 = w06;
+
+    out0 /= (VEC_DATA_TYPE(DATA_TYPE, 8)) - 36.f;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+    // Row 1
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out1 = 0.0f;
+
+    tmp.s0 = (w00 - w10 + w20 - w30 + w40 - w50 + w60) / 48.f;
+    tmp.s1 = (w01 - w11 + w21 - w31 + w41 - w51 + w61) / 48.f;
+    tmp.s2 = (w02 - w12 + w22 - w32 + w42 - w52 + w62) / 48.f;
+    tmp.s3 = (w03 - w13 + w23 - w33 + w43 - w53 + w63) / 48.f;
+    tmp.s4 = (w04 - w14 + w24 - w34 + w44 - w54 + w64) / 48.f;
+    tmp.s5 = (w05 - w15 + w25 - w35 + w45 - w55 + w65) / 48.f;
+    tmp.s6 = (w06 - w16 + w26 - w36 + w46 - w56 + w66) / 48.f;
+
+    OUTPUT_ROW_2x2_7x7(out1, tmp);
+
+    // Row 2
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out2 = 0.0f;
+
+    tmp.s0 = (w00 + w10 + w20 + w30 + w40 + w50 + w60) / 48.f;
+    tmp.s1 = (w01 + w11 + w21 + w31 + w41 + w51 + w61) / 48.f;
+    tmp.s2 = (w02 + w12 + w22 + w32 + w42 + w52 + w62) / 48.f;
+    tmp.s3 = (w03 + w13 + w23 + w33 + w43 + w53 + w63) / 48.f;
+    tmp.s4 = (w04 + w14 + w24 + w34 + w44 + w54 + w64) / 48.f;
+    tmp.s5 = (w05 + w15 + w25 + w35 + w45 + w55 + w65) / 48.f;
+    tmp.s6 = (w06 + w16 + w26 + w36 + w46 + w56 + w66) / 48.f;
+
+    OUTPUT_ROW_2x2_7x7(out2, tmp);
+
+    // Row 3
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out3 = 0.0f;
+
+    tmp.s0 = (-w00 + 2.f * w10 - 4.f * w20 + 8.f * w30 - 16.f * w40 + 32.f * w50 - 64.f * w60) / 120.f;
+    tmp.s1 = (-w01 + 2.f * w11 - 4.f * w21 + 8.f * w31 - 16.f * w41 + 32.f * w51 - 64.f * w61) / 120.f;
+    tmp.s2 = (-w02 + 2.f * w12 - 4.f * w22 + 8.f * w32 - 16.f * w42 + 32.f * w52 - 64.f * w62) / 120.f;
+    tmp.s3 = (-w03 + 2.f * w13 - 4.f * w23 + 8.f * w33 - 16.f * w43 + 32.f * w53 - 64.f * w63) / 120.f;
+    tmp.s4 = (-w04 + 2.f * w14 - 4.f * w24 + 8.f * w34 - 16.f * w44 + 32.f * w54 - 64.f * w64) / 120.f;
+    tmp.s5 = (-w05 + 2.f * w15 - 4.f * w25 + 8.f * w35 - 16.f * w45 + 32.f * w55 - 64.f * w65) / 120.f;
+    tmp.s6 = (-w06 + 2.f * w16 - 4.f * w26 + 8.f * w36 - 16.f * w46 + 32.f * w56 - 64.f * w66) / 120.f;
+
+    OUTPUT_ROW_2x2_7x7(out3, tmp);
+
+    // Row 4
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out4 = 0.0f;
+
+    tmp.s0 = (-w00 - 2.f * w10 - 4.f * w20 - 8.f * w30 - 16.f * w40 - 32.f * w50 - 64.f * w60) / 120.f;
+    tmp.s1 = (-w01 - 2.f * w11 - 4.f * w21 - 8.f * w31 - 16.f * w41 - 32.f * w51 - 64.f * w61) / 120.f;
+    tmp.s2 = (-w02 - 2.f * w12 - 4.f * w22 - 8.f * w32 - 16.f * w42 - 32.f * w52 - 64.f * w62) / 120.f;
+    tmp.s3 = (-w03 - 2.f * w13 - 4.f * w23 - 8.f * w33 - 16.f * w43 - 32.f * w53 - 64.f * w63) / 120.f;
+    tmp.s4 = (-w04 - 2.f * w14 - 4.f * w24 - 8.f * w34 - 16.f * w44 - 32.f * w54 - 64.f * w64) / 120.f;
+    tmp.s5 = (-w05 - 2.f * w15 - 4.f * w25 - 8.f * w35 - 16.f * w45 - 32.f * w55 - 64.f * w65) / 120.f;
+    tmp.s6 = (-w06 - 2.f * w16 - 4.f * w26 - 8.f * w36 - 16.f * w46 - 32.f * w56 - 64.f * w66) / 120.f;
+
+    OUTPUT_ROW_2x2_7x7(out4, tmp);
+
+    // Row 5
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out5 = 0.0f;
+
+    tmp.s0 = (w00 - 3.f * w10 + 9.f * w20 - 27.f * w30 + 81.f * w40 - 243.f * w50 + 729.f * w60) / 720.f;
+    tmp.s1 = (w01 - 3.f * w11 + 9.f * w21 - 27.f * w31 + 81.f * w41 - 243.f * w51 + 729.f * w61) / 720.f;
+    tmp.s2 = (w02 - 3.f * w12 + 9.f * w22 - 27.f * w32 + 81.f * w42 - 243.f * w52 + 729.f * w62) / 720.f;
+    tmp.s3 = (w03 - 3.f * w13 + 9.f * w23 - 27.f * w33 + 81.f * w43 - 243.f * w53 + 729.f * w63) / 720.f;
+    tmp.s4 = (w04 - 3.f * w14 + 9.f * w24 - 27.f * w34 + 81.f * w44 - 243.f * w54 + 729.f * w64) / 720.f;
+    tmp.s5 = (w05 - 3.f * w15 + 9.f * w25 - 27.f * w35 + 81.f * w45 - 243.f * w55 + 729.f * w65) / 720.f;
+    tmp.s6 = (w06 - 3.f * w16 + 9.f * w26 - 27.f * w36 + 81.f * w46 - 243.f * w56 + 729.f * w66) / 720.f;
+
+    OUTPUT_ROW_2x2_7x7(out5, tmp);
+
+    // Row 6
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out6 = 0.0f;
+
+    tmp.s0 = (w00 + 3.f * w10 + 9.f * w20 + 27.f * w30 + 81.f * w40 + 243.f * w50 + 729.f * w60) / 720.f;
+    tmp.s1 = (w01 + 3.f * w11 + 9.f * w21 + 27.f * w31 + 81.f * w41 + 243.f * w51 + 729.f * w61) / 720.f;
+    tmp.s2 = (w02 + 3.f * w12 + 9.f * w22 + 27.f * w32 + 81.f * w42 + 243.f * w52 + 729.f * w62) / 720.f;
+    tmp.s3 = (w03 + 3.f * w13 + 9.f * w23 + 27.f * w33 + 81.f * w43 + 243.f * w53 + 729.f * w63) / 720.f;
+    tmp.s4 = (w04 + 3.f * w14 + 9.f * w24 + 27.f * w34 + 81.f * w44 + 243.f * w54 + 729.f * w64) / 720.f;
+    tmp.s5 = (w05 + 3.f * w15 + 9.f * w25 + 27.f * w35 + 81.f * w45 + 243.f * w55 + 729.f * w65) / 720.f;
+    tmp.s6 = (w06 + 3.f * w16 + 9.f * w26 + 27.f * w36 + 81.f * w46 + 243.f * w56 + 729.f * w66) / 720.f;
+
+    OUTPUT_ROW_2x2_7x7(out6, tmp);
+
+    // Row 7
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out7 = 0.0f;
+
+    tmp.s0 = w60;
+    tmp.s1 = w61;
+    tmp.s2 = w62;
+    tmp.s3 = w63;
+    tmp.s4 = w64;
+    tmp.s5 = w65;
+    tmp.s6 = w66;
+
+    OUTPUT_ROW_2x2_7x7(out7, tmp);
+
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+    int x0 = get_global_id(2); // idx filter
+    int y0 = get_global_id(0); // idx channel
+
+    // Get output address
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y;
+
+    // Store the values across the channels
+    *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+    *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+    *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+    *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+    *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
+    *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
+    *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6;
+    *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)  = out1.s0;
+    *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)  = out1.s1;
+    *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2;
+    *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3;
+    *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out1.s4;
+    *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5;
+    *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6;
+    *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7;
+    *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0;
+    *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1;
+    *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2;
+    *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3;
+    *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4;
+    *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5;
+    *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6;
+    *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7;
+    *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0;
+    *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1;
+    *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2;
+    *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3;
+    *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4;
+    *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5;
+    *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6;
+    *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7;
+    *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0;
+    *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1;
+    *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2;
+    *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3;
+    *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4;
+    *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5;
+    *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6;
+    *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7;
+    *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0;
+    *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1;
+    *(__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z) = out5.s2;
+    *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3;
+    *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4;
+    *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5;
+    *(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6;
+    *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7;
+    *(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0;
+    *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1;
+    *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2;
+    *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3;
+    *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4;
+    *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5;
+    *(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6;
+    *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7;
+    *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0;
+    *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1;
+    *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2;
+    *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3;
+    *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4;
+    *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5;
+    *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6;
+    *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+}
 #endif // defined(SRC_DIM_Z)
 
 #if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
@@ -1292,6 +1604,55 @@
                                            dst_step_z,
                                            dst_offset_first_element_in_bytes);
 }
+
+/** This OpenCL kernel performs Winograd filter transform 7x1 when the data layout is NHWC and the output tile is 2x1
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform the Winograd filter transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_2x1_7x1_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    winograd_filter_transform_2x2_7x7_nhwc(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_offset_first_element_in_bytes);
+}
 #endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
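
The 7x1 wrapper adds no logic of its own: it forwards every tensor argument to winograd_filter_transform_2x2_7x7_nhwc, and the compile-time defines select the separable path inside it. A hedged host-side sketch of the kind of build options the @note blocks describe (the option string and function name are illustrative, not the library's actual invocation):

    #include <string>

    // Options that would select the horizontal 7x1 filter-transform variant.
    std::string winograd_7x1_build_opts()
    {
        std::string opts;
        opts += " -DDATA_TYPE=float";                      // element type
        opts += " -DSRC_DIM_Z=64";                         // batch split along Z (see @note)
        opts += " -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL"; // pick the 7x1 path
        return opts; // e.g. passed to clBuildProgram() for this .cl file
    }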
 
 #if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
@@ -1539,4 +1900,53 @@
                                            dst_step_z,
                                            dst_offset_first_element_in_bytes);
 }
+
+/** This OpenCL kernel performs Winograd filter transform 1x7 when the data layout is NHWC and the output tile is 1x2
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform the Winograd filter transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_1x2_1x7_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    winograd_filter_transform_2x2_7x7_nhwc(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_offset_first_element_in_bytes);
+}
 #endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
diff --git a/src/core/CL/cl_kernels/winograd_input_transform.cl b/src/core/CL/cl_kernels/winograd_input_transform.cl
index 34bf290..630a78b 100644
--- a/src/core/CL/cl_kernels/winograd_input_transform.cl
+++ b/src/core/CL/cl_kernels/winograd_input_transform.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,6 +43,24 @@
         out.s7 = tmp.s7 - tmp.s1 + 5.25f * tmp.s3 - 5.25f * tmp.s5; \
     })
 
+#define OUTPUT_ROW_2x2_7x7(out, tmp, comm_fact)                                                    \
+    ({                                                                                             \
+        comm_fact.s0 = 36.0f * tmp.s2 - 13.0f * tmp.s4 + tmp.s6;                                   \
+        comm_fact.s1 = 36.0f * tmp.s1 - 13.0f * tmp.s3 + 1.0f * tmp.s5;                            \
+        comm_fact.s2 = 9.0f * tmp.s2 - 10.0f * tmp.s4 + tmp.s6;                                    \
+        comm_fact.s3 = 18.0f * tmp.s1 - 20.0f * tmp.s3 + 2.0f * tmp.s5;                            \
+        comm_fact.s4 = 4.0f * tmp.s2 - 5.0f * tmp.s4 + tmp.s6;                                     \
+        comm_fact.s5 = 12.0f * tmp.s1 - 15.0f * tmp.s3 + 3.0f * tmp.s5;                            \
+        out.s0       = -36.0f * tmp.s0 + 49.0f * tmp.s2 - 14.0f * tmp.s4 + tmp.s6;                 \
+        out.s1       = comm_fact.s0 - comm_fact.s1;                                                \
+        out.s2       = comm_fact.s0 + comm_fact.s1;                                                \
+        out.s3       = comm_fact.s2 - comm_fact.s3;                                                \
+        out.s4       = comm_fact.s2 + comm_fact.s3;                                                \
+        out.s5       = comm_fact.s4 - comm_fact.s5;                                                \
+        out.s6       = comm_fact.s4 + comm_fact.s5;                                                \
+        out.s7       = -36.0f * tmp.s1 + 0.0f * tmp.s2 + 49.0f * tmp.s3 - 14.0f * tmp.s5 + tmp.s7; \
+    })
+
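Expanding comm_fact, this macro evaluates one row block of an 8-point input transform; the factoring reuses six common sub-expressions across the six middle outputs to save multiplications. Written out as a matrix acting on tmp (derived directly from the coefficients above), out = B^T tmp with:

$$
B^T = \begin{pmatrix}
-36 & 0 & 49 & 0 & -14 & 0 & 1 & 0 \\
0 & -36 & 36 & 13 & -13 & -1 & 1 & 0 \\
0 & 36 & 36 & -13 & -13 & 1 & 1 & 0 \\
0 & -18 & 9 & 20 & -10 & -2 & 1 & 0 \\
0 & 18 & 9 & -20 & -10 & 2 & 1 & 0 \\
0 & -12 & 4 & 15 & -5 & -3 & 1 & 0 \\
0 & 12 & 4 & -15 & -5 & 3 & 1 & 0 \\
0 & -36 & 0 & 49 & 0 & -14 & 0 & 1
\end{pmatrix}
$$
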
 #if defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
 /** This OpenCL kernel computes the input transform when the kernel size is 3x3/3x1 or 1x3 and the output tile is 2x2/2x1 or 1x2
  *
@@ -85,7 +103,7 @@
     const int z = get_global_id(2) % SRC_DEPTH;
     const int b = get_global_id(2) / SRC_DEPTH;
 #else  /* defined(SRC_DEPTH) */
-    const int z = get_global_id(2);
+    const int z              = get_global_id(2);
 #endif /* defined(SRC_DEPTH) */
 
     // Compute input address
@@ -221,7 +239,7 @@
     const int z = (get_global_id(2) * 2) % SRC_DEPTH;
     const int b = (get_global_id(2) * 2) / SRC_DEPTH;
 #else  /* defined(SRC_DEPTH) */
-    const int z = get_global_id(2) * 2;
+    const int       z        = get_global_id(2) * 2;
 #endif /* defined(SRC_DEPTH) */
 
     // Compute input address
@@ -403,7 +421,7 @@
     const int z = get_global_id(2) % SRC_DEPTH;
     const int b = get_global_id(2) / SRC_DEPTH;
 #else  /* defined(SRC_DEPTH) */
-    const int z = get_global_id(2);
+    const int       z        = get_global_id(2);
 #endif /* defined(SRC_DEPTH) */
 
     // Compute input address
@@ -430,7 +448,7 @@
     VEC_DATA_TYPE(DATA_TYPE, 4)
     d00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
     VEC_DATA_TYPE(DATA_TYPE, 2)
-    d01 = vload2(2, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+    d01                                        = vload2(2, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
 #endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
 
     DATA_TYPE out0 = 0.0f;
@@ -495,7 +513,7 @@
 #if defined(SRC_DEPTH)
     __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w);
 #else  /* defined(SRC_DEPTH) */
-    __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y);
+    __global DATA_TYPE *dst_addr               = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y);
 #endif /* defined(SRC_DEPTH) */
 
     uint dst_plane_stride = dst_stride_z / sizeof(DATA_TYPE);
@@ -728,14 +746,14 @@
     const int z = get_global_id(2) % SRC_DEPTH;
     const int b = get_global_id(2) / SRC_DEPTH;
 #else  /* defined(SRC_DEPTH) */
-    const int z = get_global_id(2);
+    const int                                z = get_global_id(2);
 #endif /* defined(SRC_DEPTH) */
 
     // Compute input address
 #if defined(SRC_DEPTH)
     __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
 #else  /* defined(SRC_DEPTH) */
-    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
+    __global uchar *src_addr                   = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
 #endif /* defined(SRC_DEPTH) */
     src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
 
@@ -933,7 +951,7 @@
     const int z = get_global_id(2) % NUM_TILES_Y;
     const int b = get_global_id(2) / NUM_TILES_Y;
 #else  /* defined(NUM_TILES_Y) */
-    const int z = get_global_id(2);
+    const int z               = get_global_id(2);
 #endif /* defined(NUM_TILES_Y) */
 
 #if defined(NUM_TILES_Y)
@@ -1010,8 +1028,8 @@
     DATA_TYPE d04 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);
     DATA_TYPE d05 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);
 #else  // !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-    int4 z_coords0 = (int4)(z * OUTPUT_TILE_H) + (int4)(0, 1, 2, 3) - (int4)PAD_TOP;
-    int2 z_coords1 = (int2)(z * OUTPUT_TILE_H) + (int2)(4, 5) - (int2)PAD_TOP;
+    int4            z_coords0 = (int4)(z * OUTPUT_TILE_H) + (int4)(0, 1, 2, 3) - (int4)PAD_TOP;
+    int2            z_coords1 = (int2)(z * OUTPUT_TILE_H) + (int2)(4, 5) - (int2)PAD_TOP;
 
     valid_y0 = select((int4)y_coord0.s0, (int4) - 1, z_coords0 < (int4)0);
     valid_y1 = select((int2)y_coord0.s0, (int2) - 1, z_coords1 < (int2)0);
@@ -1021,12 +1039,12 @@
     z_coords0 = clamp((int4)z_coords0, (int4)0, (int4)((int)SRC_DIM_2 - 1));
     z_coords1 = clamp((int2)z_coords1, (int2)0, (int2)((int)SRC_DIM_2 - 1));
 
-    DATA_TYPE d00 = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coords0.s0 * src_stride_z);
-    DATA_TYPE d01 = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coords0.s1 * src_stride_z);
-    DATA_TYPE d02 = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coords0.s2 * src_stride_z);
-    DATA_TYPE d03 = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coords0.s3 * src_stride_z);
-    DATA_TYPE d04 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coords1.s0 * src_stride_z);
-    DATA_TYPE d05 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coords1.s1 * src_stride_z);
+    DATA_TYPE d00                              = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coords0.s0 * src_stride_z);
+    DATA_TYPE d01                              = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coords0.s1 * src_stride_z);
+    DATA_TYPE d02                              = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coords0.s2 * src_stride_z);
+    DATA_TYPE d03                              = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coords0.s3 * src_stride_z);
+    DATA_TYPE d04                              = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coords1.s0 * src_stride_z);
+    DATA_TYPE d05                              = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coords1.s1 * src_stride_z);
 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
 
     DATA_TYPE out0 = 16.0f * d00 - 20.0f * d02 + 4.0f * d04;
@@ -1096,7 +1114,7 @@
 #if defined(NUM_TILES_Y)
     __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w);
 #else  /* defined(NUM_TILES_Y) */
-    __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y);
+    __global DATA_TYPE *dst_addr               = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y);
 #endif /* defined(NUM_TILES_Y) */
 
     uint dst_plane_stride = dst_stride_z / sizeof(DATA_TYPE);
@@ -1333,14 +1351,14 @@
     const int z = get_global_id(2) % NUM_TILES_Y;
     const int b = get_global_id(2) / NUM_TILES_Y;
 #else  /* defined(NUM_TILES_Y) */
-    const int z = get_global_id(2);
+    const int                                z = get_global_id(2);
 #endif /* defined(NUM_TILES_Y) */
 
     // Compute input address
 #if defined(NUM_TILES_Y)
     __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + b * src_stride_w;
 #else  /* defined(NUM_TILES_Y) */
-    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE);
+    __global uchar *src_addr                   = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE);
 #endif /* defined(NUM_TILES_Y) */
 
 #if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
@@ -1573,6 +1591,370 @@
     OUTPUT_ROW_4x4_5x5(out5, tmp5, comm_fact0);
     OUTPUT_ROW_4x4_5x5(out6, tmp6, comm_fact0);
     OUTPUT_ROW_4x4_5x5(out7, tmp7, comm_fact0);
+#endif                                           // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+    // Store values across the channels
+#if defined(NUM_TILES_Y)
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
+#else  /* NUM_TILES_Y */
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y;
+#endif /* NUM_TILES_Y */
+
+    *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out0.s0;
+    *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out0.s1;
+    *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out0.s2;
+    *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out0.s3;
+    *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out0.s4;
+    *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out0.s5;
+    *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out0.s6;
+    *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out0.s7;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z))  = out1.s0;
+    *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z))  = out1.s1;
+    *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out1.s2;
+    *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out1.s3;
+    *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out1.s4;
+    *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out1.s5;
+    *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out1.s6;
+    *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out1.s7;
+    *((__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z)) = out2.s0;
+    *((__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z)) = out2.s1;
+    *((__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z)) = out2.s2;
+    *((__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z)) = out2.s3;
+    *((__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z)) = out2.s4;
+    *((__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z)) = out2.s5;
+    *((__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z)) = out2.s6;
+    *((__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z)) = out2.s7;
+    *((__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z)) = out3.s0;
+    *((__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z)) = out3.s1;
+    *((__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z)) = out3.s2;
+    *((__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z)) = out3.s3;
+    *((__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z)) = out3.s4;
+    *((__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z)) = out3.s5;
+    *((__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z)) = out3.s6;
+    *((__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z)) = out3.s7;
+    *((__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z)) = out4.s0;
+    *((__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z)) = out4.s1;
+    *((__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z)) = out4.s2;
+    *((__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z)) = out4.s3;
+    *((__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z)) = out4.s4;
+    *((__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z)) = out4.s5;
+    *((__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z)) = out4.s6;
+    *((__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z)) = out4.s7;
+    *((__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z)) = out5.s0;
+    *((__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z)) = out5.s1;
+    *((__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z)) = out5.s2;
+    *((__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z)) = out5.s3;
+    *((__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z)) = out5.s4;
+    *((__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z)) = out5.s5;
+    *((__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z)) = out5.s6;
+    *((__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z)) = out5.s7;
+    *((__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z)) = out6.s0;
+    *((__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z)) = out6.s1;
+    *((__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z)) = out6.s2;
+    *((__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z)) = out6.s3;
+    *((__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z)) = out6.s4;
+    *((__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z)) = out6.s5;
+    *((__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z)) = out6.s6;
+    *((__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z)) = out6.s7;
+    *((__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z)) = out7.s0;
+    *((__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z)) = out7.s1;
+    *((__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z)) = out7.s2;
+    *((__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z)) = out7.s3;
+    *((__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z)) = out7.s4;
+    *((__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z)) = out7.s5;
+    *((__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z)) = out7.s6;
+    *((__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z)) = out7.s7;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
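
The 64 scalar stores above lay the transformed tile out channel-major: element (i, j) of the 8x8 tile lands in z-plane 8*i + j of the destination, one value per plane. An illustrative C++ fold of the same store pattern (out stands in for out0..out7; the names are not kernel code):

    #include <cstddef>

    static void store_tile(unsigned char *dst_addr, size_t dst_stride_z,
                           const float out[8][8])
    {
        for(int i = 0; i < 8; ++i)
        {
            for(int j = 0; j < 8; ++j)
            {
                // tile element (i, j) -> z-plane 8*i + j
                *reinterpret_cast<float *>(dst_addr + (8 * i + j) * dst_stride_z) = out[i][j];
            }
        }
    }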
+
+/** This OpenCL kernel computes the input transform when the kernel size is 7x7/7x1/1x7 and the output tile is 2x2/2x1/1x2 when the data layout is NHWC
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=7).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM_1 (e.g. -DSRC_DIM_1=112)
+ * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note If this kernel is used to perform Winograd input transform 7x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x7, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+#if defined(NUM_TILES_Y)
+    const int z = get_global_id(2) % NUM_TILES_Y;
+    const int b = get_global_id(2) / NUM_TILES_Y;
+#else  /* defined(NUM_TILES_Y) */
+    const int       z        = get_global_id(2);
+#endif /* defined(NUM_TILES_Y) */
+
+    // Compute input address
+#if defined(NUM_TILES_Y)
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + b * src_stride_w;
+#else  /* defined(NUM_TILES_Y) */
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE);
+#endif /* defined(NUM_TILES_Y) */
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+
+    // Clamp coordinates. This clamp is valid for all rows
+    int8 y_coord = (int8)(y * OUTPUT_TILE_W) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_LEFT;
+    y_coord      = clamp(y_coord, (int8) - 1, (int8)SRC_DIM_1);
+
+    // Clamp coordinates. This clamp is valid for all columns
+    int  z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 0;
+    int8 valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);                    // If z < 0, set y to -1
+    valid_y      = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2); // If z >= SRC_DIM_2, set y to SRC_DIM_2
+    z_coord      = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
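+
+    // Out-of-range coordinates are redirected, branch-free, to -1 or SRC_DIM_1/SRC_DIM_2,
+    // positions expected to land in the tensor's border padding, so the scalar loads
+    // below need no bounds checks.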
+
+    // Load the input tile
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    in_row0;
+    in_row0.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row0.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row0.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row0.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row0.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row0.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row0.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row0.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z);
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    tmp0 = ((VEC_DATA_TYPE(DATA_TYPE, 8)) - 36.0f) * in_row0;
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    comm_fact0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;
+
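+    // OUTPUT_ROW_2x2_7x7 (presumably defined earlier in this file) expands the tmp
+    // vector into the eight transformed values of this row; comm_fact0 is scratch
+    // storage the macro reuses.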
+    OUTPUT_ROW_2x2_7x7(out0, tmp0, comm_fact0);
+
+#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+    // We can skip the border clamping along the y dimension as we cannot read out-of-bound in case of 1x7 kernels
+    int y_coord = y * (int)OUTPUT_TILE_W;
+
+    // Row0
+    // Clamp the z coordinates and redirect out-of-bound rows to the border padding
+    int8 z_coord = (int8)(z * OUTPUT_TILE_H) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_TOP;
+    int8 valid_y = select((int8)y_coord, (int8) - 1, z_coord < (int8)0);         // If z < 0, set y to -1
+    valid_y      = select(valid_y, (int8)SRC_DIM_1, z_coord >= (int8)SRC_DIM_2); // If z >= SRC_DIM_2, set y to SRC_DIM_2
+    z_coord      = clamp(z_coord, (int8)0, (int8)SRC_DIM_2 - 1);                 // Clamp z coordinate
+
+    // Load the input tile
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    in_row0;
+    in_row0.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord.s0 * (int)src_stride_z);
+    in_row0.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord.s1 * (int)src_stride_z);
+    in_row0.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord.s2 * (int)src_stride_z);
+    in_row0.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord.s3 * (int)src_stride_z);
+    in_row0.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord.s4 * (int)src_stride_z);
+    in_row0.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord.s5 * (int)src_stride_z);
+    in_row0.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord.s6 * (int)src_stride_z);
+    in_row0.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord.s7 * (int)src_stride_z);
+
+    // Calculate common factors for intermediate tensor
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    tmp0 = ((VEC_DATA_TYPE(DATA_TYPE, 8)) - 36.0f) * in_row0;
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    comm_fact0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;
+
+    OUTPUT_ROW_2x2_7x7(out0, tmp0, comm_fact0);
+#else                                            // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    in_row0, in_row1, in_row2, in_row3, in_row4, in_row5, in_row6, in_row7;
+
+    // Clamp coordinates. This clamp is valid for all rows
+    int8 y_coord = (int8)(y * OUTPUT_TILE_W) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_LEFT;
+    y_coord      = clamp(y_coord, (int8) - 1, (int8)SRC_DIM_1);
+
+    // Row0
+    int  z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 0;
+    int8 valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);                    // If z < 0, set y to -1
+    valid_y      = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2); // If z >= SRC_DIM_2, set y to SRC_DIM_2
+    z_coord      = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);                             // Clamp z coordinate
+
+    // Load the input tile
+    in_row0.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row0.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row0.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row0.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row0.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row0.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row0.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row0.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z);
+
+    // Row1
+    z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 1;
+    valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);
+    valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
+    z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+
+    in_row1.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row1.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row1.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row1.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row1.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row1.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row1.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row1.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z);
+
+    // Row2
+    z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 2;
+    valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);
+    valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
+    z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+
+    in_row2.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row2.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row2.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row2.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row2.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row2.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row2.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row2.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z);
+
+    // Row3
+    z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 3;
+    valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);
+    valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
+    z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+
+    in_row3.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row3.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row3.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row3.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row3.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row3.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row3.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row3.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z);
+
+    // Row4
+    z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 4;
+    valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);
+    valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
+    z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+
+    in_row4.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row4.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row4.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row4.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row4.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row4.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row4.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row4.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z);
+
+    // Row5
+    z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 5;
+    valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);
+    valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
+    z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+
+    in_row5.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row5.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row5.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row5.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row5.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row5.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row5.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row5.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z);
+
+    // Row6
+    z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 6;
+    valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);
+    valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
+    z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+
+    in_row6.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row6.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row6.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row6.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row6.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row6.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row6.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row6.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z);
+
+    // Row7
+    z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 7;
+    valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);
+    valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
+    z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+
+    in_row7.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row7.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row7.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row7.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row7.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row7.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row7.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z);
+    in_row7.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z);
+
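+    // Factor out sub-expressions shared by symmetric pairs of B^T rows: each
+    // comm_fact pair below lets tmp1/tmp2 (and tmp3/tmp4, tmp5/tmp6) be formed
+    // with a single add/subtract each instead of a full recomputation.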
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    comm_fact0 = (DATA_TYPE)36.0f * in_row2 - (DATA_TYPE)13.0f * in_row4 + in_row6;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    comm_fact1 = (DATA_TYPE)36.0f * in_row1 - (DATA_TYPE)13.0f * in_row3 + in_row5;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    comm_fact2 = (DATA_TYPE)9.0f * in_row2 - (DATA_TYPE)10.0f * in_row4 + in_row6;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    comm_fact3 = (DATA_TYPE)18.0f * in_row1 - (DATA_TYPE)20.0f * in_row3 + (DATA_TYPE)2.0f * in_row5;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    comm_fact4 = (DATA_TYPE)4.0f * in_row2 - (DATA_TYPE)5.0f * in_row4 + in_row6;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    comm_fact5 = (DATA_TYPE)12.0f * in_row1 - (DATA_TYPE)15.0f * in_row3 + (DATA_TYPE)3.0f * in_row5;
+
+    // Calculate intermediate tensors
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp0 = -(DATA_TYPE)36.0f * in_row0 + (DATA_TYPE)49.0f * in_row2 - (DATA_TYPE)14.0f * in_row4 + in_row6;
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp1 = comm_fact0 - comm_fact1;
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp2 = comm_fact0 + comm_fact1;
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp3 = comm_fact2 - comm_fact3;
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp4 = comm_fact2 + comm_fact3;
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp5 = comm_fact4 - comm_fact5;
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp6 = comm_fact4 + comm_fact5;
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp7 = -(DATA_TYPE)36.0f * in_row1 + (DATA_TYPE)49.0f * in_row3 - (DATA_TYPE)14.0f * in_row5 + in_row7;
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out0, out1, out2, out3, out4, out5, out6, out7;
+
+    OUTPUT_ROW_2x2_7x7(out0, tmp0, comm_fact0);
+    OUTPUT_ROW_2x2_7x7(out1, tmp1, comm_fact0);
+    OUTPUT_ROW_2x2_7x7(out2, tmp2, comm_fact0);
+    OUTPUT_ROW_2x2_7x7(out3, tmp3, comm_fact0);
+    OUTPUT_ROW_2x2_7x7(out4, tmp4, comm_fact0);
+    OUTPUT_ROW_2x2_7x7(out5, tmp5, comm_fact0);
+    OUTPUT_ROW_2x2_7x7(out6, tmp6, comm_fact0);
+    OUTPUT_ROW_2x2_7x7(out7, tmp7, comm_fact0);
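+
+    // out0..out7 now hold the 8x8 transformed tile in row-major order; the store
+    // section below scatters its 64 values across consecutive output channels.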
+
 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
 
     // Store values across the channels
@@ -1981,6 +2363,62 @@
                                                  src_stride_w,
                                                  dst_stride_w);
 }
+
+/** This OpenCL kernel computes the input transform when the kernel size is 7x1 and the output tile is 2x1 for data layout NHWC
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=7).
+ * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM_1 (e.g. -DSRC_DIM_1=112)
+ * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_2x1_7x1_stepz1_nhwc(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
+                                                 src_stride_x,
+                                                 src_step_x,
+                                                 src_stride_y,
+                                                 src_step_y,
+                                                 src_stride_z,
+                                                 src_step_z,
+                                                 src_offset_first_element_in_bytes,
+                                                 dst_ptr,
+                                                 dst_stride_x,
+                                                 dst_step_x,
+                                                 dst_stride_y,
+                                                 dst_step_y,
+                                                 dst_stride_z,
+                                                 dst_step_z,
+                                                 dst_offset_first_element_in_bytes,
+                                                 src_stride_w,
+                                                 dst_stride_w);
+}
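+
+// Note: the 1D 7x1 variant above is a thin wrapper around the 2D 2x2_7x7 kernel;
+// the horizontal transform path is selected at compile time via
+// -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL rather than by a separate kernel body.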
 #endif // defined(NUM_TILES_Y) && defined(SRC_DIM_1) && defined(SRC_DIM_2)
 #endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
 
@@ -2313,6 +2751,62 @@
                                                  src_stride_w,
                                                  dst_stride_w);
 }
+
+/** This OpenCL kernel computes the input transform when the kernel size is 1x7 and the output tile is 1x2 for data layout NHWC
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=7).
+ * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM_1 (e.g. -DSRC_DIM_1=112)
+ * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_1x2_1x7_stepz1_nhwc(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
+                                                 src_stride_x,
+                                                 src_step_x,
+                                                 src_stride_y,
+                                                 src_step_y,
+                                                 src_stride_z,
+                                                 src_step_z,
+                                                 src_offset_first_element_in_bytes,
+                                                 dst_ptr,
+                                                 dst_stride_x,
+                                                 dst_step_x,
+                                                 dst_stride_y,
+                                                 dst_step_y,
+                                                 dst_stride_z,
+                                                 dst_step_z,
+                                                 dst_offset_first_element_in_bytes,
+                                                 src_stride_w,
+                                                 dst_stride_w);
+}
 #endif // defined(SRC_DIM_1) && defined(SRC_DIM_2)
 #endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-#endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
\ No newline at end of file
+#endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
diff --git a/src/core/CL/cl_kernels/winograd_output_transform.cl b/src/core/CL/cl_kernels/winograd_output_transform.cl
index e979978..cffc12d 100644
--- a/src/core/CL/cl_kernels/winograd_output_transform.cl
+++ b/src/core/CL/cl_kernels/winograd_output_transform.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -180,6 +180,240 @@
     vstore2(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 2))(out10, out11), VEC_DATA_TYPE(DATA_TYPE, 2))), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
 #endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
 }
+
+#define COMPUTE_TMP_COL_2x2_7x7(col, d0, d1, d2, d3, d4, d5, d6, d7) \
+    ({                                                               \
+        col.s0 = d0 + d1 + d2 + d3 + d4 + d5 + d6;                   \
+        col.s1 = -d1 + d2 - 2 * d3 + 2 * d4 - 3 * d5 + 3 * d6 + d7;  \
+    })
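+
+// A brief sketch of the math behind the macro above: its two rows are the
+// transposed output transform A^T of Winograd F(2,7), built from the
+// interpolation points {0, -1, 1, -2, 2, -3, 3} and the point at infinity:
+//   A^T = | 1  1  1  1  1  1  1  0 |
+//         | 0 -1  1 -2  2 -3  3  1 |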
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 7x7/7x1 or 1x7 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
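+ * @note Example build options (illustrative values only): -DNUM_TILES_X=16 -DOUTPUT_TILE_W=2 -DOUTPUT_TILE_H=2 -DVEC_SIZE=2 -DDATA_TYPE=float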
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_2x2_7x7_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+    int dst_size)
+{
+    // Each thread stores a 2x2/2x1 or 1x2 tile
+#if defined(SRC_DEPTH)
+    Tensor4D       src             = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
+    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+#else  /* defined(SRC_DEPTH) */
+    Tensor3D       src             = CONVERT_TO_TENSOR3D_STRUCT(src);
+    const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+#endif /* defined(SRC_DEPTH) */
+
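+    // get_global_id(1) enumerates output tiles in row-major order; y_out/z_out
+    // below recover the tile's spatial origin from the tile index and the
+    // compile-time tile size.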
+    int y_in  = get_global_id(1);
+    int x_out = get_global_id(0);
+    int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
+    int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
+#if defined(SRC_DEPTH)
+    int batch = get_global_id(2) / SRC_DEPTH;
+#endif /* defined(SRC_DEPTH) */
+
+#if defined(SRC_DEPTH)
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
+#else  /* defined(SRC_DEPTH) */
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
+#endif /* defined(SRC_DEPTH) */
+
+    // Load the values across the channels to compose the input tile
+    DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+    DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+    DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+    DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
+    DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
+    DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
+    DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
+    DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    // Compute out00 and out01
+    float out00 = d00 + d01 + d02 + d03 + d04 + d05 + d06;
+    float out01 = -d01 + d02 - 2.0f * d03 + 2.0f * d04 - 3.0f * d05 + 3.0f * d06 + d07;
+
+#if defined(HAS_BIAS)
+    // Add bias
+    Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+
+    float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
+
+    out00 += (float)b;
+    out01 += (float)b;
+#endif // defined(HAS_BIAS)
+
+    // Store the output tile
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    // Get output address
+#if defined(SRC_DEPTH)
+    int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
+#else                                                                         /* defined(SRC_DEPTH) */
+    int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
+#endif                                                                        /* defined(SRC_DEPTH) */
+    offset = min(offset + (int2)(0, 1) * (int2)dst_stride_z, (int2)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
+
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out0_dt                                      = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)));
+    *(__global DATA_TYPE *)(dst_ptr + offset.s0) = out0_dt.s0;
+    *(__global DATA_TYPE *)(dst_ptr + offset.s1) = out0_dt.s1;
+#else  // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    // Get output address
+    int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out0_dt                                                      = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)));
+    *(__global DATA_TYPE *)(dst_ptr + 0 * dst_stride_y + offset) = out0_dt.s0;
+    *(__global DATA_TYPE *)(dst_ptr + 1 * dst_stride_y + offset) = out0_dt.s1;
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+    DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
+    DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
+    DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
+    DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
+    DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
+    DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
+    DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
+    DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
+
+    DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
+    DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
+    DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
+    DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
+    DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
+    DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
+    DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
+    DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
+
+    DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
+    DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
+    DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
+    DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
+    DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
+    DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
+    DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
+    DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
+
+    DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
+    DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
+    DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
+    DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
+    DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
+    DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
+    DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
+    DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
+
+    DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
+    DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
+    DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
+    DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
+    DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
+    DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
+    DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
+    DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
+
+    DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
+    DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
+    DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
+    DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
+    DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
+    DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
+    DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
+    DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
+
+    DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
+    DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
+    DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
+    DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
+    DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
+    DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
+    DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
+    DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
+
+    // Compute the 8x2 intermediate tensor
+    VEC_DATA_TYPE(float, 2)
+    tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
+
+    COMPUTE_TMP_COL_2x2_7x7(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70);
+    COMPUTE_TMP_COL_2x2_7x7(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71);
+    COMPUTE_TMP_COL_2x2_7x7(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72);
+    COMPUTE_TMP_COL_2x2_7x7(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73);
+    COMPUTE_TMP_COL_2x2_7x7(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74);
+    COMPUTE_TMP_COL_2x2_7x7(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75);
+    COMPUTE_TMP_COL_2x2_7x7(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76);
+    COMPUTE_TMP_COL_2x2_7x7(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77);
+
+    // Compute the 2x2 output tile
+    VEC_DATA_TYPE(float, 2)
+    out_col0 = tmp_col0 + tmp_col1 + tmp_col2 + tmp_col3 + tmp_col4 + tmp_col5 + tmp_col6;
+    VEC_DATA_TYPE(float, 2)
+    out_col1 = -tmp_col1 + tmp_col2 - 2 * tmp_col3 + 2 * tmp_col4 - 3 * tmp_col5 + 3 * tmp_col6 + tmp_col7;
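+
+    // Applying the same A^T rows along the second dimension completes the two-sided
+    // transform: out_col0 and out_col1 hold the two columns of the 2x2 output tile.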
+
+#if defined(HAS_BIAS)
+    // Add bias
+    Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+
+    float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
+
+    out_col0 += (VEC_DATA_TYPE(float, 2))b;
+    out_col1 += (VEC_DATA_TYPE(float, 2))b;
+
+#endif // defined(HAS_BIAS)
+    // Get output address
+#if defined(SRC_DEPTH)
+    int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
+#else  /* defined(SRC_DEPTH) */
+    int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
+#endif /* defined(SRC_DEPTH) */
+    offset      = min(offset + (int2)(0, 1) * (int2)dst_stride_z, (int2)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
+    int2 mult_y = min((int2)dst_size - offset, (int2)1);                           // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
+
+    // Store the output tile
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    out_col0_dt = ACTIVATION_FUNC(CONVERT(out_col0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    out_col1_dt = ACTIVATION_FUNC(CONVERT(out_col1, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));
+
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * (int)dst_stride_y + offset.s0) = out_col0_dt.s0;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * (int)dst_stride_y + offset.s0) = out_col1_dt.s0;
+
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * (int)dst_stride_y + offset.s1) = out_col0_dt.s1;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * (int)dst_stride_y + offset.s1) = out_col1_dt.s1;
+
+#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+}
 #endif // defined(VEC_SIZE) && VEC_SIZE == 2
 
 #if defined(VEC_SIZE) && VEC_SIZE == 4
@@ -227,8 +461,8 @@
     Tensor4D       src             = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
     const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
 #else  /* defined(SRC_DEPTH) */
-    Tensor3D       src             = CONVERT_TO_TENSOR3D_STRUCT(src);
-    const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+    Tensor3D       src                                                                   = CONVERT_TO_TENSOR3D_STRUCT(src);
+    const __global uchar *src_addr                                                       = tensor3D_offset(&src, 0, 0, 0);
 #endif /* defined(SRC_DEPTH) */
 
     // Load the values across the channels to compose the 6x6 or 6x1 tile
@@ -599,7 +833,7 @@
 #if defined(SRC_DEPTH)
     int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
 #else                                                                               /* defined(SRC_DEPTH) */
-    int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
+    int4       offset                                            = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
 #endif                                                                              /* defined(SRC_DEPTH) */
     offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
 
@@ -1231,6 +1465,72 @@
 #endif // defined(HAS_BIAS)
                                           );
 }
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 7x1 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_2x1_7x1_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+    int dst_size)
+{
+    winograd_output_transform_2x2_7x7_nhwc(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_stride_w,
+                                           dst_step_w,
+                                           dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+                                           bias_ptr,
+                                           bias_stride_x,
+                                           bias_step_x,
+                                           bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+                                           dst_size);
+}
 #endif // defined(VEC_SIZE) && VEC_SIZE == 2
 
 #if defined(VEC_SIZE) && VEC_SIZE == 4
@@ -1573,6 +1873,72 @@
 #endif // defined(HAS_BIAS)
                                           );
 }
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x7 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_1x2_1x7_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+    int dst_size)
+{
+    winograd_output_transform_2x2_7x7_nhwc(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_stride_w,
+                                           dst_step_w,
+                                           dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+                                           bias_ptr,
+                                           bias_stride_x,
+                                           bias_step_x,
+                                           bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+                                           dst_size);
+}
 #endif // defined(VEC_SIZE) && VEC_SIZE == 2
 
 #if defined(VEC_SIZE) && VEC_SIZE == 4
diff --git a/src/core/CL/gemm/CLGEMMHelpers.cpp b/src/core/CL/gemm/CLGEMMHelpers.cpp
new file mode 100644
index 0000000..4597d79
--- /dev/null
+++ b/src/core/CL/gemm/CLGEMMHelpers.cpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0,
+                                                                       bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose)
+{
+    GEMMLHSMatrixInfo lhs_info;
+    GEMMRHSMatrixInfo rhs_info;
+
+    // Configure GEMMLHSMatrixInfo
+    lhs_info.m0         = m0;
+    lhs_info.k0         = k0;
+    lhs_info.v0         = ((m / (lhs_info.m0 * v0)) == 0) ? 1 : v0;
+    lhs_info.interleave = lhs_interleave;
+    lhs_info.transpose  = lhs_transpose;
+
+    // Configure GEMMRHSMatrixInfo
+    rhs_info.n0         = n0;
+    rhs_info.k0         = lhs_info.k0;
+    rhs_info.h0         = ((n / (rhs_info.n0 * h0)) == 0) ? 1 : h0;
+    rhs_info.interleave = rhs_interleave;
+    rhs_info.transpose  = rhs_transpose;
+
+    return std::make_pair(lhs_info, rhs_info);
+}
+} // namespace cl_gemm
+} // namespace arm_compute
\ No newline at end of file
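
The v0/h0 clamping above guards against interleave factors larger than the matrix allows. A minimal usage sketch follows (a hypothetical caller, not part of the patch; it assumes the arm_compute header above and library linkage):

#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"

#include <iostream>

int main()
{
    using namespace arm_compute;

    // m = 8, n = 8 with m0 = n0 = 4 and v0 = h0 = 16:
    // m / (m0 * v0) == 8 / 64 == 0, so both factors clamp to 1.
    const auto info = cl_gemm::configure_lhs_rhs_info(8, 8, 4, 4, 4, 16, 16, true, true, false, true);

    std::cout << "lhs.v0 = " << info.first.v0 << ", rhs.h0 = " << info.second.h0 << "\n"; // 1, 1
    return 0;
}
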
diff --git a/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.cpp b/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.cpp
new file mode 100644
index 0000000..b791c1c
--- /dev/null
+++ b/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.cpp
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
+#include "arm_compute/core/GPUTarget.h"
+
+#include <map>
+#include <utility>
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+CLGEMMReshapedKernelConfigurationBifrost::CLGEMMReshapedKernelConfigurationBifrost(GPUTarget arch)
+    : ICLGEMMKernelConfiguration(arch)
+{
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedKernelConfigurationBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+{
+    ARM_COMPUTE_ERROR_ON(data_type != DataType::F32 && data_type != DataType::QASYMM8);
+    ARM_COMPUTE_UNUSED(data_type);
+
+    using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (CLGEMMReshapedKernelConfigurationBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+
+    // Configurations for Mali-G76
+    static std::map<DataType, ConfigurationFunctionExecutorPtr> gemm_configs_G76 =
+    {
+        { DataType::F32, &CLGEMMReshapedKernelConfigurationBifrost::configure_G76_f32 },
+        { DataType::QASYMM8, &CLGEMMReshapedKernelConfigurationBifrost::configure_G76_u8 }
+    };
+
+    // Configurations for Mali-G7x
+    static std::map<DataType, ConfigurationFunctionExecutorPtr> gemm_configs_G7x =
+    {
+        { DataType::F32, &CLGEMMReshapedKernelConfigurationBifrost::configure_G7x_f32 },
+        { DataType::QASYMM8, &CLGEMMReshapedKernelConfigurationBifrost::configure_G7x_u8 }
+    };
+
+    switch(_target)
+    {
+        case GPUTarget::G76:
+            return (this->*gemm_configs_G76[data_type])(m, n, k, b);
+        default:
+            return (this->*gemm_configs_G7x[data_type])(m, n, k, b);
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedKernelConfigurationBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if(n <= 4)
+    {
+        return configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, true, false, false, true);
+    }
+    else
+    {
+        return configure_lhs_rhs_info(m, n, 5, 4, 4, 2, 16, false, true, false, true);
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedKernelConfigurationBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if(dot8_supported(CLKernelLibrary::get().get_device()))
+    {
+        if(n <= 4)
+        {
+            return configure_lhs_rhs_info(m, n, 4, 2, 16, 2, 2, true, false, false, true);
+        }
+        else
+        {
+            return configure_lhs_rhs_info(m, n, 4, 4, 16, 2, 2, true, false, false, true);
+        }
+    }
+    else
+    {
+        if(n <= 4)
+        {
+            return configure_lhs_rhs_info(m, n, 4, 2, 8, 2, 2, true, false, false, true);
+        }
+        else
+        {
+            return configure_lhs_rhs_info(m, n, 6, 4, 4, 2, 2, true, true, false, true);
+        }
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedKernelConfigurationBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if(n <= 4)
+    {
+        return configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, true, false, false, true);
+    }
+    else
+    {
+        return configure_lhs_rhs_info(m, n, 4, 4, 2, 8, 16, false, false, false, true);
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedKernelConfigurationBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if(n <= 4)
+    {
+        return configure_lhs_rhs_info(m, n, 4, 2, 16, 4, 1, false, false, false, true);
+    }
+    else
+    {
+        return configure_lhs_rhs_info(m, n, 4, 4, 16, 2, 2, false, true, false, true);
+    }
+}
+} // namespace cl_gemm
+} // namespace arm_compute
\ No newline at end of file
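
The configurator dispatches on GPU target and data type through the member-function-pointer maps above. A hypothetical caller (a sketch assuming the same headers; pick_reshaped_blocking is an illustrative name, not library API):

#include "arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h"
#include "arm_compute/core/GPUTarget.h"
#include "arm_compute/core/Types.h"

#include <utility>

using namespace arm_compute;

std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> pick_reshaped_blocking(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
    // G76 takes the tuned Mali-G76 path; any other Bifrost part
    // falls through to the generic G7x configurations.
    cl_gemm::CLGEMMReshapedKernelConfigurationBifrost config(GPUTarget::G76);
    return config.configure(m, n, k, b, DataType::F32);
}
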
diff --git a/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp
new file mode 100644
index 0000000..483bab8
--- /dev/null
+++ b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
+#include "arm_compute/core/GPUTarget.h"
+
+#include <map>
+#include <utility>
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::CLGEMMReshapedOnlyRHSKernelConfigurationBifrost(GPUTarget arch)
+    : ICLGEMMKernelConfiguration(arch)
+{
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+{
+    ARM_COMPUTE_ERROR_ON(data_type != DataType::F32 && data_type != DataType::QASYMM8);
+    ARM_COMPUTE_UNUSED(data_type);
+
+    using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::*)(unsigned int m, unsigned int n, unsigned int k,
+                                             unsigned int b);
+
+    // Configurations for Mali-G76
+    static std::map<DataType, ConfigurationFunctionExecutorPtr> gemm_configs_G76 =
+    {
+        { DataType::F32, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G76_f32 },
+        { DataType::QASYMM8, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G76_u8 }
+    };
+
+    // Configurations for Mali-G7x
+    static std::map<DataType, ConfigurationFunctionExecutorPtr> gemm_configs_G7x =
+    {
+        { DataType::F32, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G7x_f32 },
+        { DataType::QASYMM8, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G7x_u8 }
+    };
+
+    switch(_target)
+    {
+        case GPUTarget::G76:
+            return (this->*gemm_configs_G76[data_type])(m, n, k, b);
+        default:
+            return (this->*gemm_configs_G7x[data_type])(m, n, k, b);
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if(m == 1)
+    {
+        if(n > 2048)
+        {
+            const unsigned int h0 = std::max(n / 4, static_cast<unsigned int>(1));
+            return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, false, true, false, true);
+        }
+        else
+        {
+            const unsigned int h0 = std::max(n / 2, static_cast<unsigned int>(1));
+            return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true);
+        }
+    }
+    else
+    {
+        return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 4, false, true, false, true);
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if(m == 1)
+    {
+        const unsigned int h0 = std::max(n / 2, static_cast<unsigned int>(1));
+        return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true);
+    }
+    else
+    {
+        return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, true, false, true);
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if(dot8_supported(CLKernelLibrary::get().get_device()))
+    {
+        if(m == 1)
+        {
+            const unsigned int h0 = std::max(n / 2, static_cast<unsigned int>(1));
+            return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true);
+        }
+        else
+        {
+            const unsigned int h0 = std::max(n / 4, static_cast<unsigned int>(1));
+            return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, h0, false, true, false, true);
+        }
+    }
+    else
+    {
+        if(m == 1)
+        {
+            const unsigned int h0 = std::max(n / 2, static_cast<unsigned int>(1));
+            return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, h0, false, true, false, true);
+        }
+        else
+        {
+            const unsigned int h0 = std::max(n / 4, static_cast<unsigned int>(1));
+            return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, h0, false, true, false, true);
+        }
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if(m == 1)
+    {
+        const unsigned int h0 = std::max(n / 2, static_cast<unsigned int>(1));
+        return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true);
+    }
+    else
+    {
+        return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, 2, false, true, false, true);
+    }
+}
+} // namespace cl_gemm
+} // namespace arm_compute
\ No newline at end of file
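
For the m == 1 (GEMV-like) branches above, h0 is derived directly from n so the whole RHS row is covered in one pass. The clamp, extracted as a self-contained sketch (pick_h0 is an illustrative name):

#include <algorithm>

// h0 groups of n0 columns each; std::max keeps h0 >= 1 for tiny n.
unsigned int pick_h0(unsigned int n, unsigned int n0)
{
    return std::max(n / n0, 1u);
}

// pick_h0(2048, 2) == 1024; pick_h0(1, 2) == 1.
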
diff --git a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
index 58a8d10..aa06d3a 100644
--- a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -66,7 +66,7 @@
         const int idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
         const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
         ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != (block_shape_x * input->tensor_shape()[idx_width]));
-        ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != (block_shape_x * input->tensor_shape()[idx_height]));
+        ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != (block_shape_y * input->tensor_shape()[idx_height]));
         ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_channel] != input->tensor_shape()[idx_channel]);
         ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
diff --git a/src/core/CL/kernels/CLComparisonKernel.cpp b/src/core/CL/kernels/CLComparisonKernel.cpp
index f5f5a0f..4f44851 100644
--- a/src/core/CL/kernels/CLComparisonKernel.cpp
+++ b/src/core/CL/kernels/CLComparisonKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -219,6 +219,6 @@
 
     const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
     const unsigned int border        = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
-    return BorderSize(0, border, 0, 0);
+    return BorderSize{ 0, border, 0, 0 };
 }
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLConvolutionKernel.cpp b/src/core/CL/kernels/CLConvolutionKernel.cpp
index e677793..d9c7ede 100644
--- a/src/core/CL/kernels/CLConvolutionKernel.cpp
+++ b/src/core/CL/kernels/CLConvolutionKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -39,9 +39,12 @@
 #include <sstream>
 #include <string>
 
-using namespace arm_compute;
-
-#define MAX_MATRIX_SIZE 81
+namespace arm_compute
+{
+namespace
+{
+constexpr unsigned int max_matrix_size = 81;
+} // namespace
 
 /****************************************************************************************\
  *                                 Square Convolution                                *
@@ -138,8 +141,8 @@
     // Set build options
     std::set<std::string> build_opts;
 
-    int16_t mat[matrix_size * matrix_size] = { 0 };
-    memcpy(mat, conv, matrix_size * sizeof(int16_t));
+    std::array<int16_t, matrix_size * matrix_size> mat = { 0 };
+    memcpy(mat.data(), conv, matrix_size * sizeof(int16_t));
 
     for(unsigned int j = 0; j < matrix_size * matrix_size; j++)
     {
@@ -173,7 +176,7 @@
 template <unsigned int matrix_size>
 BorderSize             CLSeparableConvolutionVertKernel<matrix_size>::border_size() const
 {
-    return BorderSize(matrix_size / 2, 0);
+    return BorderSize{ matrix_size / 2, 0 };
 }
 
 template <unsigned int matrix_size>
@@ -190,8 +193,8 @@
 
     std::set<std::string> build_opts;
 
-    int16_t mat[matrix_size * matrix_size] = { 0 };
-    memcpy(mat + matrix_size, conv, matrix_size * sizeof(int16_t));
+    std::array<int16_t, matrix_size * matrix_size> mat = { 0 };
+    memcpy(mat.data() + matrix_size, conv, matrix_size * sizeof(int16_t));
 
     for(unsigned int j = 0; j < matrix_size * matrix_size; j++)
     {
@@ -264,11 +267,11 @@
 
     uint32_t matrix_size = width * height;
 
-    int16_t mat[MAX_MATRIX_SIZE] = { 0 };
+    std::array<int16_t, max_matrix_size> mat = { 0 };
 
-    memcpy(mat, conv, matrix_size * sizeof(int16_t));
+    memcpy(mat.data(), conv, matrix_size * sizeof(int16_t));
 
-    for(unsigned int j = 0; j < MAX_MATRIX_SIZE; j++)
+    for(unsigned int j = 0; j < max_matrix_size; j++)
     {
         options.insert("-DMAT" + support::cpp11::to_string(j) + "=" + support::cpp11::to_string(mat[j]));
     }
@@ -328,3 +331,4 @@
 template class arm_compute::CLSeparableConvolutionHorKernel<5>;
 template class arm_compute::CLSeparableConvolutionHorKernel<7>;
 template class arm_compute::CLSeparableConvolutionHorKernel<9>;
+} // namespace arm_compute
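
The hunks above replace fixed-size C arrays with std::array; .data() still yields the raw pointer that memcpy and the -DMATj build-option loop expect. The pattern in isolation (a sketch under those assumptions, not the kernel code itself):

#include <array>
#include <cstdint>
#include <cstring>

template <unsigned int matrix_size>
void load_coefficients(const int16_t *conv)
{
    // Zero-initialised, with the size carried in the type.
    std::array<int16_t, matrix_size * matrix_size> mat = { 0 };
    // Copy only the first row, as the horizontal pass does above;
    // the remaining entries stay zero.
    std::memcpy(mat.data(), conv, matrix_size * sizeof(int16_t));
}
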
diff --git a/src/core/CL/kernels/CLCopyKernel.cpp b/src/core/CL/kernels/CLCopyKernel.cpp
index e14e5da..c87768a 100644
--- a/src/core/CL/kernels/CLCopyKernel.cpp
+++ b/src/core/CL/kernels/CLCopyKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,38 +37,57 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding = PaddingList())
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding = PaddingList(), Window *output_window = nullptr)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(!padding.empty() && output_window != nullptr);
     ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 4);
 
     // Validate output if initialized
     if(output->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding), output->tensor_shape());
+        if(output_window == nullptr)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding), output->tensor_shape());
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output_window->shape());
+        }
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
 
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, Window *output_window)
 {
     // Output auto-initialization if not yet initialized
     auto_init_if_empty(*output, *input);
 
     // Configure window
-    const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
+    const unsigned int vec_size_x = 16 / input->element_size();
 
-    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    if(output_window == nullptr)
+    {
+        // Create and update the window (if needed)
+        Window win = calculate_max_window(*input, Steps(vec_size_x));
 
-    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+        AccessWindowHorizontal input_access(input, 0, vec_size_x);
+        AccessWindowHorizontal output_access(output, 0, vec_size_x);
 
-    bool window_changed = update_window_and_padding(win, input_access, output_access);
+        bool window_changed = update_window_and_padding(win, input_access, output_access);
 
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
+        Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+        return std::make_pair(err, win);
+    }
+    else
+    {
+        Window win = calculate_max_window(*input);
+        return std::make_pair(Status{}, win);
+    }
 }
 
 std::pair<Status, Window> validate_and_configure_window_with_padding(ITensorInfo *input, ITensorInfo *output, const PaddingList &padding)
@@ -131,14 +150,14 @@
 } // namespace
 
 CLCopyKernel::CLCopyKernel()
-    : _input(nullptr), _output(nullptr)
+    : _input(nullptr), _output(nullptr), _output_window(), _has_output_window(false)
 {
 }
 
-void CLCopyKernel::configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding)
+void CLCopyKernel::configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding, Window *output_window)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), padding));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), padding, output_window));
 
     _input  = input;
     _output = output;
@@ -147,21 +166,44 @@
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
 
-    const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
-    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-
     std::pair<Status, Window> win_config;
 
+    const unsigned int vec_size_x = 16 / input->info()->element_size();
+
     if(padding.empty())
     {
+        // Configure window
+        win_config = validate_and_configure_window(input->info(), output->info(), output_window);
+
+        if(output_window != nullptr)
+        {
+            _has_output_window        = true;
+            _output_window            = Window(*output_window);
+            const int  width_x        = output_window->num_iterations(0);
+            const bool multi_access_x = width_x >= static_cast<int32_t>(vec_size_x);
+            const bool remainder_x    = width_x % vec_size_x > 0;
+
+            if(multi_access_x)
+            {
+                _output_window.set(Window::DimX, Window::Dimension(output_window->x().start(), ceil_to_multiple(output_window->x().end(), vec_size_x), vec_size_x));
+                win_config.second.set(Window::DimX, Window::Dimension(win_config.second.x().start(), ceil_to_multiple(win_config.second.x().end(), vec_size_x), vec_size_x));
+            }
+
+            build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+            build_opts.add_option_if(multi_access_x && remainder_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(width_x - vec_size_x, 0)));
+        }
+        else
+        {
+            build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+        }
+
         // Build kernel
         _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("copy_tensor", build_opts.options()));
-
-        // Configure window
-        win_config = validate_and_configure_window(input->info(), output->info());
     }
     else
     {
+        build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+
         // Add compile time options
         add_padding_as_build_options(padding, build_opts);
 
@@ -185,13 +227,13 @@
     ICLKernel::configure_internal(win_config.second);
 }
 
-Status CLCopyKernel::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output, const PaddingList &padding)
+Status CLCopyKernel::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output, const PaddingList &padding, Window *output_window)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, padding));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, padding, output_window));
 
     if(padding.empty())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+        ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), output_window).first);
     }
     else
     {
@@ -206,16 +248,33 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-    Window slice     = collapsed.first_slice_window_3D();
+    Window slice;
 
-    do
+    if(_has_output_window)
     {
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, slice);
-        add_3D_tensor_argument(idx, _output, slice);
-        enqueue(queue, *this, slice);
+        slice            = window.first_slice_window_3D();
+        Window out_slice = _output_window.first_slice_window_3D();
+        do
+        {
+            unsigned int idx = 0;
+            add_3D_tensor_argument(idx, _input, slice);
+            add_3D_tensor_argument(idx, _output, out_slice);
+            enqueue(queue, *this, slice);
+        }
+        while(window.slide_window_slice_3D(slice) && _output_window.slide_window_slice_3D(out_slice));
     }
-    while(collapsed.slide_window_slice_3D(slice));
+    else
+    {
+        Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+        slice            = collapsed.first_slice_window_3D();
+        do
+        {
+            unsigned int idx = 0;
+            add_3D_tensor_argument(idx, _input, slice);
+            add_3D_tensor_argument(idx, _output, slice);
+            enqueue(queue, *this, slice);
+        }
+        while(collapsed.slide_window_slice_3D(slice));
+    }
 }
 } // namespace arm_compute
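
The output-window path above sizes its X dimension with the multi_access_x / remainder_x / LAST_ACCESSED_X scheme: when the row is not a multiple of the vector size, the final vector load is shifted left so it stays in bounds instead of requiring right padding. The bookkeeping, extracted as a standalone sketch (XAccessPlan and plan_x_access are illustrative names):

#include <algorithm>

struct XAccessPlan
{
    bool multi_access_x;  // at least one full vector fits in the row
    bool remainder_x;     // row width is not a multiple of the vector size
    int  last_accessed_x; // start of the final, possibly overlapping, vector
};

XAccessPlan plan_x_access(int width_x, int vec_size_x)
{
    XAccessPlan p{};
    p.multi_access_x  = width_x >= vec_size_x;
    p.remainder_x     = width_x % vec_size_x > 0;
    p.last_accessed_x = std::max(width_x - vec_size_x, 0);
    return p;
}

// width_x = 10, vec_size_x = 4: full vectors start at x = 0 and 4; the
// remainder vector re-reads from x = 6 (LAST_ACCESSED_X) instead of 8.
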
diff --git a/src/core/CL/kernels/CLCropKernel.cpp b/src/core/CL/kernels/CLCropKernel.cpp
new file mode 100644
index 0000000..f8a2456
--- /dev/null
+++ b/src/core/CL/kernels/CLCropKernel.cpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLCropKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Window.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/helpers/bit_ops.h"
+#include "arm_compute/core/utils/helpers/tensor_transform.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include <map>
+
+namespace arm_compute
+{
+CLCropKernel::CLCropKernel()
+    : _input(nullptr), _output(nullptr), _start(), _batch_index(0), _extrapolation_value(0)
+{
+}
+
+void CLCropKernel::configure(const ICLTensor *input, ICLTensor *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *output_window)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), start, end, batch_index, extrapolation_value, output_window));
+
+    _input               = input;
+    _output              = output;
+    _start               = start;
+    _batch_index         = batch_index;
+    _extrapolation_value = extrapolation_value;
+
+    const int vec_size_x = 4;
+    // Create and update the window (if needed)
+    Window win = calculate_max_window(*output->info());
+
+    if(output_window != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(win, *output_window);
+        win = *output_window;
+    }
+
+    const int  output_width_x = win.num_iterations(0);
+    const bool multi_access_x = output_width_x >= vec_size_x;
+    const bool remainder_x    = output_width_x % vec_size_x > 0;
+
+    if(multi_access_x)
+    {
+        win.set(Window::DimX,
+                Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
+    }
+    ICLKernel::configure_internal(win);
+
+    // Create kernel
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+    build_opts.add_option_if(multi_access_x && remainder_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
+    build_opts.add_option_if(start.x > end.x, "-DWIDTH_FLIPPED=");
+    build_opts.add_option_if(start.y > end.y, "-DHEIGHT_FLIPPED=");
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("crop_tensor", build_opts.options()));
+}
+
+Status CLCropKernel::validate(const ITensorInfo *input, const ITensorInfo *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *output_window)
+{
+    ARM_COMPUTE_UNUSED(extrapolation_value, output_window);
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::S16, DataType::F16, DataType::U32, DataType::S32, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().num_dimensions() > 4);
+    ARM_COMPUTE_RETURN_ERROR_ON(start.x < 0 || start.y < 0 || end.x < 0 || end.y < 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(start.x >= static_cast<int32_t>(input->dimension(1)) || start.y >= static_cast<int32_t>(input->dimension(2))
+                                || end.x >= static_cast<int32_t>(input->dimension(1)) || end.y >= static_cast<int32_t>(input->dimension(2)));
+    ARM_COMPUTE_RETURN_ERROR_ON(batch_index >= input->dimension(3));
+    if(output_window != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(output_window->x().step() != 1);
+    }
+    if(output->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 3);
+    }
+    return Status{};
+}
+
+void CLCropKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window in_slice = Window();
+    in_slice.use_tensor_dimensions(_input->info()->tensor_shape());
+    in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), ceil_to_multiple(in_slice.x().end(), window.x().step()), window.x().step()));
+    in_slice.set(3, Window::Dimension(_batch_index, _batch_index + 1, 1));
+
+    unsigned int idx = 0;
+    add_3D_tensor_argument(idx, _input, in_slice);
+    add_3D_tensor_argument(idx, _output, window);
+    add_argument(idx, _start.x);
+    add_argument(idx, _start.y);
+    enqueue(queue, *this, window);
+}
+} // namespace arm_compute
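
A hypothetical configure call for the new crop kernel (a sketch; the Coordinates2D brace literals and the explicit trailing arguments are illustrative and assume Coordinates2D is the plain x/y struct from Types.h):

#include "arm_compute/core/CL/kernels/CLCropKernel.h"

using namespace arm_compute;

void configure_crop(const ICLTensor *in, ICLTensor *out, uint32_t batch_index)
{
    CLCropKernel crop;
    // Corners (4, 4) .. (63, 63); swapping a pair of coordinates flips
    // that axis via the -DWIDTH_FLIPPED / -DHEIGHT_FLIPPED build options.
    crop.configure(in, out, Coordinates2D{ 4, 4 }, Coordinates2D{ 63, 63 },
                   batch_index, /* extrapolation_value */ 0.f, /* output_window */ nullptr);
}
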
diff --git a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp
new file mode 100644
index 0000000..71218f5
--- /dev/null
+++ b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info,
+                          const PadStrideInfo &deconv_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, input_info, weights_info);
+    const DataLayout   data_layout = input_info->data_layout();
+    const unsigned int stride_x    = deconv_info.stride().first;
+    const unsigned int stride_y    = deconv_info.stride().second;
+
+    const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const size_t idx_b = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+    const bool is_qasymm = is_data_type_quantized_asymmetric(input_info->data_type());
+
+    ARM_COMPUTE_RETURN_ERROR_ON(weights_info->dimension(idx_w) != deconv_info.stride().first);
+    ARM_COMPUTE_RETURN_ERROR_ON(weights_info->dimension(idx_h) != deconv_info.stride().second);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::S32);
+    if(!is_qasymm)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_info, weights_info);
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_info->dimension(idx_w) * weights_info->dimension(idx_h) * weights_info->dimension(idx_b));
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != input_info->dimension(idx_w));
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != input_info->dimension(idx_h));
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(3) != input_info->dimension(idx_b));
+
+    if(bias != nullptr)
+    {
+        if(is_qasymm)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(bias, input);
+        }
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights_info->dimension(idx_b));
+    }
+
+    if(output->total_size() != 0)
+    {
+        auto out_dims = deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h), weights_info->dimension(idx_w), weights_info->dimension(idx_h),
+                                                        0, 0, stride_x, stride_y);
+
+        const TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info);
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+    }
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input, ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    const DataLayout data_layout = input_info->data_layout();
+
+    const unsigned int stride_x = deconv_info.stride().first;
+    const unsigned int stride_y = deconv_info.stride().second;
+    const size_t       idx_w    = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const size_t       idx_h    = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+    auto out_dims = deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h), weights_info->dimension(idx_w), weights_info->dimension(idx_h),
+                                                    0, 0, stride_x, stride_y);
+
+    const TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info);
+
+    auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout).set_quantization_info(input->quantization_info()));
+
+    Window win = calculate_max_window(*input);
+
+    return std::make_pair(Status{}, win);
+}
+} // namespace
+
+CLDeconvolutionReshapeOutputKernel::CLDeconvolutionReshapeOutputKernel()
+    : _add_bias(false),
+      _bias(nullptr)
+{
+}
+
+void CLDeconvolutionReshapeOutputKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info,
+                                                   const PadStrideInfo &deconv_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, input_info, weights_info);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), input_info, weights_info, deconv_info));
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), output->info(), input_info, weights_info, deconv_info);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
+    const DataLayout data_layout = input_info->data_layout();
+    const size_t     idx_w       = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const size_t     idx_h       = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const size_t     idx_b       = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+    _input    = input;
+    _output   = output;
+    _add_bias = (bias != nullptr);
+    _bias     = bias;
+
+    const int filter_w = weights_info->dimension(idx_w);
+    const int filter_h = weights_info->dimension(idx_h);
+    const int filter_b = weights_info->dimension(idx_b);
+    const int img_w    = input_info->dimension(idx_w);
+    const int img_h    = input_info->dimension(idx_h);
+
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DFILTER_WIDTH=" + support::cpp11::to_string(filter_w));
+    build_opts.add_option("-DFILTER_HEIGHT=" + support::cpp11::to_string(filter_h));
+    build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(img_w));
+    build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(img_h));
+    build_opts.add_option_if(data_layout == DataLayout::NCHW, "-DNUM_FILTERS=" + support::cpp11::to_string(filter_b));
+    build_opts.add_option_if(_add_bias, "-DADD_BIAS");
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("deconvolution_reshape", build_opts.options()));
+    ICLKernel::configure_internal(win_config.second);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = "deconvolution_reshape_output_";
+    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_layout(input->info()->data_layout()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(1));
+}
+
+Status CLDeconvolutionReshapeOutputKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info,
+                                                    const PadStrideInfo &deconv_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, input_info, weights_info, deconv_info));
+    return Status{};
+}
+
+void CLDeconvolutionReshapeOutputKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+
+    unsigned int idx = 0;
+    add_3D_tensor_argument(idx, _input, collapsed);
+    add_3D_tensor_argument(idx, _output, collapsed);
+    if(_add_bias)
+    {
+        add_1D_tensor_argument(idx, _bias, collapsed);
+    }
+    enqueue(queue, *this, collapsed, lws_hint());
+}
+} // namespace arm_compute
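
The _config_id assembled above is a tuning key: launches that build the same key can look up the same tuned local work size. Its shape, as a standalone sketch (make_config_id is an illustrative helper, not library API):

#include <string>

// Mirrors the concatenation above: kernel name, data type, layout and
// the leading input/output extents, joined with underscores.
std::string make_config_id(const std::string &dtype, const std::string &layout,
                           unsigned int in_w, unsigned int in_h,
                           unsigned int out_w, unsigned int out_h)
{
    return "deconvolution_reshape_output_" + dtype + "_" + layout + "_" +
           std::to_string(in_w) + "_" + std::to_string(in_h) + "_" +
           std::to_string(out_w) + "_" + std::to_string(out_h);
}
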
diff --git a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
index 3fccc04..1cae371 100644
--- a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -47,19 +47,13 @@
 {
     ARM_COMPUTE_UNUSED(depth_offset);
 
-    // Configure kernel window
-    const int left_right = (output->dimension(0) - input->dimension(0)) / 2;
-    const int top_bottom = (output->dimension(1) - input->dimension(1)) / 2;
-
     const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
-    const unsigned int num_elems_read_per_iteration      = 16 / input->element_size();
-    const unsigned int num_rows_read_per_iteration       = 1;
 
     // The window is computed on the output, but its Z extent is clamped to the input depth, as we copy every depth slice of the input
     Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
     win.set(Window::DimZ, Window::Dimension(0, input->tensor_shape().z(), 1));
 
-    AccessWindowRectangle  input_access(input, -left_right, -top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration);
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
     AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
     bool                   window_changed = update_window_and_padding(win, input_access, output_access);
     output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
@@ -74,30 +68,20 @@
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
 
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimX) != output->dimension(Window::DimX));
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimY) != output->dimension(Window::DimY));
     ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) + depth_offset > output->dimension(2));
-    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) > output->dimension(0));
-    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) > output->dimension(1));
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(3, input, output);
 
-    // The gaps between the two lowest dimensions of input and output need to be divisible by 2
-    // Otherwise it is not clear how the padding should be added onto the input tensor
-    ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) - input->dimension(0)) % 2);
-    ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(1) - input->dimension(1)) % 2);
-
     return Status{};
 }
 } // namespace
 
 CLDepthConcatenateLayerKernel::CLDepthConcatenateLayerKernel()
-    : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0), _depth_offset(0)
+    : _input(nullptr), _output(nullptr), _depth_offset(0)
 {
 }
 
-BorderSize CLDepthConcatenateLayerKernel::border_size() const
-{
-    return BorderSize(_top_bottom, _left_right);
-}
-
 void CLDepthConcatenateLayerKernel::configure(const ICLTensor *input, unsigned int depth_offset, ICLTensor *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -125,10 +109,6 @@
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_depth", build_opts.options()));
 
     // Configure kernel window
-    _left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2;
-    _top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2;
-
-    // Configure kernel window
     auto win_config = validate_and_configure_window(input->info(), depth_offset, output->info());
     ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
 
@@ -153,16 +133,8 @@
 
     const int offset_to_first_elements_in_bytes = _depth_offset * _output->info()->strides_in_bytes()[2];
 
-    unsigned int  idx = 2 * num_arguments_per_3D_tensor(); // Skip the input and output parameters
-    const cl_int3 offsets =
-    {
-        {
-            static_cast<cl_int>(_left_right),
-            static_cast<cl_int>(_top_bottom),
-            static_cast<cl_int>(offset_to_first_elements_in_bytes),
-        }
-    };
-    _kernel.setArg<cl_int3>(idx, offsets);
+    unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the input and output parameters
+    _kernel.setArg<cl_int>(idx, offset_to_first_elements_in_bytes);
 
     do
     {
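
With the padding-offset machinery removed, the only per-run kernel argument beyond the two tensors is the byte offset of the first depth slice inside the concatenated output. That computation in isolation (a sketch; the raw strides pointer stands in for the tensor's Strides object):

#include <cstddef>
#include <cstdint>

// strides_in_bytes[2] is the z (depth) stride of the output tensor.
inline int32_t depth_concat_offset_bytes(unsigned int depth_offset, const std::size_t *strides_in_bytes)
{
    return static_cast<int32_t>(depth_offset * strides_in_bytes[2]);
}
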
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
index eb561fa..02d8c6d 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,19 +43,21 @@
 namespace
 {
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
-                          const ActivationLayerInfo &act_info)
+                          const ActivationLayerInfo &act_info, const Size2D dilation)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(act_info.enabled() && ((input->data_type() != DataType::QASYMM8) || ((act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
-                                                                                                         && (act_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
-                                                                                                         && (act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU)
-                                                                                                         && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC))),
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.enabled()) && (input->data_type() == DataType::QASYMM8) && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+                                    && (act_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+                                    && (act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU)
+                                    && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC),
                                     "For QASYMM8 only logistic, relu, lower bounded relu and lower-upper bounded relu are supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
     ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != 3 || weights->dimension(1) != 3);
     ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().first < 1 || conv_info.stride().first > 3);
 
+    ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
+
     const bool is_qasymm = is_data_type_quantized_asymmetric(input->data_type());
 
     if(biases != nullptr)
@@ -74,7 +76,7 @@
 
     if(output->total_size() != 0)
     {
-        const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+        const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
     }
 
@@ -82,10 +84,10 @@
 }
 
 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
-                                                        GPUTarget gpu_target, std::string &kernel_name)
+                                                        GPUTarget gpu_target, std::string &kernel_name, const Size2D dilation)
 {
     // Output auto-initialization if not yet initialized
-    const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+    const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
     auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
 
     const unsigned int conv_stride_x = conv_info.stride().first;
@@ -171,12 +173,17 @@
     {
         const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
 
-        kernel_name                       = is_qasymm ? (std::string("depthwise_convolution_3x3_quantized") + (is_dot8_supported ? "_dot8" : "") + "_nchw") : "depthwise_convolution_3x3";
+        kernel_name = is_qasymm ? "dwc_3x3_native_qasymm8" : "depthwise_convolution_3x3";
+        kernel_name += (is_qasymm && is_dot8_supported ? "_dot8" : "");
+        kernel_name += (is_qasymm ? "_nchw" : "");
+
         num_elems_written_per_iteration_x = 8 / data_size_from_type(input->data_type());
-        num_elems_written_per_iteration_y = (is_qasymm && conv_stride_y == 1) ? 2 : 1;
+        num_elems_written_per_iteration_y = (is_qasymm && conv_stride_y == 1 && dilation.y() == 1) ? 2 : 1;
         num_elems_read_per_iteration_x    = 3 + (num_elems_written_per_iteration_x - 1) * conv_stride_x;
         num_elems_read_per_iteration_y    = num_elems_written_per_iteration_y + 2;
     }
+    num_elems_read_per_iteration_x += (num_elems_read_per_iteration_x - 1) * (dilation.x() - 1);
+    num_elems_read_per_iteration_y += (num_elems_read_per_iteration_y - 1) * (dilation.y() - 1);
 
     // Create window and update padding
     Window win = calculate_max_window(*output, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
@@ -207,10 +214,10 @@
 }
 
 void CLDepthwiseConvolutionLayer3x3NCHWKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
-                                                         unsigned int depth_multiplier, ActivationLayerInfo act_info)
+                                                         unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info, dilation));
 
     bool is_qasymm = is_data_type_quantized_asymmetric(input->info()->data_type());
 
@@ -228,15 +235,18 @@
     std::string     kernel_name;
     const GPUTarget gpu_target = get_target();
 
-    auto win_config = validate_and_configure_window(input->info(), weights->info(), output->info(), conv_info, depth_multiplier, gpu_target, kernel_name);
+    auto win_config = validate_and_configure_window(input->info(), weights->info(), output->info(), conv_info, depth_multiplier, gpu_target, kernel_name, dilation);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure_internal(win_config.second);
 
     // Set build options
     CLBuildOptions build_opts;
+    build_opts.add_option_if(act_info.enabled(), "-DFUSED_ACTIVATION=" + lower_string(string_from_activation_func(act_info.activation())));
     build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(_output->info()->tensor_shape().z()));
     build_opts.add_option("-DDEPTH_MULTIPLIER=" + support::cpp11::to_string(depth_multiplier));
     build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(_conv_stride_x));
+    build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
+    build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
     build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS");
 
     if(is_qasymm)
@@ -256,31 +266,31 @@
 
         if(act_info.enabled())
         {
-            const int a_val = input->info()->quantization_info().quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
-            const int b_val = input->info()->quantization_info().quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
-            const int o1    = input->info()->quantization_info().offset;
+            const int a_val = output->info()->quantization_info().quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
+            const int b_val = output->info()->quantization_info().quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
+            const int o1    = output->info()->quantization_info().offset;
 
-            build_opts.add_option("-DFUSED_ACTIVATION=" + lower_string(string_from_activation_func(act_info.activation())));
             build_opts.add_option("-DA_VAL=" + support::cpp11::to_string(a_val));
             build_opts.add_option("-DB_VAL=" + support::cpp11::to_string(b_val));
             build_opts.add_option("-DCONST_0=" + support::cpp11::to_string(o1));
 
-            if(output != nullptr)
-            {
-                const float s1 = input->info()->quantization_info().scale;
-                const float s2 = output->info()->quantization_info().scale;
-                const int   o2 = output->info()->quantization_info().offset;
-
-                build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1));
-                build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1));
-                if(o1 != o2 || s1 != s2)
-                {
-                    build_opts.add_option("-DS2_VAL=" + float_to_string_with_full_precision(s2));
-                    build_opts.add_option("-DO2_VAL=" + support::cpp11::to_string(o2));
-                }
-            }
+            const float s1 = input->info()->quantization_info().scale;
+            build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1));
+            build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1));
         }
     }
+    else
+    {
+        build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
+        build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
+        build_opts.add_option_if(act_info.enabled(), "-DSELECT_DATA_TYPE=" + get_cl_select_type_from_data_type(input->info()->data_type()));
+        build_opts.add_option_if(act_info.enabled(), "-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+        build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(win_config.second.x().step()));
+    }
+
+    build_opts.add_option_if(input->info()->data_type() == DataType::F16, "-DIS_F16");
+    build_opts.add_option_if(input->info()->data_type() == DataType::F32, "-DIS_F32");
+
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
     // Set config_id for enabling LWS tuning
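
Note that the activation clamp values are now quantized against the output tensor's quantization info rather than the input's, which matters whenever the two differ. A sketch of the quantize step behind A_VAL/B_VAL, using std::lround as a stand-in for RoundingPolicy::TO_NEAREST_UP and hypothetical scale/offset values:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    // Asymmetric 8-bit quantization: q = clamp(round(x / scale) + offset, 0, 255).
    int quantize(float value, float scale, int offset)
    {
        const int q = static_cast<int>(std::lround(value / scale)) + offset;
        return std::min(std::max(q, 0), 255);
    }

    int main()
    {
        // Hypothetical output quantization info and a bounded ReLU clamped to [0, 6].
        const float scale  = 0.05f;
        const int   offset = 10;
        std::printf("A_VAL=%d B_VAL=%d CONST_0=%d\n",
                    quantize(6.0f, scale, offset), quantize(0.0f, scale, offset), offset);
        return 0;
    }
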
@@ -300,12 +310,11 @@
 }
 
 Status CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                                          unsigned int        depth_multiplier,
-                                                          ActivationLayerInfo act_info, GPUTarget gpu_target)
+                                                          unsigned int depth_multiplier, ActivationLayerInfo act_info, GPUTarget gpu_target, const Size2D &dilation)
 {
     std::string kernel_name;
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, act_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), output->clone().get(), conv_info, depth_multiplier, gpu_target, kernel_name).first);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), output->clone().get(), conv_info, depth_multiplier, gpu_target, kernel_name, dilation).first);
 
     return Status{};
 }
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
index 5e5a35c..c31825c 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
@@ -42,21 +42,23 @@
 namespace
 {
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
-                          const ActivationLayerInfo &act_info)
+                          const ActivationLayerInfo &act_info, const Size2D &dilation)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::QASYMM8);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.enabled()) && ((input->data_type() != DataType::QASYMM8) || ((act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
-                                                                                                           && (act_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
-                                                                                                           && (act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU)
-                                                                                                           && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC))),
-                                    "For QASYMM8 only logistic, relu, lower bounded relu and lower-upper bounded relu are supported"); //COMPMID-1317 add fused activation for F32
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.enabled()) && (input->data_type() == DataType::QASYMM8) && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+                                    && (act_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+                                    && (act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU)
+                                    && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC),
+                                    "For QASYMM8 only logistic, relu, lower bounded relu and lower-upper bounded relu are supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
     ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier > 1); // COMPMID-1071 Add depth multiplier support for NHWC
 
     ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().first < 1);
     ARM_COMPUTE_RETURN_ERROR_ON(std::max(conv_info.pad_top(), conv_info.pad_bottom()) > 1);
 
+    ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
+
     const bool   is_qasymm      = is_data_type_quantized_asymmetric(input->data_type());
     const size_t weights_width  = 3;
     const size_t weights_height = 3;
@@ -89,7 +91,8 @@
 
     if(output->total_size() != 0)
     {
-        const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, weights_width, weights_height, conv_info, depth_multiplier);
+        const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(
+                                             *input, TensorInfo(TensorShape(weights_width, weights_height), 1, weights->data_type()).set_data_layout(DataLayout::NCHW), conv_info, depth_multiplier, dilation);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
     }
 
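
The shape check above now routes through compute_depthwise_convolution_shape with the dilation forwarded. Assuming the conventional dilated-convolution formula, the spatial output size along one dimension can be sketched as:

    #include <cstdio>

    // Dilated convolution output size along one dimension (floor division),
    // following the conventional formula for a kernel of size k.
    unsigned int conv_out_dim(unsigned int in, unsigned int k, unsigned int pad_l, unsigned int pad_r,
                              unsigned int stride, unsigned int dilation)
    {
        const unsigned int effective_k = dilation * (k - 1) + 1; // dilated kernel footprint
        return (in + pad_l + pad_r - effective_k) / stride + 1;
    }

    int main()
    {
        // Hypothetical example: 16-wide input, 3-tap kernel, pad 1, stride 1, dilation 2.
        std::printf("out=%u\n", conv_out_dim(16, 3, 1, 1, 1, 2)); // (16 + 2 - 5) / 1 + 1 = 14
        return 0;
    }
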
@@ -97,13 +100,14 @@
 }
 
 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *output,
-                                                        const PadStrideInfo &conv_info, unsigned int depth_multiplier)
+                                                        const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation)
 {
     const size_t weights_width  = 3;
     const size_t weights_height = 3;
 
     // Get convolved dimensions
-    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, weights_width, weights_height, conv_info, depth_multiplier);
+    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(
+                                         *input, TensorInfo(TensorShape(weights_width, weights_height), 1, weights->data_type()).set_data_layout(DataLayout::NCHW), conv_info, depth_multiplier, dilation);
 
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*output,
@@ -112,10 +116,10 @@
                        input->data_type(),
                        input->quantization_info());
 
-    const bool is_qasymm   = is_data_type_quantized_asymmetric(input->data_type());
-    const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+    const bool is_qasymm              = is_data_type_quantized_asymmetric(input->data_type());
+    const bool is_stride_1_dilation_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1) && dilation.x() == 1 && dilation.y() == 1);
 
-    const unsigned int num_rows_processed_per_iteration = is_stride_1 ? 2 : 1;
+    const unsigned int num_rows_processed_per_iteration = is_stride_1_dilation_1 ? 2 : 1;
     const unsigned int num_elems_accessed_per_iteration = is_qasymm ? 4 : (8 / input->element_size());
     const unsigned int num_rows_read_per_iteration      = num_rows_processed_per_iteration + 2;
     const unsigned int num_rows_written_per_iteration   = std::ceil(num_rows_processed_per_iteration / static_cast<float>(conv_info.stride().first));
@@ -138,7 +142,7 @@
     }
     else
     {
-        AccessWindowStatic    weights_access(weights, 0, 0, ceil_to_multiple(weights->dimension(0), num_elems_accessed_per_iteration), weights->dimension(1));
+        AccessWindowStatic weights_access(weights, 0, 0, ceil_to_multiple(weights->dimension(0), num_elems_accessed_per_iteration), weights->dimension(1));
         window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
     }
 
@@ -166,15 +170,17 @@
 }
 
 void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
-                                                         unsigned int depth_multiplier, ActivationLayerInfo act_info)
+                                                         unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info));
-    auto win_config = validate_and_configure_window(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info, dilation));
+    auto win_config = validate_and_configure_window(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, dilation);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
 
-    const bool is_qasymm         = is_data_type_quantized_asymmetric(input->info()->data_type());
-    const bool is_stride_1       = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+    const bool is_qasymm              = is_data_type_quantized_asymmetric(input->info()->data_type());
+    const bool is_stride_1            = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+    const bool is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1);
+
     const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
 
     _input                              = input;
@@ -182,8 +188,8 @@
     _weights                            = weights;
     _biases                             = biases;
     _conv_stride_y                      = conv_info.stride().second;
-    _num_rows_processed_per_iteration   = is_stride_1 ? 2 : 1;
-    _num_planes_processed_per_iteration = is_stride_1 ? 2 : 1;
+    _num_rows_processed_per_iteration   = is_stride_1_dilation_1 ? 2 : 1;
+    _num_planes_processed_per_iteration = is_stride_1_dilation_1 ? 2 : 1;
 
     // If QASYMM8 and the 8 bit dot product is available, force _num_planes_processed_per_iteration to 1
     if(is_dot8_supported && is_qasymm)
@@ -196,11 +202,14 @@
     const unsigned int num_elems_accessed_per_iteration = is_qasymm ? 4 : (8 / input->info()->element_size());
 
     CLBuildOptions build_opts;
+    build_opts.add_option_if(act_info.enabled(), "-DFUSED_ACTIVATION=" + lower_string(string_from_activation_func(act_info.activation())));
     build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS");
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_accessed_per_iteration));
     build_opts.add_option("-DSRC_DIM_2=" + support::cpp11::to_string(_input->info()->dimension(2)));
     build_opts.add_option("-DCONV_PAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
     build_opts.add_option("-DCONV_PAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
+    build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
+    build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
 
     if(is_qasymm)
     {
@@ -219,37 +228,28 @@
 
         if(act_info.enabled())
         {
-            const int a_val = input->info()->quantization_info().quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
-            const int b_val = input->info()->quantization_info().quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
-            const int o1    = input->info()->quantization_info().offset;
+            const int a_val = output->info()->quantization_info().quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
+            const int b_val = output->info()->quantization_info().quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
+            const int o1    = output->info()->quantization_info().offset;
 
-            build_opts.add_option("-DFUSED_ACTIVATION=" + lower_string(string_from_activation_func(act_info.activation())));
             build_opts.add_option("-DA_VAL=" + support::cpp11::to_string(a_val));
             build_opts.add_option("-DB_VAL=" + support::cpp11::to_string(b_val));
             build_opts.add_option("-DCONST_0=" + support::cpp11::to_string(o1));
 
-            if(output != nullptr)
-            {
-                const float s1 = input->info()->quantization_info().scale;
-                const float s2 = output->info()->quantization_info().scale;
-                const int   o2 = output->info()->quantization_info().offset;
-
-                build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1));
-                build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1));
-                if(o1 != o2 || s1 != s2)
-                {
-                    build_opts.add_option("-DS2_VAL=" + float_to_string_with_full_precision(s2));
-                    build_opts.add_option("-DO2_VAL=" + support::cpp11::to_string(o2));
-                }
-            }
+            const float s1 = input->info()->quantization_info().scale;
+            build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1));
+            build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1));
         }
     }
     else
     {
+        build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
+        build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
+        build_opts.add_option_if(act_info.enabled(), "-DSELECT_DATA_TYPE=" + get_cl_select_type_from_data_type(input->info()->data_type()));
         build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(_input->info()->data_type()));
     }
 
-    if(is_stride_1)
+    if(is_stride_1_dilation_1)
     {
         build_opts.add_option("-DNUM_ROWS_PROCESSED=" + support::cpp11::to_string(_num_rows_processed_per_iteration));
         build_opts.add_option("-DNUM_PLANES_PROCESSED=" + support::cpp11::to_string(_num_planes_processed_per_iteration));
@@ -263,9 +263,24 @@
     build_opts.add_option_if(_input->info()->tensor_shape().total_size_upper(3) > 1,
                              "-DDST_DEPTH=" + support::cpp11::to_string(static_cast<int>(std::ceil(_output->info()->dimension(2) / static_cast<float>(_num_planes_processed_per_iteration)))));
 
+    std::string kernel_name;
     // Create kernel
-    std::string kernel_name = std::string("depthwise_convolution_3x3") + (is_qasymm ? std::string("_quantized") + ((is_dot8_supported
-                                                                                                                    && is_stride_1) ? "_dot8" : "") : "") + "_nhwc" + (is_stride_1 ? "_stride1" : "");
+    if(is_qasymm)
+    {
+        kernel_name = std::string("dwc_3x3_reshaped_qasymm8");
+        kernel_name += (is_dot8_supported && is_stride_1_dilation_1 ? "_dot8" : "");
+        kernel_name += (is_stride_1_dilation_1 ? "_stride1" : "");
+        kernel_name += "_nhwc";
+    }
+    else
+    {
+        kernel_name = std::string("depthwise_convolution_3x3_nhwc");
+        kernel_name += (is_stride_1_dilation_1 ? "_stride1" : "");
+    }
+
+    build_opts.add_option_if(input->info()->data_type() == DataType::F16, "-DIS_F16");
+    build_opts.add_option_if(input->info()->data_type() == DataType::F32, "-DIS_F32");
+
     ICLKernel::configure_internal(win_config.second);
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
@@ -286,13 +301,12 @@
 }
 
 Status CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                                          unsigned int        depth_multiplier,
-                                                          ActivationLayerInfo act_info)
+                                                          unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, act_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation));
     ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(),
                                                               biases != nullptr ? biases->clone().get() : nullptr,
-                                                              output->clone().get(), conv_info, depth_multiplier)
+                                                              output->clone().get(), conv_info, depth_multiplier, dilation)
                                 .first);
 
     return Status{};
diff --git a/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp b/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
index 56e9db5..28d4ff2 100644
--- a/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
@@ -33,7 +33,6 @@
 #include "arm_compute/core/Types.h"
 #include "support/ToolchainSupport.h"
 
-#include "arm_compute/runtime/CL/CLScheduler.h"
 #include <tuple>
 
 using namespace arm_compute;
@@ -45,7 +44,8 @@
 
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier,
+                          const Size2D &dilation)
 {
     const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
 
@@ -56,16 +56,18 @@
     ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input->data_type()) && has_bias);
     ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(idx_c) * depth_multiplier) != output->dimension(2));
     ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != (kernel_dims.width * kernel_dims.height + ((has_bias) ? 1 : 0)));
+    ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
 
     return Status{};
 }
 } // namespace
 
-void CLDepthwiseIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier)
+void CLDepthwiseIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier,
+                                        const Size2D &dilation)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, depth_multiplier));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, depth_multiplier, dilation));
 
     _input  = input;
     _output = output;
@@ -89,6 +91,8 @@
     build_opts.add_option("-DKERNEL_WIDTH=" + support::cpp11::to_string(kernel_dims.width));
     build_opts.add_option("-DKERNEL_HEIGHT=" + support::cpp11::to_string(kernel_dims.height));
     build_opts.add_option("-DDEPTH_MULTIPLIER=" + support::cpp11::to_string(depth_multiplier));
+    build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
+    build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
     build_opts.add_option("-D" + string_from_data_layout(input->info()->data_layout()));
     build_opts.add_option_if(has_bias, "-DHAS_BIAS");
     build_opts.add_option_if_else(is_data_type_quantized_asymmetric(input->info()->data_type()),
@@ -105,9 +109,10 @@
     ICLKernel::configure_internal(win);
 }
 
-Status CLDepthwiseIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier)
+Status CLDepthwiseIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier,
+                                         const Size2D &dilation)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, depth_multiplier));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, depth_multiplier, dilation));
 
     return Status{};
 }
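
With DILATION_X/DILATION_Y compiled into the im2col kernel, each generated column samples the input at dilated offsets around its anchor. A scalar sketch of that index arithmetic for a single output position (NCHW layout, one channel; all names here are illustrative, not the kernel's):

    #include <vector>

    // Gather one im2col column for output position (out_x, out_y): writes
    // kernel_w * kernel_h values, sampling the input at dilated offsets and
    // substituting zero for out-of-bounds accesses (i.e. padding).
    std::vector<float> im2col_column(const std::vector<float> &in, int in_w, int in_h,
                                     int out_x, int out_y, int kernel_w, int kernel_h,
                                     int stride, int pad, int dilation)
    {
        std::vector<float> col;
        const int anchor_x = out_x * stride - pad;
        const int anchor_y = out_y * stride - pad;
        for(int ky = 0; ky < kernel_h; ++ky)
        {
            for(int kx = 0; kx < kernel_w; ++kx)
            {
                const int  x     = anchor_x + kx * dilation;
                const int  y     = anchor_y + ky * dilation;
                const bool valid = (x >= 0) && (x < in_w) && (y >= 0) && (y < in_h);
                col.push_back(valid ? in[y * in_w + x] : 0.0f);
            }
        }
        return col;
    }
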
diff --git a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
index d4c1bec..78cc559 100644
--- a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,7 @@
 #include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
@@ -36,74 +37,78 @@
 
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, min_max);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() < 3);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8);
 
     if(output->tensor_shape().total_size() > 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(output);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
     }
 
     return Status{};
 }
 
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *min_max)
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
 {
+    // Configure kernel window
+    Window win = calculate_max_window(*input, Steps());
+
     // Output tensor auto initialization if not yet initialized
     auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32);
 
-    constexpr unsigned int num_elems_processed_per_iteration = 4;
+    // CLDequantizationLayerKernel doesn't need padding so update_window_and_padding() can be skipped
+    Coordinates coord;
+    coord.set_num_dimensions(output->num_dimensions());
+    output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
 
-    // Configure window
-    Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-    AccessWindowStatic     min_max_access(min_max, 0, 0, 2, min_max->dimension(1));
-
-    // Update window and padding
-    bool window_changed = update_window_and_padding(win, input_access, output_access, min_max_access);
-
-    output_access.set_valid_region(win, input->valid_region());
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_tuple(err, win);
+    return std::make_tuple(Status{}, win);
 }
 } // namespace
 
 CLDequantizationLayerKernel::CLDequantizationLayerKernel()
-    : _input(nullptr), _output(nullptr), _min_max(nullptr)
+    : _input(nullptr), _output(nullptr)
 {
 }
 
-void CLDequantizationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *min_max)
+void CLDequantizationLayerKernel::configure(const ICLTensor *input, ICLTensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, min_max);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), min_max->info()));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
 
-    _input   = input;
-    _output  = output;
-    _min_max = min_max;
+    _input  = input;
+    _output = output;
+
+    const int  vec_size_x     = 16 / output->info()->element_size();
+    const int  output_width_x = output->info()->tensor_shape().x();
+    const bool multi_access_x = (output_width_x / vec_size_x > 0);
+
+    // Create and update the window (if needed)
+    Window win = calculate_max_window(*output->info());
+    if(multi_access_x)
+    {
+        win.set(Window::DimX,
+                Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
+    }
+    ICLKernel::configure_internal(win);
 
     // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("dequantization_layer"));
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), output->info(), min_max->info());
-
-    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-
-    ICLKernel::configure_internal(std::get<1>(win_config));
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(input->info()->quantization_info().scale));
+    build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(input->info()->quantization_info().offset));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
+    build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("dequantization_layer", build_opts.options()));
 }
 
-Status CLDequantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+Status CLDequantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, min_max));
-    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), min_max->clone().get())));
-
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
     return Status{};
 }
 
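
The rewritten kernel bakes the quantization parameters in as SCALE/OFFSET build options instead of reading a min/max tensor at run time (the vectorized path additionally clamps its final access via LAST_ACCESSED_X = max(width - vec_size, 0)). The dequantization itself is the usual affine mapping; a sketch:

    #include <cstdint>
    #include <cstdio>

    // Affine dequantization: x = (q - offset) * scale.
    float dequantize(uint8_t q, float scale, int offset)
    {
        return static_cast<float>(static_cast<int>(q) - offset) * scale;
    }

    int main()
    {
        // Hypothetical quantization info: scale 0.1, offset 128.
        std::printf("%f\n", dequantize(138, 0.1f, 128)); // 1.0
        return 0;
    }
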
@@ -115,20 +120,12 @@
     Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), 3);
     Window slice            = window_collapsed.first_slice_window_3D();
 
-    Window min_max_window = window;
-    min_max_window.set(Window::DimX, Window::Dimension(0, 0, 0));
-    min_max_window.set(Window::DimY, Window::Dimension(0, _min_max->info()->dimension(1), 1));
-    min_max_window.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
-    Window min_max_slice = min_max_window.first_slice_window_1D();
-
     do
     {
         unsigned int idx = 0;
         add_3D_tensor_argument(idx, _input, slice);
         add_3D_tensor_argument(idx, _output, slice);
-        add_1D_tensor_argument(idx, _min_max, min_max_slice);
         enqueue(queue, *this, slice);
     }
-    while(window_collapsed.slide_window_slice_3D(slice) && min_max_window.slide_window_slice_1D(min_max_slice));
+    while(window_collapsed.slide_window_slice_3D(slice));
 }
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
index 471b320..12affa9 100644
--- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -284,7 +284,7 @@
     TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input, *weights, conv_info);
 
     // Output auto initialization if not yet initialized
-    // FIXME: input->clone()->set_tensor_shape(output_shape) doesn't work with subtensors for grouped direct convolutions (AlexNet).
+    // TODO(COMPMID-2078): input->clone()->set_tensor_shape(output_shape) doesn't work with subtensors for grouped direct convolutions (AlexNet).
     auto_init_if_empty(*output, output_shape,
                        1,
                        input->data_type(),
@@ -363,7 +363,7 @@
     TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input->info(), *weights->info(), conv_info);
 
     // Output auto initialization if not yet initialized
-    // FIXME: input->clone()->set_tensor_shape(output_shape) doesn't work with subtensors for grouped direct convolutions (AlexNet).
+    // TODO(COMPMID-2078): input->clone()->set_tensor_shape(output_shape) doesn't work with subtensors for grouped direct convolutions (AlexNet).
     auto_init_if_empty(*output->info(),
                        output_shape,
                        1,
diff --git a/src/core/CL/kernels/CLElementwiseOperationKernel.cpp b/src/core/CL/kernels/CLElementwiseOperationKernel.cpp
index 37eeeb7..63c9244 100644
--- a/src/core/CL/kernels/CLElementwiseOperationKernel.cpp
+++ b/src/core/CL/kernels/CLElementwiseOperationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -281,7 +281,7 @@
 {
     const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
     const unsigned int border        = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
-    return BorderSize(0, border, 0, 0);
+    return BorderSize{ 0, border, 0, 0 };
 }
 
 /** Arithmetic operations with saturation */
diff --git a/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
new file mode 100644
index 0000000..b04293d
--- /dev/null
+++ b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLFFTDigitReverseKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != 1 && input->num_channels() != 2);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(idx, 1, DataType::U32);
+    ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[config.axis] != idx->tensor_shape().x());
+
+    // Checks performed when output is configured
+    if((output != nullptr) && (output->total_size() != 0))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 2);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+{
+    ARM_COMPUTE_UNUSED(idx, config);
+
+    auto_init_if_empty(*output, input->clone()->set_num_channels(2));
+
+    Window win = calculate_max_window(*output, Steps());
+    output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+
+    return std::make_pair(Status{}, win);
+}
+} // namespace
+
+CLFFTDigitReverseKernel::CLFFTDigitReverseKernel()
+    : _input(nullptr), _output(nullptr), _idx(nullptr)
+{
+}
+
+void CLFFTDigitReverseKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, idx);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), idx->info(), config));
+
+    _input  = input;
+    _output = output;
+    _idx    = idx;
+
+    // Create kernel
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(input->info()->num_channels()));
+    build_opts.add_option_if(config.conjugate, "-DCONJ");
+    std::string kernel_name = "fft_digit_reverse_axis_" + support::cpp11::to_string(config.axis);
+    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), output->info(), idx->info(), config);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure_internal(win_config.second);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(1));
+}
+
+Status CLFFTDigitReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, idx, config));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first);
+
+    return Status{};
+}
+
+void CLFFTDigitReverseKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, slice);
+        add_3D_tensor_argument(idx, _output, slice);
+        add_1D_tensor_argument(idx, _idx, slice);
+        enqueue(queue, *this, slice, lws_hint());
+    }
+    while(collapsed.slide_window_slice_3D(slice));
+}
+} // namespace arm_compute
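
The digit-reverse kernel permutes elements according to the precomputed index tensor passed as idx. For the pure radix-2 case that table reduces to plain bit reversal; a sketch of how such a table could be built (the library computes the general mixed-radix table elsewhere):

    #include <cstdio>
    #include <vector>

    // Bit-reversal permutation indices for a power-of-two length N
    // (the radix-2 special case of an FFT digit-reverse table).
    std::vector<unsigned int> bit_reverse_indices(unsigned int N, unsigned int log2N)
    {
        std::vector<unsigned int> idx(N);
        for(unsigned int i = 0; i < N; ++i)
        {
            unsigned int r = 0;
            for(unsigned int b = 0; b < log2N; ++b)
            {
                r = (r << 1) | ((i >> b) & 1u);
            }
            idx[i] = r;
        }
        return idx;
    }

    int main()
    {
        for(unsigned int v : bit_reverse_indices(8, 3))
        {
            std::printf("%u ", v); // 0 4 2 6 1 5 3 7
        }
        std::printf("\n");
        return 0;
    }
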
diff --git a/src/core/CL/kernels/CLFFTRadixStageKernel.cpp b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
new file mode 100644
index 0000000..83d55b7
--- /dev/null
+++ b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLFFTRadixStageKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+
+#include <cmath>
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(CLFFTRadixStageKernel::supported_radix().count(config.radix) == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[config.axis] % config.radix);
+
+    // Checks performed when output is configured
+    if((output != nullptr) && (output->total_size() != 0))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+{
+    if(output != nullptr)
+    {
+        auto_init_if_empty(*output, *input);
+    }
+
+    // Setup window steps
+    Steps steps;
+    steps.set(config.axis, config.radix);
+
+    Window win = calculate_max_window(*input, steps);
+    if(output != nullptr)
+    {
+        output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+    }
+
+    return std::make_pair(Status{}, win);
+}
+} // namespace
+
+CLFFTRadixStageKernel::CLFFTRadixStageKernel()
+    : _input(nullptr), _output(nullptr), _run_in_place(false)
+{
+}
+
+void CLFFTRadixStageKernel::configure(ICLTensor *input, ICLTensor *output, const FFTRadixStageKernelInfo &config)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config));
+
+    _input        = input;
+    _output       = output;
+    _run_in_place = (output == nullptr) || (output == input);
+
+    // Create build options
+    CLBuildOptions build_opts;
+    build_opts.add_option_if(_run_in_place, "-DIN_PLACE");
+
+    // Create kernel
+    std::string kernel_name = "fft";
+    kernel_name += "_radix_" + support::cpp11::to_string(config.radix);
+    kernel_name += (config.is_first_stage) ? "_first_stage" : "";
+    kernel_name += "_axis_" + support::cpp11::to_string(config.axis);
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+    // Set static arguments if not the first stage
+    if(!config.is_first_stage)
+    {
+        const unsigned int Ni        = config.Nx * config.radix;
+        const float        exp_const = (-2.0 * M_PI) / static_cast<float>(Ni);
+        unsigned int       idx       = (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters
+        _kernel.setArg<cl_uint>(idx++, config.Nx);
+        _kernel.setArg<cl_uint>(idx++, Ni);
+        _kernel.setArg<cl_float>(idx, exp_const);
+    }
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info(), config);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure_internal(win_config.second);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(1));
+}
+
+Status CLFFTRadixStageKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+{
+    const bool run_in_place = (output == nullptr) || (output == input);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, config));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
+                                                              (run_in_place) ? nullptr : output->clone().get(),
+                                                              config)
+                                .first);
+
+    return Status{};
+}
+
+std::set<unsigned int> CLFFTRadixStageKernel::supported_radix()
+{
+    return std::set<unsigned int> { 2, 3, 4, 5, 7, 8 };
+}
+
+void CLFFTRadixStageKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, slice);
+        if(!_run_in_place)
+        {
+            add_3D_tensor_argument(idx, _output, slice);
+        }
+        enqueue(queue, *this, slice, lws_hint());
+    }
+    while(collapsed.slide_window_slice_3D(slice));
+}
+} // namespace arm_compute
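
For stages after the first, the kernel receives Nx, Ni = Nx * radix and exp_const = -2*pi/Ni as static arguments; the twiddle factor applied inside a butterfly is then exp(j * exp_const * k). A sketch with std::complex and hypothetical stage parameters:

    #include <complex>
    #include <cstdio>

    int main()
    {
        // Hypothetical second stage of a radix-4 FFT: Nx = 4, Ni = Nx * radix = 16.
        const unsigned int Nx        = 4;
        const unsigned int radix     = 4;
        const unsigned int Ni        = Nx * radix;
        const float        exp_const = (-2.0f * 3.14159265f) / static_cast<float>(Ni);

        // Twiddle factor w^k = exp(j * exp_const * k), here for k = 3.
        const unsigned int        k = 3;
        const std::complex<float> w = std::polar(1.0f, exp_const * static_cast<float>(k));
        std::printf("w = %f %+fi\n", w.real(), w.imag());
        return 0;
    }
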
diff --git a/src/core/CL/kernels/CLFFTScaleKernel.cpp b/src/core/CL/kernels/CLFFTScaleKernel.cpp
new file mode 100644
index 0000000..59f1fd7
--- /dev/null
+++ b/src/core/CL/kernels/CLFFTScaleKernel.cpp
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLFFTScaleKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F32);
+
+    // Checks performed when output is configured
+    if((output != nullptr) && (output->total_size() != 0))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+    // Configure kernel window
+    Window win = calculate_max_window(*input, Steps());
+
+    if(output != nullptr)
+    {
+        // Output auto initialization if not yet initialized
+        auto_init_if_empty(*output, *input->clone());
+
+        // CLFFTScaleKernel doesn't need padding so update_window_and_padding() can be skipped
+        Coordinates coord;
+        coord.set_num_dimensions(output->num_dimensions());
+        output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
+    }
+
+    return std::make_pair(Status{}, win);
+}
+} // namespace
+
+CLFFTScaleKernel::CLFFTScaleKernel()
+    : _input(nullptr), _output(nullptr), _run_in_place(false)
+{
+}
+
+void CLFFTScaleKernel::configure(ICLTensor *input, ICLTensor *output, const FFTScaleKernelInfo &config)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr));
+
+    _input        = input;
+    _output       = output;
+    _run_in_place = (output == nullptr) || (output == input);
+
+    // Create kernel
+    CLBuildOptions build_opts;
+    build_opts.add_option_if(_run_in_place, "-DIN_PLACE");
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(output != nullptr ? output->info()->num_channels() : input->info()->num_channels()));
+    build_opts.add_option_if(config.conjugate, "-DCONJ");
+    std::string kernel_name = "fft_scale_conj";
+    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+    // Set static arguments
+    unsigned int idx = (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters
+    _kernel.setArg<cl_float>(idx, config.scale);
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), _run_in_place ? nullptr : output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure_internal(win_config.second);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(1));
+}
+
+Status CLFFTScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const FFTScaleKernelInfo &config)
+{
+    ARM_COMPUTE_UNUSED(config);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+
+    return Status{};
+}
+
+void CLFFTScaleKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, slice);
+        if(!_run_in_place)
+        {
+            add_3D_tensor_argument(idx, _output, slice);
+        }
+        enqueue(queue, *this, slice, lws_hint());
+    }
+    while(collapsed.slide_window_slice_3D(slice));
+}
+} // namespace arm_compute
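
The scale kernel multiplies every element by config.scale (typically 1/N when normalizing an inverse FFT) and conjugates when -DCONJ is set. A host-side sketch of the same element-wise operation:

    #include <complex>
    #include <cstdio>
    #include <vector>

    int main()
    {
        // Hypothetical inverse-FFT normalization: scale = 1/N with conjugation enabled.
        std::vector<std::complex<float>> data      = { { 8.0f, -4.0f }, { 2.0f, 2.0f } };
        const float                      scale     = 1.0f / 4.0f;
        const bool                       conjugate = true;

        for(auto &v : data)
        {
            v *= scale;
            if(conjugate)
            {
                v = std::conj(v);
            }
        }
        std::printf("%f %f\n", data[0].real(), data[0].imag()); // 2.0 1.0
        return 0;
    }
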
diff --git a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
index e14b8a3..150d9b6 100644
--- a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
+++ b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -139,7 +139,7 @@
                                                   epsilon));
 
     // Configure kernel window
-    const unsigned int num_elems_processed_per_iteration_x = 16 / conv_weights->info()->element_size();
+    const unsigned int num_elems_processed_per_iteration_x = 4;
     const int          output_width_x                      = conv_weights->info()->tensor_shape().x();
     const bool         multi_access_x                      = (output_width_x / num_elems_processed_per_iteration_x > 0);
 
@@ -216,6 +216,6 @@
     {
         add_1D_tensor_argument(idx, _bn_gamma, vector_slice);
     }
-    enqueue(queue, *this, slice, lws_hint());
+    enqueue(queue, *this, slice);
 }
 } // namespace arm_compute
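
This kernel folds batch normalization into the preceding convolution's weights and bias. The standard per-channel folding, which the device code is assumed to compute, is w' = w * gamma / sqrt(var + eps) and b' = (b - mean) * gamma / sqrt(var + eps) + beta; a scalar sketch:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        // Hypothetical per-channel parameters.
        const float w = 0.5f, b = 0.1f;                      // conv weight and bias
        const float mean = 0.2f, var = 0.09f;                // BN statistics
        const float gamma = 1.5f, beta = -0.3f, eps = 1e-5f; // BN parameters

        const float inv_std = gamma / std::sqrt(var + eps);
        const float fused_w = w * inv_std;
        const float fused_b = (b - mean) * inv_std + beta;
        std::printf("w'=%f b'=%f\n", fused_w, fused_b);
        return 0;
    }
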
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp
index e9be1a6..a8c1704 100644
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp
@@ -165,7 +165,7 @@
 } // namespace
 
 CLGEMMLowpMatrixMultiplyReshapedKernel::CLGEMMLowpMatrixMultiplyReshapedKernel()
-    : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_output_as_3d(false), _k(1)
+    : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_output_as_3d(false), _k(1), _use_dummy_work_items(false)
 {
 }
 
@@ -181,6 +181,7 @@
     _output                   = output;
     _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
     _k                        = gemm_info.k();
+    _use_dummy_work_items     = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
 
     // Check if we need to slide the matrix B
     const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
@@ -201,6 +202,9 @@
     build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
     build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE");
     build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
+    build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
+    build_opts.add_option("-DM=" + support::cpp11::to_string(gemm_info.m()));
+    build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n()));
     build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
     build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
     build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
@@ -302,7 +306,7 @@
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
-        enqueue(queue, *this, slice, lws_hint());
+        enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
     }
     while(window.slide_window_slice_3D(slice));
 }
\ No newline at end of file
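The _use_dummy_work_items flag threaded into enqueue() above pairs with the new -DDUMMY_WORK_ITEMS build option. A minimal host-side sketch of the scheme this implies, assuming the usual pattern (this helper is illustrative, not the library's actual implementation): the global work size is rounded up to a multiple of the local work size, and the kernel returns early for the extra (dummy) work items whose coordinates exceed the real M/N extent.

    #include <CL/cl2.hpp>
    #include <cstddef>

    // Illustrative sketch only: pad the global work size up to a multiple of
    // the local work size. Kernels built with -DDUMMY_WORK_ITEMS are then
    // expected to return early for work items beyond the real problem size.
    inline cl::NDRange pad_gws_to_lws(const cl::NDRange &gws, const cl::NDRange &lws)
    {
        const size_t *g         = gws;
        const size_t *l         = lws;
        size_t        padded[3] = { 1, 1, 1 };
        for(size_t i = 0; i < gws.dimensions(); ++i)
        {
            const size_t step = (i < lws.dimensions() && l[i] != 0) ? l[i] : 1;
            padded[i]         = ((g[i] + step - 1) / step) * step; // ceil to multiple of step
        }
        return cl::NDRange(padded[0], padded[1], padded[2]);
    }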
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp
new file mode 100644
index 0000000..923b952
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp
@@ -0,0 +1,324 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "support/ToolchainSupport.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+using namespace arm_compute::misc::shape_calculator;
+
+namespace arm_compute
+{
+namespace
+{
+using ElementsProcessed = Steps;
+
+Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
+                          const GEMMReshapeInfo &gemm_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3) || (rhs_info.k0 > 16)), "Only 2,3,4,8,16 are supported for k0");
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3) || rhs_info.n0 > 16), "Only 2,3,4,8,16 are supported for n0");
+
+    const int m = gemm_info.m();
+    const int n = gemm_info.n();
+    const int k = gemm_info.k();
+
+    TensorShape tensor_shape1{ input1->tensor_shape() };
+    tensor_shape1.set(0, n);
+    tensor_shape1.set(1, k);
+
+    const TensorInfo tensor_info1          = input1->clone()->set_tensor_shape(tensor_shape1);
+    const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
+
+    ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != static_cast<unsigned int>(k));
+    if(gemm_info.reinterpret_input_as_3d())
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) != static_cast<unsigned int>(m));
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != static_cast<unsigned int>(m));
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
+
+    if(output->total_size() != 0)
+    {
+        const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
+                                                        const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed)
+{
+    unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
+    unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+    bool          reinterpret_input_as_3d             = gemm_info.reinterpret_input_as_3d();
+    bool          reinterpret_output_as_3d            = (gemm_info.depth_output_gemm3d() != 0);
+
+    Window win{};
+    Window win_out{};
+    bool   window_changed = false;
+
+    // In case both input and output have to be reinterpreted as 3D tensors,
+    // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
+    if(reinterpret_input_as_3d == reinterpret_output_as_3d)
+    {
+        reinterpret_output_as_3d = false;
+    }
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)).set_data_type(DataType::S32));
+
+    TensorInfo tmp_info(*output);
+
+    if(reinterpret_output_as_3d)
+    {
+        // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
+        // the window needs to be constructed on the 2D collapsed version of the tensor
+        TensorShape tmp_shape(output->tensor_shape());
+        tmp_shape.collapse(2U, 1U);
+        tmp_info.set_tensor_shape(tmp_shape);
+    }
+
+    // Configure kernel window
+    num_elems_processed_per_iteration_x = rhs_info.n0;
+    num_elems_processed_per_iteration_y = lhs_info.m0;
+
+    // Note: bottom paddings are calculated manually as the output can be reinterpreted as a 3D tensor
+    // The only way to set the paddings correctly is to set them explicitly through the AccessWindowStatic
+    const int m          = gemm_info.m();
+    const int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y;
+
+    win     = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+    win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+    AccessWindowStatic input0_access(input0, 0, 0,
+                                     ceil_to_multiple(input0->dimension(0), lhs_info.k0),
+                                     input0->dimension(1) + bottom_pad);
+    AccessWindowStatic input1_access(input1, 0, 0,
+                                     input1->dimension(0),
+                                     input1->dimension(1));
+    AccessWindowStatic output_access(output, 0, 0,
+                                     ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),
+                                     output->dimension(1) + bottom_pad);
+
+    window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
+                     update_window_and_padding(win_out, output_access);              // window used to update the padding requirements of output tensor
+
+    output_access.set_valid_region(win_out, ValidRegion(Coordinates(), output->tensor_shape()));
+
+    // Collapse along the Z direction
+    // This collapse needs to be here in order to tune the Z dimension of LWS
+    Window             collapsed             = win;
+    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
+    collapsed                                = win.collapse(win, dimension_to_collapse);
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, collapsed);
+}
+} // namespace
+
+CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel()
+    : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _use_dummy_work_items(false)
+{
+}
+
+void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
+                                                              const GEMMReshapeInfo &gemm_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info));
+
+    _input0                   = input0;
+    _input1                   = input1;
+    _output                   = output;
+    _reinterpret_input_as_3d  = gemm_info.reinterpret_input_as_3d();
+    _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
+    _use_dummy_work_items     = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
+
+    // In case both input and output have to be reinterpreted as 3D tensors,
+    // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
+    if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
+    {
+        _reinterpret_input_as_3d  = false;
+        _reinterpret_output_as_3d = false;
+    }
+
+    // Check if we need to slide the matrix B
+    const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
+    _slide_matrix_b                          = (_input1->info()->num_dimensions() >= num_dimensions_input0);
+
+    ElementsProcessed num_elements_processed{};
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info, num_elements_processed);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure_internal(win_config.second);
+
+    // Create build options
+    CLBuildOptions build_opts;
+    build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
+    build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
+    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1)));
+    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2)));
+    build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
+    build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
+    build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
+    build_opts.add_option("-DM=" + support::cpp11::to_string(input0->info()->dimension(1)));
+    build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n()));
+    build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k()));
+    build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
+    build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
+    build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
+    build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
+
+    std::string kernel_name("gemmlowp_mm_reshaped_only_rhs_");
+    kernel_name += rhs_info.transpose ? "t" : "nt";
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
+    _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
+    _config_id += support::cpp11::to_string(output->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(gemm_info.k());
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.m0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.n0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.k0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.h0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.interleave);
+}
+
+Status CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info,
+                                                               const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
+{
+    ElementsProcessed num_elements_processed{};
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, lhs_info, rhs_info, gemm_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
+                                                              input1->clone().get(),
+                                                              output->clone().get(),
+                                                              lhs_info,
+                                                              rhs_info,
+                                                              gemm_info,
+                                                              num_elements_processed)
+                                .first);
+
+    return Status{};
+}
+
+void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    if(_input1->info()->num_dimensions() < 3)
+    {
+        // The stride_z for matrix B must be zero if we do not slice
+        ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
+    }
+
+    Window slice          = window.first_slice_window_3D();
+    Window slice_matrix_b = slice;
+
+    slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
+    slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+    if(_reinterpret_input_as_3d)
+    {
+        // Pass bottom paddings to the kernel if the input has to be reinterpreted as a 3D tensor
+        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3;
+        const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;
+        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+    }
+
+    if(_reinterpret_output_as_3d)
+    {
+        // Pass bottom paddings to the kernel if the output has to be reinterpreted as a 3D tensor
+        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
+        const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
+        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+    }
+
+    do
+    {
+        Window slice_b = slice;
+        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+        if(!_slide_matrix_b)
+        {
+            slice_b = slice_matrix_b;
+        }
+
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, _input0, slice);
+        add_2D_tensor_argument(idx, _input1, slice_b);
+        add_2D_tensor_argument(idx, _output, slice);
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
+        enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
+    }
+    while(window.slide_window_slice_3D(slice));
+}
+} // namespace arm_compute
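The bottom_pad computation in validate_and_configure_window above rounds m up to the next multiple of the y-step. A self-contained restatement of that rule (the helper name is ours, for illustration):

    // Restatement of the padding rule: pad m up to the next multiple of the
    // y-step (lhs_info.m0). E.g. m = 10, m0 = 4 -> pad = 2, so 10 rows are
    // processed as 12.
    constexpr int bottom_pad(int m, int m0)
    {
        return (m0 - (m % m0)) % m0;
    }
    static_assert(bottom_pad(10, 4) == 2, "10 -> 12");
    static_assert(bottom_pad(12, 4) == 0, "already aligned");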
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp
index 83af0c6..8fba342 100644
--- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -51,7 +51,6 @@
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
     ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type == GEMMLowpOutputStageType::NONE);
-    ARM_COMPUTE_RETURN_ERROR_ON(bias == nullptr && a_offset == 0 && b_offset == 0);
     ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_max_bound > 255);
     ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound < 0 || output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
 
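The gemmlowp_min_bound/gemmlowp_max_bound checks retained above bound the quantize-down output stage to the QASYMM8 range [0, 255]. A hedged sketch of what such a stage computes (illustrative restatement; the library's exact fixed-point rounding may differ):

    #include <algorithm>
    #include <cstdint>

    // Illustrative only: S32 accumulator -> QASYMM8, saturated to
    // [min_bound, max_bound]; mult/shift approximate the rescale factor.
    inline uint8_t quantize_down(int32_t acc, int32_t offset, int32_t mult, int32_t shift,
                                 int32_t min_bound, int32_t max_bound)
    {
        int32_t res = static_cast<int32_t>((static_cast<int64_t>(acc) * mult) >> shift) + offset;
        res         = std::max(min_bound, std::min(max_bound, res));
        return static_cast<uint8_t>(res);
    }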
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
index b6816ac..8969124 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
@@ -168,7 +168,7 @@
 } // namespace
 
 CLGEMMMatrixMultiplyReshapedKernel::CLGEMMMatrixMultiplyReshapedKernel()
-    : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_output_as_3d(false), _k(1)
+    : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_output_as_3d(false), _k(1), _use_dummy_work_items(false)
 {
 }
 
@@ -184,6 +184,7 @@
     _output                   = output;
     _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
     _k                        = gemm_info.k();
+    _use_dummy_work_items     = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
 
     // Check if we need to slide the matrix B
     const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
@@ -206,6 +207,9 @@
     build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
     build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE");
     build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
+    build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
+    build_opts.add_option("-DM=" + support::cpp11::to_string(gemm_info.m()));
+    build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n()));
     build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
     build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
     build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
@@ -308,7 +312,7 @@
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
-        enqueue(queue, *this, slice, lws_hint());
+        enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
     }
     while(window.slide_window_slice_3D(slice));
 }
\ No newline at end of file
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
new file mode 100644
index 0000000..2437265
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
@@ -0,0 +1,331 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "support/ToolchainSupport.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+using namespace arm_compute::misc::shape_calculator;
+
+namespace arm_compute
+{
+namespace
+{
+using ElementsProcessed = Steps;
+
+Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
+                          const GEMMReshapeInfo &gemm_info)
+{
+    ARM_COMPUTE_UNUSED(alpha);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32, DataType::F16);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
+    ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16);
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
+
+    const int m = gemm_info.m();
+    const int n = gemm_info.n();
+    const int k = gemm_info.k();
+
+    TensorShape tensor_shape1{ input1->tensor_shape() };
+    tensor_shape1.set(0, n);
+    tensor_shape1.set(1, k);
+
+    const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
+
+    const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
+
+    ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != static_cast<unsigned int>(k));
+    if(gemm_info.reinterpret_input_as_3d())
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) != static_cast<unsigned int>(m));
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != static_cast<unsigned int>(m));
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
+
+    if(output->total_size() != 0)
+    {
+        const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
+                                                        const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed)
+{
+    unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
+    unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+    bool          reinterpret_input_as_3d             = gemm_info.reinterpret_input_as_3d();
+    bool          reinterpret_output_as_3d            = (gemm_info.depth_output_gemm3d() != 0);
+
+    Window win{};
+    Window win_out{};
+    bool   window_changed = false;
+
+    // In case both input and output have to be reinterpreted as 3D tensors,
+    // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
+    if(reinterpret_input_as_3d == reinterpret_output_as_3d)
+    {
+        reinterpret_output_as_3d = false;
+    }
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)));
+
+    TensorInfo tmp_info(*output);
+
+    if(reinterpret_output_as_3d)
+    {
+        // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
+        // the window needs to be constructed on the 2D collapsed version of the tensor
+        TensorShape tmp_shape(output->tensor_shape());
+        tmp_shape.collapse(2U, 1U);
+        tmp_info.set_tensor_shape(tmp_shape);
+    }
+
+    // Configure kernel window
+    num_elems_processed_per_iteration_x = rhs_info.n0;
+    num_elems_processed_per_iteration_y = lhs_info.m0;
+
+    // Note: bottom paddings are calculated manually as the output can be reinterpreted as a 3D tensor
+    // The only way to set the paddings correctly is to set them explicitly through the AccessWindowStatic
+    const int m          = gemm_info.m();
+    const int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y;
+
+    win     = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+    win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+    AccessWindowStatic input0_access(input0, 0, 0,
+                                     input0->dimension(0),
+                                     input0->dimension(1) + bottom_pad);
+    AccessWindowStatic input1_access(input1, 0, 0,
+                                     input1->dimension(0),
+                                     input1->dimension(1));
+    AccessWindowStatic output_access(output, 0, 0,
+                                     ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),
+                                     output->dimension(1) + bottom_pad);
+
+    window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
+                     update_window_and_padding(win_out, output_access);              // window used to update the padding requirements of output tensor
+
+    output_access.set_valid_region(win_out, ValidRegion(Coordinates(), output->tensor_shape()));
+
+    // Collapse along the Z direction
+    // This collapse needs to be here in order to tune the Z dimension of LWS
+    Window             collapsed             = win;
+    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
+    collapsed                                = win.collapse(win, dimension_to_collapse);
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, collapsed);
+}
+} // namespace
+
+CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::CLGEMMMatrixMultiplyReshapedOnlyRHSKernel()
+    : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _use_dummy_work_items(false)
+{
+}
+
+void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, const GEMMLHSMatrixInfo &lhs_info,
+                                                          const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), alpha, lhs_info, rhs_info, gemm_info));
+
+    _input0                   = input0;
+    _input1                   = input1;
+    _output                   = output;
+    _reinterpret_input_as_3d  = gemm_info.reinterpret_input_as_3d();
+    _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
+    _use_dummy_work_items     = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
+
+    // In case both input and output have to be reinterpreted as 3D tensors,
+    // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
+    if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
+    {
+        _reinterpret_input_as_3d  = false;
+        _reinterpret_output_as_3d = false;
+    }
+
+    // Check if we need to slide the matrix B
+    const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
+    _slide_matrix_b                          = (_input1->info()->num_dimensions() >= num_dimensions_input0);
+
+    ElementsProcessed num_elements_processed{};
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info, num_elements_processed);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure_internal(win_config.second);
+
+    // Create build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
+    build_opts.add_option_if(std::abs(1.0f - alpha) > 0.00001f, "-DALPHA=" + float_to_string_with_full_precision(alpha));
+    build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
+    build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
+    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1)));
+    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2)));
+    build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
+    build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
+    build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
+    build_opts.add_option("-DM=" + support::cpp11::to_string(input0->info()->dimension(1)));
+    build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n()));
+    build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k()));
+    build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
+    build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
+    build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
+    build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
+
+    std::string kernel_name("gemm_mm_reshaped_only_rhs_");
+    kernel_name += rhs_info.transpose ? "t" : "nt";
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
+    _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
+    _config_id += lower_string(string_from_data_type(input0->info()->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(gemm_info.k());
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.m0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.n0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.k0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.h0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.interleave);
+}
+
+Status CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, const GEMMLHSMatrixInfo &lhs_info,
+                                                           const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
+{
+    ElementsProcessed num_elements_processed{};
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, alpha, lhs_info, rhs_info, gemm_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
+                                                              input1->clone().get(),
+                                                              output->clone().get(),
+                                                              lhs_info,
+                                                              rhs_info,
+                                                              gemm_info,
+                                                              num_elements_processed)
+                                .first);
+
+    return Status{};
+}
+
+void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    if(_input1->info()->num_dimensions() < 3)
+    {
+        // The stride_z for matrix B must be zero if we do not slice
+        ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
+    }
+
+    Window slice          = window.first_slice_window_3D();
+    Window slice_matrix_b = slice;
+
+    slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
+    slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+    if(_reinterpret_input_as_3d)
+    {
+        // Pass bottom paddings to the kernel if the input has to be reinterpreted as a 3D tensor
+        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3;
+        const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;
+        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+    }
+
+    if(_reinterpret_output_as_3d)
+    {
+        // Pass bottom paddings to the kernel if the output has to be reinterpreted as a 3D tensor
+        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
+        const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
+        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+    }
+
+    do
+    {
+        Window slice_b = slice;
+        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+        if(!_slide_matrix_b)
+        {
+            slice_b = slice_matrix_b;
+        }
+
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, _input0, slice);
+        add_2D_tensor_argument(idx, _input1, slice_b);
+        add_2D_tensor_argument(idx, _output, slice);
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
+        enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
+    }
+    while(window.slide_window_slice_3D(slice));
+}
+} // namespace arm_compute
\ No newline at end of file
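A hypothetical host-side usage sketch for this new kernel (tensor shapes, allocation and the RHS reshape step are omitted; the tile sizes are made-up examples, and the variable names are ours):

    #include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    void example()
    {
        CLTensor lhs, rhs_reshaped, dst; // initialization/allocation omitted

        GEMMLHSMatrixInfo lhs_info;
        lhs_info.m0 = 4;                // rows processed per work item
        GEMMRHSMatrixInfo rhs_info;
        rhs_info.n0         = 4;        // columns processed per work item
        rhs_info.k0         = 4;
        rhs_info.h0         = 2;
        rhs_info.interleave = true;
        rhs_info.transpose  = true;     // selects the "..._t" kernel variant

        const GEMMReshapeInfo gemm_info(64 /* m */, 64 /* n */, 32 /* k */);

        CLGEMMMatrixMultiplyReshapedOnlyRHSKernel mm_kernel;
        Status status = CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::validate(
            lhs.info(), rhs_reshaped.info(), dst.info(), 1.0f, lhs_info, rhs_info, gemm_info);
        if(bool(status))
        {
            mm_kernel.configure(&lhs, &rhs_reshaped, &dst, 1.0f, lhs_info, rhs_info, gemm_info);
        }
    }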
diff --git a/src/core/CL/kernels/CLGaussian5x5Kernel.cpp b/src/core/CL/kernels/CLGaussian5x5Kernel.cpp
index bd523c8..3b45b07 100644
--- a/src/core/CL/kernels/CLGaussian5x5Kernel.cpp
+++ b/src/core/CL/kernels/CLGaussian5x5Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,17 +29,17 @@
 
 void CLGaussian5x5HorKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
 {
-    const int16_t matrix[] = { 1, 4, 6, 4, 1 };
+    const std::array<int16_t, 5> matrix = { 1, 4, 6, 4, 1 };
 
     // Set arguments
-    CLSeparableConvolution5x5HorKernel::configure(input, output, matrix, border_undefined);
+    CLSeparableConvolution5x5HorKernel::configure(input, output, matrix.data(), border_undefined);
 }
 
 void CLGaussian5x5VertKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
 {
-    const uint32_t scale    = 256;
-    const int16_t  matrix[] = { 1, 4, 6, 4, 1 };
+    const uint32_t scale = 256;
+    const std::array<int16_t, 5> matrix = { 1, 4, 6, 4, 1 };
 
     // Set arguments
-    CLSeparableConvolution5x5VertKernel::configure(input, output, matrix, scale, border_undefined);
+    CLSeparableConvolution5x5VertKernel::configure(input, output, matrix.data(), scale, border_undefined);
 }
diff --git a/src/core/CL/kernels/CLGaussianPyramidKernel.cpp b/src/core/CL/kernels/CLGaussianPyramidKernel.cpp
index 6b729c8..c9c7bf3 100644
--- a/src/core/CL/kernels/CLGaussianPyramidKernel.cpp
+++ b/src/core/CL/kernels/CLGaussianPyramidKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -38,7 +38,7 @@
 
 BorderSize CLGaussianPyramidHorKernel::border_size() const
 {
-    return BorderSize(0, 2);
+    return BorderSize{ 0, 2 };
 }
 
 void CLGaussianPyramidHorKernel::configure(const ICLTensor *input, ICLTensor *output)
@@ -130,7 +130,7 @@
 
 BorderSize CLGaussianPyramidVertKernel::border_size() const
 {
-    return BorderSize(2, 0);
+    return BorderSize{ 2, 0 };
 }
 
 void CLGaussianPyramidVertKernel::configure(const ICLTensor *input, ICLTensor *output)
diff --git a/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp
new file mode 100644
index 0000000..e3f2a96
--- /dev/null
+++ b/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "support/ToolchainSupport.h"
+
+#include <map>
+
+using namespace arm_compute;
+namespace
+{
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsigned int height_offset, ITensorInfo *output, unsigned int &num_elems_processed_per_iteration)
+{
+    num_elems_processed_per_iteration = 4;
+    // The window needs to be based on the input, as we copy the full height of the input
+    Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output, height_offset, num_elems_processed_per_iteration);
+    bool                   window_changed = update_window_and_padding(win, input_access, output_access);
+
+    Window win_collapsed = win.collapse(win, Window::DimZ);
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win_collapsed);
+}
+Status validate_arguments(const ITensorInfo *input, unsigned int height_offset, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::F16, DataType::U32,
+                                                         DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimY) + height_offset > output->dimension(Window::DimY));
+
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != output->dimension(0));
+    for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i));
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+
+    return Status{};
+}
+} // namespace
+
+CLHeightConcatenateLayerKernel::CLHeightConcatenateLayerKernel()
+    : _input(nullptr), _output(nullptr), _height_offset(0), _num_elems_processed_per_iteration()
+{
+}
+
+Status CLHeightConcatenateLayerKernel::validate(const ITensorInfo *input, unsigned int height_offset, const ITensorInfo *output)
+{
+    unsigned int num_elems_processed_per_iteration;
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, height_offset, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), height_offset, output->clone().get(), num_elems_processed_per_iteration).first);
+    return Status{};
+}
+
+void CLHeightConcatenateLayerKernel::configure(const ICLTensor *input, unsigned int height_offset, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), height_offset, output->info()));
+
+    _input         = input;
+    _output        = output;
+    _height_offset = height_offset;
+
+    auto win_config = validate_and_configure_window(input->info(), height_offset, output->info(), _num_elems_processed_per_iteration);
+
+    // Add build options
+    CLBuildOptions build_opts;
+
+    switch(input->info()->element_size())
+    {
+        case 1:
+        {
+            build_opts.add_option("-DDATA_TYPE=uchar");
+            break;
+        }
+        case 2:
+        {
+            build_opts.add_option("-DDATA_TYPE=short");
+            break;
+        }
+        case 4:
+        {
+            build_opts.add_option("-DDATA_TYPE=int");
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("Unsupported input data type.");
+            break;
+        }
+    }
+
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration));
+    build_opts.add_option("-DHEIGHT_OFFSET=" + support::cpp11::to_string(_height_offset));
+    build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
+
+    if(is_data_type_quantized_asymmetric(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info())
+    {
+        build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(input->info()->quantization_info().offset));
+        build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().offset));
+        build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(input->info()->quantization_info().scale));
+        build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().scale));
+    }
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_height", build_opts.options()));
+    // Configure kernel window
+
+    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+
+    ICLKernel::configure_internal(std::get<1>(win_config));
+}
+
+void CLHeightConcatenateLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, window);
+    add_4D_tensor_argument(idx, _output, window);
+    enqueue(queue, *this, window);
+}
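The OFFSET_*/SCALE_* build options above let the kernel requantize when the input and output quantization infos differ. A host-side restatement of the standard asymmetric requantization identity those parameters feed (illustrative, not the .cl code):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // real = scale_in * (q_in - offset_in); q_out = round(real / scale_out) + offset_out
    inline uint8_t requantize(uint8_t q_in, float scale_in, int offset_in,
                              float scale_out, int offset_out)
    {
        const float real = scale_in * (static_cast<int>(q_in) - offset_in);
        const int   q    = static_cast<int>(std::lround(real / scale_out)) + offset_out;
        return static_cast<uint8_t>(std::min(255, std::max(0, q)));
    }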
diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
index e33dab0..cb2e294 100644
--- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
+++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
@@ -117,7 +117,7 @@
     {
         case 0:
             kernel_name = "x";
-            idx         = num_arguments_per_1D_tensor() * 3;
+            idx         = num_arguments_per_2D_tensor() * 3;
             break;
         case 1:
             kernel_name = "y";
@@ -169,17 +169,17 @@
         case 0:
         {
             window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
-            Window in_slice  = window.first_slice_window_1D();
-            Window sum_slice = window_sum.first_slice_window_1D();
+            Window in_slice  = window.first_slice_window_2D();
+            Window sum_slice = window_sum.first_slice_window_2D();
             do
             {
                 unsigned int idx = 0;
-                add_1D_tensor_argument(idx, _input, in_slice);
-                add_1D_tensor_argument(idx, _sum, sum_slice);
-                add_1D_tensor_argument(idx, _output, in_slice);
+                add_2D_tensor_argument(idx, _input, in_slice);
+                add_2D_tensor_argument(idx, _sum, sum_slice);
+                add_2D_tensor_argument(idx, _output, in_slice);
                 enqueue(queue, *this, in_slice);
             }
-            while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
+            while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
         }
         break;
         case 1:
diff --git a/src/core/CL/kernels/CLMemsetKernel.cpp b/src/core/CL/kernels/CLMemsetKernel.cpp
index ab53897..80caf94 100644
--- a/src/core/CL/kernels/CLMemsetKernel.cpp
+++ b/src/core/CL/kernels/CLMemsetKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,27 +35,38 @@
 namespace arm_compute
 {
 CLMemsetKernel::CLMemsetKernel()
-    : ICLKernel(), _tensor(nullptr)
+    : ICLKernel(), _tensor(nullptr), _full_window()
 {
 }
 
 void CLMemsetKernel::configure(ICLTensor        *tensor,
-                               const PixelValue &constant_value)
+                               const PixelValue &constant_value,
+                               Window           *window)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
+    ARM_COMPUTE_ERROR_THROW_ON(validate(tensor->info(), constant_value, window));
+
     _tensor = tensor;
 
-    const DataType data_type      = tensor->info()->data_type();
-    const int      vec_size_x     = 16 / tensor->info()->element_size();
-    const int      output_width_x = tensor->info()->tensor_shape().x();
-    const bool     multi_access_x = (output_width_x / vec_size_x > 0);
+    const DataType data_type  = tensor->info()->data_type();
+    const int      vec_size_x = 16 / tensor->info()->element_size();
 
     // Create and update the window (if needed)
-    Window win = calculate_max_window(*tensor->info());
+    _full_window = calculate_max_window(*tensor->info());
+    Window win   = _full_window;
+    if(window != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(win, *window);
+        win = *window;
+    }
+
+    const int  output_width_x = win.num_iterations(0);
+    const bool multi_access_x = output_width_x >= vec_size_x;
+    const bool remainder_x    = output_width_x % vec_size_x > 0;
+
     if(multi_access_x)
     {
-        win.set(Window::DimX,
-                Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
+        win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
     }
     ICLKernel::configure_internal(win);
 
@@ -64,14 +75,18 @@
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
     build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(constant_value, data_type));
     build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
-    build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
+    build_opts.add_option_if(multi_access_x && remainder_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("memset", build_opts.options()));
 }
 
-Status CLMemsetKernel::validate(const ITensorInfo *tensor, const PixelValue &constant_value)
+Status CLMemsetKernel::validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *window)
 {
     ARM_COMPUTE_UNUSED(tensor);
     ARM_COMPUTE_UNUSED(constant_value);
+    if(window != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(window->x().step() != 1);
+    }
     return Status{};
 }
 
@@ -81,15 +96,15 @@
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
     // Collapse all the batches on the third dimension
-    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimY);
-    Window slice     = collapsed.first_slice_window_2D();
+    Window collapsed = window.collapse_if_possible(_full_window, Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
 
     do
     {
         unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _tensor, slice);
+        add_3D_tensor_argument(idx, _tensor, slice);
         enqueue(queue, *this, slice);
     }
-    while(collapsed.slide_window_slice_2D(slice));
+    while(collapsed.slide_window_slice_3D(slice));
 }
 } // namespace arm_compute
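The VEC_SIZE / LAST_ACCESSED_X pair configured above follows the usual vectorize-with-shifted-tail pattern: when the last vector of VEC_SIZE elements would run past the row end, its start is shifted back so the access stays in bounds (overlapping writes are harmless for a fill). A scalar restatement of the assumed device-side behaviour:

    #include <algorithm>

    // Scalar sketch: write vec_size elements per step; clamp the start of the
    // final step to last_accessed_x (= width - vec_size) to stay in bounds.
    void memset_like(float *buf, float value, int width, int vec_size)
    {
        const int last_accessed_x = std::max(width - vec_size, 0);
        for(int x = 0; x < width; x += vec_size)
        {
            const int start = std::min(x, last_accessed_x);
            for(int i = 0; i < vec_size && start + i < width; ++i)
            {
                buf[start + i] = value;
            }
        }
    }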
diff --git a/src/core/CL/kernels/CLMinMaxLayerKernel.cpp b/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
index fa7b678..92b5f8d 100644
--- a/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -127,7 +127,7 @@
     Iterator output(_output, window_output);
 
     // Reset output
-    execute_window_loop(window_output, [&](const Coordinates & id)
+    execute_window_loop(window_output, [&](const Coordinates &)
     {
         auto *ptr = reinterpret_cast<float *>(output.ptr());
         ptr[0]    = std::numeric_limits<float>::max();
diff --git a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
index 286b94e..dda9b16 100644
--- a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
+++ b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -38,8 +38,8 @@
 #include <set>
 #include <string>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
 constexpr unsigned int num_elems_processed_per_iteration = 16;
@@ -274,5 +274,141 @@
 {
     const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
     const unsigned int border        = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
-    return BorderSize(0, border, 0, 0);
+    return BorderSize{ 0, border, 0, 0 };
 }
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration_complex = 1;
+
+Status validate_arguments_complex(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 2, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 2, DataType::F32);
+
+    const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+
+    // Validate in case of configured output
+    if(output->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 2, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window_complex(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
+{
+    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
+    const TensorShape &out_shape    = broadcast_pair.first;
+    const ValidRegion &valid_region = broadcast_pair.second;
+
+    // Auto initialize output if not initialized
+    const TensorInfo out_info(out_shape, input1->num_channels(), input1->data_type());
+    auto_init_if_empty(*output, out_info);
+
+    Window win        = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration_complex));
+    Window win_input1 = win.broadcast_if_dimension_le_one(*input1);
+    Window win_input2 = win.broadcast_if_dimension_le_one(*input2);
+
+    AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration_complex);
+    AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration_complex);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration_complex);
+
+    bool window_changed = update_window_and_padding(win_input1, input1_access)
+                          || update_window_and_padding(win_input2, input2_access)
+                          || update_window_and_padding(win, output_access);
+
+    output_access.set_valid_region(win, valid_region);
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+CLComplexPixelWiseMultiplicationKernel::CLComplexPixelWiseMultiplicationKernel()
+    : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void CLComplexPixelWiseMultiplicationKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(input1->info(), input2->info(), output->info()));
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window_complex(input1->info(), input2->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
+    _input1 = input1;
+    _input2 = input2;
+    _output = output;
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("pixelwise_mul_complex"));
+
+    ICLKernel::configure_internal(win_config.second);
+}
+
+Status CLComplexPixelWiseMultiplicationKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(input1, input2, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_complex(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
+
+    return Status{};
+}
+
+void CLComplexPixelWiseMultiplicationKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const TensorShape &in_shape1 = _input1->info()->tensor_shape();
+    const TensorShape &in_shape2 = _input2->info()->tensor_shape();
+    const TensorShape &out_shape = _output->info()->tensor_shape();
+
+    bool can_collapse = true;
+    if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+    {
+        can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+        for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d)
+        {
+            can_collapse = (in_shape1[d] == in_shape2[d]);
+        }
+    }
+
+    bool   has_collapsed = false;
+    Window collapsed     = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
+
+    const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+    const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+    Window slice        = collapsed.first_slice_window_3D();
+    Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+    Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input1, slice_input1);
+        add_3D_tensor_argument(idx, _input2, slice_input2);
+        add_3D_tensor_argument(idx, _output, slice);
+        enqueue(queue, *this, slice);
+
+        collapsed.slide_window_slice_3D(slice_input1);
+        collapsed.slide_window_slice_3D(slice_input2);
+    }
+    while(collapsed.slide_window_slice_3D(slice));
+}
+
+BorderSize CLComplexPixelWiseMultiplicationKernel::border_size() const
+{
+    const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+    const unsigned int border        = std::min<unsigned int>(num_elems_processed_per_iteration_complex - 1U, replicateSize);
+    return BorderSize{ 0, border, 0, 0 };
+}
+} // namespace arm_compute
\ No newline at end of file
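
The new CLComplexPixelWiseMultiplicationKernel treats each two-channel F32 element as one complex number. The OpenCL source of "pixelwise_mul_complex" is not part of this diff; assuming the usual complex-product definition, a scalar reference of the per-element result is:

    #include <complex>

    // (a + bi) * (c + di) = (ac - bd) + (ad + bc)i
    std::complex<float> pixelwise_mul_complex(std::complex<float> lhs, std::complex<float> rhs)
    {
        return lhs * rhs;
    }
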
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
index 7081688..7ccbda9 100644
--- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -78,7 +78,6 @@
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
         TensorInfo out_info(TensorInfo(compute_pool_shape(*input, pool_info), 1, output->data_type()));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
     }
@@ -201,6 +200,17 @@
     const int pool_pad_top  = pad_stride_info.pad_top();
     const int pool_pad_left = pad_stride_info.pad_left();
 
+    // Set build options
+    CLBuildOptions build_opts;
+
+    if(is_data_type_quantized_asymmetric(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info())
+    {
+        build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(input->info()->quantization_info().offset));
+        build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().offset));
+        build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(input->info()->quantization_info().scale));
+        build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().scale));
+    }
+
     // Check output dimensions
     auto_init(input->info(), output->info(), pool_info);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info));
@@ -212,8 +222,6 @@
 
     const DataType data_type = input->info()->data_type();
 
-    // Set build options
-    CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
     build_opts.add_option("-DPOOL_" + string_from_pooling_type(pool_type));
     build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x));
@@ -222,6 +230,7 @@
     build_opts.add_option("-DPAD_Y=" + support::cpp11::to_string(pool_pad_top));
     build_opts.add_option("-DPOOL_SIZE_X=" + support::cpp11::to_string(pool_size_x));
     build_opts.add_option("-DPOOL_SIZE_Y=" + support::cpp11::to_string(pool_size_y));
+
     build_opts.add_option_if(data_type == DataType::F16, "-DFP16");
 
     // Create kernel
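
The OFFSET_IN1/SCALE_IN1/OFFSET_OUT/SCALE_OUT defines added above only take effect when input and output carry different quantization parameters. A hedged scalar sketch of the re-quantization they enable, dequantizing with the input parameters and re-quantizing with the output ones:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    uint8_t requantize(uint8_t q_in, float scale_in, int offset_in,
                       float scale_out, int offset_out)
    {
        const float real = scale_in * (static_cast<int>(q_in) - offset_in); // dequantize
        const int   q    = static_cast<int>(std::lround(real / scale_out)) + offset_out;
        return static_cast<uint8_t>(std::min(std::max(q, 0), 255));         // saturate to QASYMM8
    }
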
diff --git a/src/core/CL/kernels/CLQuantizationLayerKernel.cpp b/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
index 9028b0f..374b22e 100644
--- a/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,7 @@
 #include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
@@ -36,73 +37,76 @@
 
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, min_max);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() < 3);
-
-    if(output->tensor_shape().total_size() > 0)
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+    if((output != nullptr) && (output->total_size() != 0))
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
     }
 
     return Status{};
 }
 
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *min_max)
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
 {
+    // Configure kernel window
+    Window win = calculate_max_window(*input, Steps());
+
     // Output tensor auto initialization if not yet initialized
-    auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::U8);
+    auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::QASYMM8);
 
-    constexpr unsigned int num_elems_processed_per_iteration = 4;
+    Coordinates coord;
+    coord.set_num_dimensions(output->num_dimensions());
+    output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
 
-    // Configure window
-    Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-    AccessWindowStatic     min_max_access(min_max, 0, 0, 2, min_max->dimension(1));
-
-    // Update window and padding
-    bool window_changed = update_window_and_padding(win, input_access, output_access, min_max_access);
-
-    output_access.set_valid_region(win, input->valid_region());
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_tuple(err, win);
+    return std::make_tuple(Status{}, win);
 }
 } // namespace
 
 CLQuantizationLayerKernel::CLQuantizationLayerKernel()
-    : _input(nullptr), _output(nullptr), _min_max(nullptr)
+    : _input(nullptr), _output(nullptr)
 {
 }
 
-void CLQuantizationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *min_max)
+void CLQuantizationLayerKernel::configure(const ICLTensor *input, ICLTensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, min_max);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), min_max->info()));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
 
-    _input   = input;
-    _output  = output;
-    _min_max = min_max;
+    _input  = input;
+    _output = output;
+
+    const int  vec_size_x     = 16 / input->info()->element_size();
+    const int  input_width_x  = input->info()->tensor_shape().x();
+    const bool multi_access_x = (input_width_x / vec_size_x > 0);
+
+    // Create and update the window (if needed)
+    Window win = calculate_max_window(*input->info());
+    if(multi_access_x)
+    {
+        win.set(Window::DimX,
+                Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
+    }
+    ICLKernel::configure_internal(win);
 
     // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("quantization_layer"));
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), output->info(), min_max->info());
-
-    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-
-    ICLKernel::configure_internal(std::get<1>(win_config));
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(output->info()->quantization_info().scale));
+    build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(output->info()->quantization_info().offset));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(input_width_x - vec_size_x, 0)));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("quantization_layer", build_opts.options()));
 }
 
-Status CLQuantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+Status CLQuantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, min_max));
-    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), min_max->clone().get())));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
 
     return Status{};
 }
@@ -117,13 +121,9 @@
 
     do
     {
-        Window slice_min_max = slice.shift_dimensions(2);
-        slice_min_max.set(Window::DimX, Window::Dimension(0, 1, 1));
-
         unsigned int idx = 0;
         add_3D_tensor_argument(idx, _input, slice);
         add_3D_tensor_argument(idx, _output, slice);
-        add_1D_tensor_argument(idx, _min_max, slice_min_max);
         enqueue(queue, *this, slice);
     }
     while(window_collapsed.slide_window_slice_3D(slice));
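
Two details of the rewritten configure() are worth unpacking. The SCALE/OFFSET defines drive the affine QASYMM8 mapping q = round(x / scale) + offset, and LAST_ACCESSED_X is the usual tail trick: when the tensor width is not a multiple of VEC_SIZE, the final vector access is shifted back so it still lands in bounds. A scalar sketch under those assumptions:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    uint8_t quantize_qasymm8(float x, float scale, int offset)
    {
        const int q = static_cast<int>(std::lround(x / scale)) + offset;
        return static_cast<uint8_t>(std::min(std::max(q, 0), 255));
    }

    // Start of the last full vector access for a row of `width` elements:
    // e.g. width = 18, vec_size = 16 -> 2, so the final 16-wide access
    // covers elements [2, 18) instead of running past the end.
    int last_accessed_x(int width, int vec_size)
    {
        return std::max(width - vec_size, 0);
    }
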
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index 1f4cff3..db4850f 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -47,7 +47,14 @@
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    if(input->num_channels() == 1)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F32);
+    }
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::SUM_SQUARE && input->data_type() == DataType::QASYMM8, "Not supported reduction operation for QASYMM8");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
@@ -55,7 +62,6 @@
 
     if(output->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
         if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN)
         {
             ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8, "Not supported operation for QASYMM8");
@@ -78,7 +84,7 @@
     output_shape.set(axis, 1);
     const bool is_arg_min_max   = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX);
     DataType   output_data_type = is_arg_min_max ? DataType::U32 : input->data_type();
-    auto_init_if_empty(*output, output_shape, 1, output_data_type, input->quantization_info());
+    auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
 
     const unsigned int num_elems_processed_per_iteration = (is_data_type_quantized(input->data_type()) && (axis == 0)) ? 1 : 16;
     Window             win                               = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
@@ -161,6 +167,7 @@
     build_opts.add_option_if(op == ReductionOperation::ARG_IDX_MAX, "-DARG_MAX");
     build_opts.add_option_if(op == ReductionOperation::ARG_IDX_MIN, "-DARG_MIN");
     build_opts.add_option_if(op == ReductionOperation::PROD, "-DPROD");
+    build_opts.add_option_if(input->info()->num_channels() == 2, "-DCOMPLEX");
 
     switch(op)
     {
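
The new two-channel path (enabled by -DCOMPLEX above) treats each element as an F32 complex value; for a SUM reduction the real and imaginary channels reduce independently. A reference of what that means, with std::complex standing in for the two-channel layout:

    #include <complex>
    #include <numeric>
    #include <vector>

    std::complex<float> reduce_sum(const std::vector<std::complex<float>> &values)
    {
        // Summing complex numbers sums the two channels independently.
        return std::accumulate(values.begin(), values.end(), std::complex<float>{0.f, 0.f});
    }
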
diff --git a/src/core/CL/kernels/CLSobel5x5Kernel.cpp b/src/core/CL/kernels/CLSobel5x5Kernel.cpp
index 46aa074..be2a44b 100644
--- a/src/core/CL/kernels/CLSobel5x5Kernel.cpp
+++ b/src/core/CL/kernels/CLSobel5x5Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -138,7 +138,7 @@
 
 BorderSize CLSobel5x5VertKernel::border_size() const
 {
-    return BorderSize(2, 0);
+    return BorderSize{ 2, 0 };
 }
 
 void CLSobel5x5VertKernel::configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
diff --git a/src/core/CL/kernels/CLSobel7x7Kernel.cpp b/src/core/CL/kernels/CLSobel7x7Kernel.cpp
index 0c94e88..a4a20c1 100644
--- a/src/core/CL/kernels/CLSobel7x7Kernel.cpp
+++ b/src/core/CL/kernels/CLSobel7x7Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -141,7 +141,7 @@
 
 BorderSize CLSobel7x7VertKernel::border_size() const
 {
-    return BorderSize(3, 0);
+    return BorderSize{ 3, 0 };
 }
 
 void CLSobel7x7VertKernel::configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
diff --git a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
index 403256b..e2d9881 100644
--- a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -62,11 +62,12 @@
 
     const double beta_multiplier = std::min(
                                        1.0 * beta * input_scale * (1 << (31 - scaled_diff_int_bits)),
-                                       (1ll << 31) - 1.0);
-    int input_beta_multiplier, input_beta_left_shift;
+                                       (1LL << 31) - 1.0);
+    int input_beta_multiplier;
+    int input_beta_left_shift;
     quantization::calculate_quantized_multiplier_greater_than_one(beta_multiplier, &input_beta_multiplier, &input_beta_left_shift);
 
-    const double max_input_rescaled = 1.0 * ((1 << scaled_diff_int_bits) - 1) * (1ll << (31 - scaled_diff_int_bits)) / (1ll << input_beta_left_shift);
+    const double max_input_rescaled = 1.0 * ((1 << scaled_diff_int_bits) - 1) * (1LL << (31 - scaled_diff_int_bits)) / (1LL << input_beta_left_shift);
     const int    diff_min           = -1.f * std::floor(max_input_rescaled);
 
     CLBuildOptions build_opts;
@@ -337,7 +338,7 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output);
 
     // Note: output should always have a scale of 1/256 and offset 0
-    const QuantizationInfo allowed_quantization_info = QuantizationInfo(1.f / 256, 0);
+    const QuantizationInfo allowed_quantization_info = QuantizationInfo(1.F / 256, 0);
     const bool             is_quantized_asymmetric   = (input->info()->data_type() == DataType::S32);
     const DataType         output_data_type          = is_quantized_asymmetric ? DataType::QASYMM8 : input->info()->data_type();
 
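
For readers following the quantized softmax arithmetic above: calculate_quantized_multiplier_greater_than_one splits a real multiplier into a Q0.31 fixed-point value plus a left shift, so that multiplier ~= (quantized / 2^31) * 2^shift. A sketch of the standard decomposition (the library helper may differ in edge-case handling):

    #include <cmath>
    #include <cstdint>

    void decompose_multiplier(double multiplier, int32_t *quantized, int *left_shift)
    {
        // multiplier = mantissa * 2^shift, with mantissa in [0.5, 1)
        const double mantissa = std::frexp(multiplier, left_shift);
        auto q31 = static_cast<int64_t>(std::llround(mantissa * (1LL << 31)));
        if(q31 == (1LL << 31)) // rounding can push the mantissa up to exactly 1.0
        {
            q31 /= 2;
            ++(*left_shift);
        }
        *quantized = static_cast<int32_t>(q31);
    }
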
diff --git a/src/core/CL/kernels/CLStridedSliceKernel.cpp b/src/core/CL/kernels/CLStridedSliceKernel.cpp
index c40f3c9..5a6b958 100644
--- a/src/core/CL/kernels/CLStridedSliceKernel.cpp
+++ b/src/core/CL/kernels/CLStridedSliceKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -115,7 +115,9 @@
 
     const TensorShape &input_shape = input->info()->tensor_shape();
 
-    Coordinates starts_abs, ends_abs, final_strides;
+    Coordinates starts_abs;
+    Coordinates ends_abs;
+    Coordinates final_strides;
     std::tie(starts_abs, ends_abs, final_strides) = arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(
                                                         input_shape,
                                                         starts, ends, strides,
diff --git a/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp
index d58cef5..5f266c5 100644
--- a/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp
+++ b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp
@@ -35,6 +35,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/helpers/tensor_info.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
 #include "support/ToolchainSupport.h"
@@ -111,14 +112,16 @@
     build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(input1->info()->dimension(0)));
     build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input1->info()->element_size()));
 
-    if(is_data_type_quantized_asymmetric(input1->info()->data_type()) && input1->info()->quantization_info() != output->info()->quantization_info())
+    // If the inputs have different quantization info, set the quantization parameters needed for the re-quantization process
+    const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(output->info(), input1->info(), input2->info());
+    if(is_data_type_quantized_asymmetric(input1->info()->data_type()) && have_different_qinfo)
     {
         build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(input1->info()->quantization_info().offset));
-        build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().offset));
         build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(input1->info()->quantization_info().scale));
-        build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().scale));
         build_opts.add_option("-DOFFSET_IN2=" + float_to_string_with_full_precision(input2->info()->quantization_info().offset));
         build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(input2->info()->quantization_info().scale));
+        build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().offset));
+        build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().scale));
     }
 
     // Create kernel
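
tensors_have_different_quantization_info (pulled in via the new tensor_info.h include) replaces the old input1-vs-output comparison so that a mismatch on any tensor triggers the re-quantization defines. A self-contained sketch of that predicate, with a (scale, offset) pair standing in for the library's QuantizationInfo:

    #include <initializer_list>
    #include <utility>

    using QuantInfo = std::pair<float, int>; // (scale, offset) stand-in

    bool have_different_qinfo(const QuantInfo &out, std::initializer_list<QuantInfo> inputs)
    {
        for(const QuantInfo &in : inputs)
        {
            if(in != out)
            {
                return true; // at least one tensor needs re-quantization
            }
        }
        return false;
    }
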
diff --git a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp
index 9cbb713..54edaaf 100644
--- a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp
+++ b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp
@@ -35,6 +35,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/helpers/tensor_info.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
 #include "support/ToolchainSupport.h"
@@ -133,18 +134,20 @@
     build_opts.add_option("-DINPUT3_WIDTH=" + support::cpp11::to_string(input3->info()->dimension(0)));
     build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input1->info()->element_size()));
 
-    if(is_data_type_quantized_asymmetric(input1->info()->data_type()) && input1->info()->quantization_info() != output->info()->quantization_info())
+    // If the inputs have different quantization info, set the quantization parameters needed for the re-quantization process
+    const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(output->info(), input1->info(), input2->info(), input3->info(), input4->info());
+    if(is_data_type_quantized_asymmetric(input1->info()->data_type()) && have_different_qinfo)
     {
         build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(input1->info()->quantization_info().offset));
-        build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().offset));
         build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(input1->info()->quantization_info().scale));
-        build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().scale));
         build_opts.add_option("-DOFFSET_IN2=" + float_to_string_with_full_precision(input2->info()->quantization_info().offset));
         build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(input2->info()->quantization_info().scale));
         build_opts.add_option("-DOFFSET_IN3=" + float_to_string_with_full_precision(input3->info()->quantization_info().offset));
         build_opts.add_option("-DSCALE_IN3=" + float_to_string_with_full_precision(input3->info()->quantization_info().scale));
         build_opts.add_option("-DOFFSET_IN4=" + float_to_string_with_full_precision(input4->info()->quantization_info().offset));
         build_opts.add_option("-DSCALE_IN4=" + float_to_string_with_full_precision(input4->info()->quantization_info().scale));
+        build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().offset));
+        build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().scale));
     }
 
     // Create kernel
diff --git a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
index 84b5ea2..bf3a00d 100644
--- a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -139,7 +139,7 @@
 } // namespace
 
 CLWinogradOutputTransformKernel::CLWinogradOutputTransformKernel()
-    : _input(nullptr), _bias(nullptr), _output(nullptr)
+    : _input(nullptr), _bias(nullptr), _output(nullptr), _is_nhwc(false)
 {
 }
 
@@ -152,9 +152,10 @@
 
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), winograd_info, act_info));
 
-    _input  = input;
-    _bias   = bias;
-    _output = output;
+    _input   = input;
+    _bias    = bias;
+    _output  = output;
+    _is_nhwc = winograd_info.output_data_layout == DataLayout::NHWC;
 
     // Compute num_tiles_x
     const Size2D        input_dimensions = winograd_info.input_dimensions;
@@ -253,7 +254,7 @@
         add_1D_tensor_argument(idx1, _bias, slice_biases);
     }
 
-    if(_output->info()->data_layout() == DataLayout::NHWC)
+    if(_is_nhwc)
     {
         unsigned int idx2 = 2 * num_arguments_per_4D_tensor() + ((_bias != nullptr) ? num_arguments_per_1D_tensor() : 0);
         _kernel.setArg(idx2, static_cast<int>(_output->info()->total_size() - _output->info()->strides_in_bytes().y()));
diff --git a/src/core/CPP/kernels/CPPUpsampleKernel.cpp b/src/core/CPP/kernels/CPPUpsampleKernel.cpp
index d77d9c1..d29c0f7 100644
--- a/src/core/CPP/kernels/CPPUpsampleKernel.cpp
+++ b/src/core/CPP/kernels/CPPUpsampleKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -73,14 +73,15 @@
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
 
     // Initialize _scaled_output buffer
-    const int width_scaled  = _output->info()->dimension(0);
-    const int height_scaled = _output->info()->dimension(1);
-    const int stride_x      = _info.stride().first;
-    const int stride_y      = _info.stride().second;
-    const int start_x       = _info.pad().first;
-    const int start_y       = _inner_border.second + _info.pad().second;
-    const int end_y         = height_scaled - _info.pad().second;
-    const int end_x         = width_scaled - _inner_border.first - _info.pad().first;
+    const int    width_scaled  = _output->info()->dimension(0);
+    const int    height_scaled = _output->info()->dimension(1);
+    const int    stride_x      = _info.stride().first;
+    const int    stride_y      = _info.stride().second;
+    const int    start_x       = _info.pad().first;
+    const int    start_y       = _inner_border.second + _info.pad().second;
+    const int    end_y         = height_scaled - _info.pad().second;
+    const int    end_x         = width_scaled - _inner_border.first - _info.pad().first;
+    const size_t element_size  = _input->info()->element_size();
 
     std::fill_n(_output->buffer(), _output->info()->total_size(), 0);
 
@@ -93,9 +94,9 @@
     Iterator in(_input, window);
     Iterator out(_output, window_out);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
-        *(reinterpret_cast<float *>(out.ptr())) = *(reinterpret_cast<const float *>(in.ptr()));
+        memcpy(out.ptr(), in.ptr(), element_size);
     },
     in, out);
 }
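
Switching the inner copy from a float cast to a memcpy of element_size bytes makes the upsample type-agnostic (F32, F16 and QASYMM8 all take the same path). A standalone sketch of the scatter this kernel performs, ignoring the padding and inner-border bookkeeping handled by the real window setup:

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // `out` is pre-zeroed and out_w wide; each input element lands
    // stride_x / stride_y positions apart.
    void upsample_scatter(const uint8_t *in, uint8_t *out,
                          int in_w, int in_h, int stride_x, int stride_y,
                          int out_w, std::size_t element_size)
    {
        for(int y = 0; y < in_h; ++y)
        {
            for(int x = 0; x < in_w; ++x)
            {
                const std::size_t src = (static_cast<std::size_t>(y) * in_w + x) * element_size;
                const std::size_t dst = (static_cast<std::size_t>(y) * stride_y * out_w
                                         + static_cast<std::size_t>(x) * stride_x) * element_size;
                std::memcpy(out + dst, in + src, element_size);
            }
        }
    }
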
diff --git a/src/core/Error.cpp b/src/core/Error.cpp
index e7b4365..45cce66 100644
--- a/src/core/Error.cpp
+++ b/src/core/Error.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/core/Error.h"
 
+#include <array>
 #include <cstdarg>
 #include <cstdio>
 #include <iostream>
@@ -32,11 +33,11 @@
 
 Status arm_compute::create_error_va_list(ErrorCode error_code, const char *function, const char *file, const int line, const char *msg, va_list args)
 {
-    char out[512];
-    int  offset = snprintf(out, sizeof(out), "in %s %s:%d: ", function, file, line);
-    vsnprintf(out + offset, sizeof(out) - offset, msg, args);
+    std::array<char, 512> out{ 0 };
+    int offset = snprintf(out.data(), out.size(), "in %s %s:%d: ", function, file, line);
+    vsnprintf(out.data() + offset, out.size() - offset, msg, args);
 
-    return Status(error_code, std::string(out));
+    return Status(error_code, std::string(out.data()));
 }
 
 Status arm_compute::create_error(ErrorCode error_code, const char *function, const char *file, const int line, const char *msg, ...)
diff --git a/src/core/GLES_COMPUTE/GCKernelLibrary.cpp b/src/core/GLES_COMPUTE/GCKernelLibrary.cpp
index 25ac02e..0af8c7d 100644
--- a/src/core/GLES_COMPUTE/GCKernelLibrary.cpp
+++ b/src/core/GLES_COMPUTE/GCKernelLibrary.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -387,7 +387,7 @@
     return kernel;
 }
 
-const std::string GCKernelLibrary::preprocess_shader(const std::string &shader_source) const
+std::string GCKernelLibrary::preprocess_shader(const std::string &shader_source) const
 {
     enum class ParserStage
     {
@@ -399,7 +399,7 @@
 
     // Define a GLES compute shader parser function
     std::function<std::string(const std::string &, ParserStage, int)> cs_parser;
-    cs_parser = [&](const std::string & src, ParserStage stage, int nested_level) -> std::string
+    cs_parser = [&](const std::string & src, ParserStage stage, int) -> std::string
     {
         std::string dst;
 
diff --git a/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs b/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs
index 69ac50b..49b3954 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -53,7 +53,7 @@
     Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
     Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
 
-    float tmp = LOAD(src_ptr, TENSOR3D_OFFSET(src_iter, -OFFSET_X, -OFFSET_Y, 0));
+    float tmp = LOAD_CURRENT_ITEM(src_ptr, src_iter);
     STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp);
 }
 
@@ -66,7 +66,7 @@
     Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
     Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
 
-    uvec2 tmp = LOAD(src_ptr, TENSOR3D_OFFSET(src_iter, -OFFSET_X, -OFFSET_Y, 0));
+    uvec2 tmp = LOAD_CURRENT_ITEM(src_ptr, src_iter);
     STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp);
 }
 #endif /*DATA_TYPE_FP16*/
diff --git a/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp
index 36d1b29..6f70efe 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -38,28 +38,18 @@
 using namespace arm_compute;
 
 GCDepthConcatenateLayerKernel::GCDepthConcatenateLayerKernel()
-    : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0), _depth_offset(0)
+    : _input(nullptr), _output(nullptr), _depth_offset(0)
 {
 }
-
-BorderSize GCDepthConcatenateLayerKernel::border_size() const
-{
-    return BorderSize(_top_bottom, _left_right);
-}
-
 void GCDepthConcatenateLayerKernel::configure(const IGCTensor *input, unsigned int depth_offset, IGCTensor *output)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2));
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0));
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1));
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(3, input, output);
 
-    // The gaps between the two lowest dimensions of input and output need to be divisible by 2
-    // Otherwise it is not clear how the padding should be added onto the input tensor
-    ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2);
-    ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2);
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(Window::DimX) != output->info()->dimension(Window::DimX));
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(Window::DimY) != output->info()->dimension(Window::DimY));
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2));
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(3, input, output);
 
     _input        = input;
     _output       = output;
@@ -73,35 +63,20 @@
     build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
     build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
 
-    // Configure kernel window
-    _left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2;
-    _top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2;
-
-    build_opts.emplace("#define OFFSET_X " + support::cpp11::to_string(_left_right));
-    build_opts.emplace("#define OFFSET_Y " + support::cpp11::to_string(_top_bottom));
-
     // Create kernel
     _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("concatenate_depth", build_opts));
 
     unsigned int num_elems_processed_per_iteration = 1;
-    unsigned int num_elems_read_per_iteration      = 1;
-    if(input->info()->data_type() == DataType::F32)
-    {
-        num_elems_processed_per_iteration = 1;
-        num_elems_read_per_iteration      = 1;
-    }
-    else if(input->info()->data_type() == DataType::F16)
+    if(input->info()->data_type() == DataType::F16)
     {
         num_elems_processed_per_iteration = 4;
-        num_elems_read_per_iteration      = 4;
     }
-    const unsigned int num_rows_read_per_iteration = 1;
 
     // The window needs to be based on the input as we copy all the depths of the input
     Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
     win.set(Window::DimZ, Window::Dimension(0, input->info()->tensor_shape().z(), 1));
 
-    AccessWindowRectangle  input_access(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration);
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
     AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
     update_window_and_padding(win, input_access, output_access);
     output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
@@ -118,11 +93,9 @@
 
     _output->set_needs_shifting(true);
 
-    Window slice     = window.first_slice_window_3D();
     Window slice_in  = window.first_slice_window_3D();
     Window slice_out = window.first_slice_window_3D();
 
-    slice.shift(Window::DimX, -(_output->info()->padding()).left);
     slice_out.set(Window::DimZ, Window::Dimension(_depth_offset));
 
     do
@@ -133,7 +106,7 @@
 
         _kernel.update_shader_params();
 
-        enqueue(*this, slice);
+        enqueue(*this, slice_in);
     }
-    while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_in));
+    while(window.slide_window_slice_3D(slice_in));
 }
diff --git a/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp
index f225ebd..50171a1 100644
--- a/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -333,7 +333,10 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
 
-    unsigned int pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y = 0;
+    unsigned int pool_pad_x;
+    unsigned int pool_pad_y;
+    unsigned int pool_stride_x;
+    unsigned int pool_stride_y;
     std::tie(pool_pad_x, pool_pad_y)       = _pool_info.pad_stride_info().pad();
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
 
diff --git a/src/core/HOGInfo.cpp b/src/core/HOGInfo.cpp
index 4f99455..bfee12c 100644
--- a/src/core/HOGInfo.cpp
+++ b/src/core/HOGInfo.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -68,24 +68,24 @@
 {
     ARM_COMPUTE_ERROR_ON(_cell_size.width == 0 || _cell_size.height == 0);
 
-    return Size2D(_block_size.width / _cell_size.width,
-                  _block_size.height / _cell_size.height);
+    return Size2D{ _block_size.width / _cell_size.width,
+                   _block_size.height / _cell_size.height };
 }
 
 Size2D HOGInfo::num_cells_per_block_stride() const
 {
     ARM_COMPUTE_ERROR_ON(_cell_size.width == 0 || _cell_size.height == 0);
 
-    return Size2D(_block_stride.width / _cell_size.width,
-                  _block_stride.height / _cell_size.height);
+    return Size2D{ _block_stride.width / _cell_size.width,
+                   _block_stride.height / _cell_size.height };
 }
 
 Size2D HOGInfo::num_block_positions_per_image(const Size2D &image_size) const
 {
     ARM_COMPUTE_ERROR_ON(_block_stride.width == 0 || _block_stride.height == 0);
 
-    return Size2D(((image_size.width - _block_size.width) / _block_stride.width) + 1,
-                  ((image_size.height - _block_size.height) / _block_stride.height) + 1);
+    return Size2D{ ((image_size.width - _block_size.width) / _block_stride.width) + 1,
+                   ((image_size.height - _block_size.height) / _block_stride.height) + 1 };
 }
 
 const Size2D &HOGInfo::cell_size() const
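
A worked example of the geometry above: with a 16x16 block, an 8x8 cell and an 8x8 block stride, num_cells_per_block() is (2, 2), and a 64x128 image yields the classic 7 x 15 = 105 block positions:

    #include <cstddef>
    #include <utility>

    std::pair<std::size_t, std::size_t> block_positions(std::size_t img_w, std::size_t img_h,
                                                        std::size_t block_w, std::size_t block_h,
                                                        std::size_t stride_w, std::size_t stride_h)
    {
        return { (img_w - block_w) / stride_w + 1,   // (64 - 16) / 8 + 1 = 7
                 (img_h - block_h) / stride_h + 1 }; // (128 - 16) / 8 + 1 = 15
    }
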
diff --git a/src/core/ITensor.cpp b/src/core/ITensor.cpp
index e6c80e8..7cf04b5 100644
--- a/src/core/ITensor.cpp
+++ b/src/core/ITensor.cpp
@@ -64,7 +64,7 @@
 
     const size_t line_size = src_info->element_size() * src_info->dimension(0);
 
-    execute_window_loop(win_src, [&](const Coordinates & id)
+    execute_window_loop(win_src, [&](const Coordinates &)
     {
         memcpy(dst_it.ptr(), src_it.ptr(), line_size);
     },
diff --git a/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp b/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp
index e0c2891..62285e0 100644
--- a/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp
+++ b/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -49,7 +49,7 @@
     Iterator input2(in2, window);
     Iterator output(out, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const uint8x16_t input1_val = vld1q_u8(input1.ptr());
         const uint8x16_t input2_val = vld1q_u8(input2.ptr());
@@ -78,7 +78,7 @@
     Iterator input2(in2, window);
     Iterator output(out, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         int16x8x2_t input1_val = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
         int16x8x2_t input2_val = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
@@ -93,7 +93,7 @@
     Iterator input2(in2, window);
     Iterator output(out, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const uint8x16_t  input1_val = vld1q_u8(input1.ptr());
         const int16x8x2_t input2_val =
diff --git a/src/core/NEON/kernels/NEAccumulateKernel.cpp b/src/core/NEON/kernels/NEAccumulateKernel.cpp
index dae0800..d601adc 100644
--- a/src/core/NEON/kernels/NEAccumulateKernel.cpp
+++ b/src/core/NEON/kernels/NEAccumulateKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -126,7 +126,7 @@
     const float16x8_t scale_val  = vdupq_n_f16(1.f - _alpha);
     const float16x8_t scale_val2 = vdupq_n_f16(_alpha);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         fp16::acc_we_v16_u8(input.ptr(), accum.ptr(), scale_val, scale_val2);
     },
@@ -271,7 +271,7 @@
     Iterator input(_input, window);
     Iterator accum(_output, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         acc_v16_u8(input.ptr(), accum.ptr());
     },
@@ -314,7 +314,7 @@
     const float32x4_t scale_val  = vdupq_n_f32(1.f - _alpha);
     const float32x4_t scale_val2 = vdupq_n_f32(_alpha);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         acc_we_v16_u8(input.ptr(), accum.ptr(), scale_val, scale_val2);
     },
@@ -353,7 +353,7 @@
     Iterator input(_input, window);
     Iterator accum(_output, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         acc_sq_v16_u8(input.ptr(), _shift, accum.ptr());
     },
diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
index b67396c..8de8db9 100644
--- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
@@ -195,7 +195,7 @@
     const auto a       = static_cast<T>(_act_info.a());
     const auto b       = static_cast<T>(_act_info.b());
 
-    execute_window_loop(win_collapsed, [&](const Coordinates & id)
+    execute_window_loop(win_collapsed, [&](const Coordinates &)
     {
         const auto input_ptr  = reinterpret_cast<const T *>(input.ptr());
         const auto output_ptr = reinterpret_cast<T *>(output.ptr());
@@ -319,6 +319,7 @@
     const qasymm8_t        b        = sqcvt_qasymm8_f32(_act_info.b(), qi_in.scale, qi_in.offset);
     const qasymm8_t        const_0  = sqcvt_qasymm8_f32(0.f, qi_in.scale, qi_in.offset);
     const qasymm8x16_t     vconst_0 = vdupq_n_u8(const_0);
+    const auto             vconst_1 = vdupq_n_f32(1.f);
 
     // Initialise scale/offset for re-quantization
     float       s  = qi_in.scale / qi_out.scale;
@@ -326,7 +327,7 @@
     float32x4_t vs = vdupq_n_f32(s);
     float32x4_t vo = vdupq_n_f32(o);
 
-    execute_window_loop(win_collapsed, [&](const Coordinates & id)
+    execute_window_loop(win_collapsed, [&](const Coordinates &)
     {
         const auto input_ptr  = reinterpret_cast<const T *>(input.ptr());
         const auto output_ptr = reinterpret_cast<T *>(output.ptr());
@@ -361,41 +362,20 @@
             }
             else if(act == ActivationFunction::LOGISTIC)
             {
-                const auto scale_in  = vdupq_n_f32(qi_in.scale);
-                const auto off_in    = vdupq_n_f32(qi_in.offset);
-                const auto scale_out = vdupq_n_f32(qi_out.scale);
-                const auto off_out   = vdupq_n_f32(qi_out.offset);
-                const auto vconst_1  = vdupq_n_f32(1.f);
-
-                const auto vin_low        = wrapper::vgetlow(vin);
-                const auto vin_high       = wrapper::vgethigh(vin);
-                uint16x8_t vin_low_u16x8  = wrapper::vmovl(vin_low);
-                uint16x8_t vin_high_u16x8 = wrapper::vmovl(vin_high);
-                // Convert uint16 vectors to uint32 vectors
-                uint32x4_t A_u32x4 = wrapper::vmovl(wrapper::vgetlow(vin_low_u16x8));
-                uint32x4_t B_u32x4 = wrapper::vmovl(wrapper::vgethigh(vin_low_u16x8));
-                uint32x4_t C_u32x4 = wrapper::vmovl(wrapper::vgetlow(vin_high_u16x8));
-                uint32x4_t D_u32x4 = wrapper::vmovl(wrapper::vgethigh(vin_high_u16x8));
-                // Convert uint32 vectors to float32 vectors
-                float32x4_t A_f32x4 = wrapper::vmul(wrapper::vsub(vcvtq_f32_u32(A_u32x4), off_in), scale_in);
-                float32x4_t B_f32x4 = wrapper::vmul(wrapper::vsub(vcvtq_f32_u32(B_u32x4), off_in), scale_in);
-                float32x4_t C_f32x4 = wrapper::vmul(wrapper::vsub(vcvtq_f32_u32(C_u32x4), off_in), scale_in);
-                float32x4_t D_f32x4 = wrapper::vmul(wrapper::vsub(vcvtq_f32_u32(D_u32x4), off_in), scale_in);
+                // De-quantize
+                const auto vin_deq = vdequantize(vin, qi_in);
                 // Perform activation
-                A_f32x4 = wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(A_f32x4))));
-                B_f32x4 = wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(B_f32x4))));
-                C_f32x4 = wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(C_f32x4))));
-                D_f32x4 = wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(D_f32x4))));
-                // Convert float32 vectors to uint32 vectors
-                A_u32x4 = vcvtq_u32_f32(wrapper::vadd(wrapper::vdiv(A_f32x4, scale_out), off_out));
-                B_u32x4 = vcvtq_u32_f32(wrapper::vadd(wrapper::vdiv(B_f32x4, scale_out), off_out));
-                C_u32x4 = vcvtq_u32_f32(wrapper::vadd(wrapper::vdiv(C_f32x4, scale_out), off_out));
-                D_u32x4 = vcvtq_u32_f32(wrapper::vadd(wrapper::vdiv(D_f32x4, scale_out), off_out));
-                // Convert uint32 vectors to uint16 vectors (with saturation)
-                vin_low_u16x8  = wrapper::vcombine(wrapper::vqmovn(A_u32x4), wrapper::vqmovn(B_u32x4));
-                vin_high_u16x8 = wrapper::vcombine(wrapper::vqmovn(C_u32x4), wrapper::vqmovn(D_u32x4));
-                // convert uint16 vectors to uint8 vectors (with saturation)
-                tmp = wrapper::vcombine(wrapper::vqmovn(vin_low_u16x8), wrapper::vqmovn(vin_high_u16x8));
+                const float32x4x4_t tmp_dep =
+                {
+                    {
+                        wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))),
+                        wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))),
+                        wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))),
+                        wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))),
+                    }
+                };
+                // Re-quantize to new output space
+                tmp = vquantize(tmp_dep, qi_out);
             }
             else
             {
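
The rewritten LOGISTIC branch above is the vdequantize / activate / vquantize pattern applied four float32x4 lanes at a time. Its scalar equivalent, for reference (rounding details may differ slightly from the NEON path):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    uint8_t logistic_qasymm8(uint8_t q, float scale_in, int offset_in,
                             float scale_out, int offset_out)
    {
        const float x = scale_in * (static_cast<int>(q) - offset_in); // dequantize
        const float y = 1.f / (1.f + std::exp(-x));                   // logistic
        const int   r = static_cast<int>(std::lround(y / scale_out)) + offset_out;
        return static_cast<uint8_t>(std::min(std::max(r, 0), 255));   // re-quantize
    }
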
diff --git a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
index ffa578f..ca79a0a 100644
--- a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
@@ -84,7 +84,7 @@
         Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
         Iterator output(out, win);
 
-        execute_window_loop(win, [&](const Coordinates & id)
+        execute_window_loop(win, [&](const Coordinates &)
         {
             const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
             const auto output_ptr              = reinterpret_cast<T *>(output.ptr());
@@ -120,7 +120,7 @@
         Iterator input2(in2, input2_win);
         Iterator output(out, win);
 
-        execute_window_loop(win, [&](const Coordinates & id)
+        execute_window_loop(win, [&](const Coordinates &)
         {
             const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
             const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
@@ -165,8 +165,8 @@
     const auto window_end_x          = static_cast<int>(window.x().end());
     const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);
 
-    const float output_scale    = out->info()->quantization_info().scale;
-    const int   output_offset   = out->info()->quantization_info().offset;
+    const float output_scale  = out->info()->quantization_info().scale;
+    const int   output_offset = out->info()->quantization_info().offset;
 
     const float32x4_t vscale1    = vdupq_n_f32(in1->info()->quantization_info().scale);
     const float32x4_t vscale2    = vdupq_n_f32(in2->info()->quantization_info().scale);
@@ -192,7 +192,7 @@
         Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
         Iterator output(out, win);
 
-        execute_window_loop(win, [&](const Coordinates & id)
+        execute_window_loop(win, [&](const Coordinates &)
         {
             const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
             const auto output_ptr              = reinterpret_cast<uint8_t *>(output.ptr());
@@ -234,7 +234,7 @@
                         vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
                         vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
                         vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
-#else //__aarch64__
+#else  //__aarch64__
                         vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
                         vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
                         vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
@@ -252,7 +252,7 @@
             for(; x < window_end_x; ++x)
             {
                 const float afs   = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale;
-                *(output_ptr + x) = out->info()->quantization_info().quantize((afs + bfs),RoundingPolicy::TO_NEAREST_UP);
+                *(output_ptr + x) = out->info()->quantization_info().quantize((afs + bfs), RoundingPolicy::TO_NEAREST_UP);
             }
         },
         broadcast_input, non_broadcast_input, output);
@@ -270,7 +270,7 @@
         Iterator input2(in2, input2_win);
         Iterator output(out, win);
 
-        execute_window_loop(win, [&](const Coordinates & id)
+        execute_window_loop(win, [&](const Coordinates &)
         {
             const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
             const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
@@ -311,7 +311,7 @@
                         vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
                         vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
                         vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
-#else //__aarch64__
+#else  //__aarch64__
                         vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
                         vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
                         vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
@@ -330,7 +330,7 @@
             {
                 const float afs   = static_cast<int32_t>((*(input1_ptr + x)) - input1_qinfo.offset) * input1_qinfo.scale;
                 const float bfs   = static_cast<int32_t>((*(input2_ptr + x)) - input2_qinfo.offset) * input2_qinfo.scale;
-                *(output_ptr + x) = out->info()->quantization_info().quantize((afs + bfs),RoundingPolicy::TO_NEAREST_UP);
+                *(output_ptr + x) = out->info()->quantization_info().quantize((afs + bfs), RoundingPolicy::TO_NEAREST_UP);
             }
         },
         input1, input2, output);
@@ -357,7 +357,7 @@
     const auto window_start_x = static_cast<int>(window.x().start());
     const auto window_end_x   = static_cast<int>(window.x().end());
 
-    execute_window_loop(win, [&](const Coordinates & id)
+    execute_window_loop(win, [&](const Coordinates &)
     {
         const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
         const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
@@ -427,7 +427,7 @@
     const auto window_start_x = static_cast<int>(window.x().start());
     const auto window_end_x   = static_cast<int>(window.x().end());
 
-    execute_window_loop(win, [&](const Coordinates & id)
+    execute_window_loop(win, [&](const Coordinates &)
     {
         const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
         const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
diff --git a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
index ff8fb84..45e1562 100644
--- a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,6 +27,7 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
 #include "arm_compute/core/NEON/NEFixedPoint.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
@@ -54,7 +55,7 @@
     Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
     Iterator output(out, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const uint8x16_t ta1 = vld1q_u8(input1.ptr());
         const uint8x16_t ta2 = vld1q_u8(input2.ptr());
@@ -70,7 +71,7 @@
     Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
     Iterator output(out, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const uint8x16_t ta1 = vld1q_u8(input1.ptr());
         const uint8x16_t ta2 = vld1q_u8(input2.ptr());
@@ -80,13 +81,41 @@
     input1, input2, output);
 }
 
+void sub_saturate_QASYMM8_QASYMM8_QASYMM8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
+    Iterator output(out, window);
+
+    execute_window_loop(window, [&](const Coordinates &)
+    {
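+        // Dequantize both inputs to float with their own quantization info,
+        // subtract in fp32, then requantize with the output's quantization info;
+        // vquantize saturates the result to the representable QASYMM8 range.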
+        const float32x4x4_t ta1 = vdequantize(vld1q_u8(reinterpret_cast<const qasymm8_t *>(input1.ptr())), in1->info()->quantization_info());
+        const float32x4x4_t ta2 = vdequantize(vld1q_u8(reinterpret_cast<const qasymm8_t *>(input2.ptr())), in2->info()->quantization_info());
+
+        const float32x4x4_t ta3 =
+        {
+            {
+                vsubq_f32(ta1.val[0], ta2.val[0]),
+                vsubq_f32(ta1.val[1], ta2.val[1]),
+                vsubq_f32(ta1.val[2], ta2.val[2]),
+                vsubq_f32(ta1.val[3], ta2.val[3]),
+            }
+        };
+
+        const uint8x16_t result = vquantize(ta3, out->info()->quantization_info());
+
+        vst1q_u8(reinterpret_cast<qasymm8_t *>(output.ptr()), result);
+    },
+    input1, input2, output);
+}
+
 void sub_wrap_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
     Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
     Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
     Iterator output(out, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const int16x8x2_t ta1 = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
         const int16x8x2_t ta2 = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
@@ -110,7 +139,7 @@
     Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
     Iterator output(out, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const int16x8x2_t ta1 = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
         const int16x8x2_t ta2 = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
@@ -150,7 +179,7 @@
     Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
     Iterator output(out, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const float16x8x2_t a = vld2q_f16(reinterpret_cast<const float16_t *>(input1.ptr()));
         const float16x8x2_t b = vld2q_f16(reinterpret_cast<const float16_t *>(input2.ptr()));
@@ -173,7 +202,7 @@
     Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
     Iterator output(out, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const float32x4x4_t ta1 = vld4q_f32(reinterpret_cast<const float *>(input1.ptr()));
         const float32x4x4_t ta2 = vld4q_f32(reinterpret_cast<const float *>(input2.ptr()));
@@ -198,7 +227,7 @@
     Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
     Iterator output(out, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const uint8x16_t bv_0 = vld1q_u8(input2.ptr());
         int16x8_t        a1_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
@@ -219,7 +248,7 @@
     Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
     Iterator output(out, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const uint8x16_t bv_0 = vld1q_u8(input2.ptr());
         int16x8_t        a1_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
@@ -240,7 +269,7 @@
     Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
     Iterator output(out, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const uint8x16_t bv_0 = vld1q_u8(input1.ptr());
         int16x8_t        a1_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
@@ -261,7 +290,7 @@
     Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
     Iterator output(out, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const uint8x16_t bv_0 = vld1q_u8(input1.ptr());
         int16x8_t        a1_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
@@ -282,7 +311,7 @@
     Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
     Iterator output(out, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const uint8x16_t av_0 = vld1q_u8(input1.ptr());
         const uint8x16_t bv_0 = vld1q_u8(input2.ptr());
@@ -304,7 +333,7 @@
     Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
     Iterator output(out, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const uint8x16_t av_0 = vld1q_u8(input1.ptr());
         const uint8x16_t bv_0 = vld1q_u8(input2.ptr());
@@ -324,18 +353,34 @@
 {
     ARM_COMPUTE_UNUSED(policy);
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input1);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
 
     const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
 
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8)
+        && !(input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8)
+        && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::S16)
+        && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8)
+        && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::S16)
+        && !(input1.data_type() == DataType::F32 && input2.data_type() == DataType::F32)
+        && !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16),
+        "You called subtract with the wrong image formats");
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8 && policy == ConvertPolicy::WRAP,
+        "Convert policy cannot be WRAP if datatype is QASYMM8");
+
     // Validate in case of configured output
     if(output.total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(
             !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::U8)
+            && !(input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8 && output.data_type() == DataType::QASYMM8)
             && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
             && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
             && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
@@ -413,6 +458,7 @@
         { "sub_wrap_U8_U8_S16", &sub_wrap_U8_U8_S16 },
         { "sub_saturate_U8_U8_U8", &sub_saturate_U8_U8_U8 },
         { "sub_saturate_U8_U8_S16", &sub_saturate_U8_U8_S16 },
+        { "sub_saturate_QASYMM8_QASYMM8_QASYMM8", &sub_saturate_QAYSMM8_QAYSMM8_QAYSMM8 },
         { "sub_wrap_U8_S16_S16", &sub_wrap_U8_S16_S16 },
         { "sub_wrap_S16_U8_S16", &sub_wrap_S16_U8_S16 },
         { "sub_saturate_U8_S16_S16", &sub_saturate_U8_S16_S16 },
@@ -469,5 +515,5 @@
 {
     const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
     const unsigned int border        = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
-    return BorderSize(0, border, 0, 0);
+    return BorderSize{ 0, border, 0, 0 };
 }
\ No newline at end of file
diff --git a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp
new file mode 100644
index 0000000..6211abc
--- /dev/null
+++ b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEBatchToSpaceLayerKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include <arm_neon.h>
+#include <cstdint>
+
+using namespace arm_compute::misc::shape_calculator;
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_info, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_info, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_info, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+
+    // Validate output if initialized
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+    ARM_COMPUTE_RETURN_ERROR_ON(block_shape_x <= 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(block_shape_y <= 0);
+
+    const DataLayout data_layout = input->data_layout();
+    const int        idx_batch   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] % (block_shape_x * block_shape_y) != 0);
+    // Validate output if initialized
+    if(output->total_size() != 0)
+    {
+        const int idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+        const int idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+        const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+        ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != (block_shape_x * input->tensor_shape()[idx_width]));
+        ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != (block_shape_y * input->tensor_shape()[idx_height]));
+        ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_channel] != input->tensor_shape()[idx_channel]);
+        ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+} // namespace
+
+NEBatchToSpaceLayerKernel::NEBatchToSpaceLayerKernel()
+    : _input(nullptr), _block_shape(nullptr), _output(nullptr), _block_shape_x(), _block_shape_y()
+{
+}
+
+void NEBatchToSpaceLayerKernel::configure(const ITensor *input, const ITensor *block_shape, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), output->info()));
+
+    _input       = input;
+    _block_shape = block_shape;
+    _output      = output;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input->info(), Steps());
+    ICPPKernel::configure(win);
+}
+
+void NEBatchToSpaceLayerKernel::configure(const ITensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    TensorShape output_shape = compute_batch_to_space_shape(input->info(), block_shape_x, block_shape_y);
+    // Output auto-initialization if not yet initialized
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info()));
+
+    _input         = input;
+    _output        = output;
+    _block_shape_x = block_shape_x;
+    _block_shape_y = block_shape_y;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input->info(), Steps());
+    ICPPKernel::configure(win);
+}
+
+Status NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_shape, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, output));
+    return Status{};
+}
+
+Status NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const int32_t block_shape_x, const int32_t block_shape_y, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, output));
+    return Status{};
+}
+
+void NEBatchToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
+
+    if(_block_shape != nullptr)
+    {
+        // Retrieve the block shapes dynamically
+        _block_shape_x = *(reinterpret_cast<const int *>(_block_shape->ptr_to_element(0)));
+        _block_shape_y = *(reinterpret_cast<const int *>(_block_shape->ptr_to_element(1)));
+    }
+
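+    // Every group of (_block_shape_x * _block_shape_y) input batches folds back
+    // into one output batch, so r is the number of output batches.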
+    const int batch_size   = _input->info()->dimension(3);
+    const int r            = (batch_size / (_block_shape_x * _block_shape_y));
+    const int element_size = _input->info()->element_size();
+
+    Window slice_in  = window.first_slice_window_3D();
+    Window slice_out = window.first_slice_window_4D();
+
+    // The slice_out slice does not move
+    slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+    slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+    slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+    slice_out.set(3, Window::Dimension(0, 0, 0));
+
+    int batch_id = 0;
+    // Main loop for NCHW and NHWC
+    if(_input->info()->data_layout() == DataLayout::NCHW)
+    {
+        do
+        {
+            Iterator in(_input, slice_in);
+            execute_window_loop(slice_in, [&](const Coordinates & id)
+            {
+                const int x = id.x();
+                const int y = id.y();
+                const int z = id.z();
+
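+                // Output batch w comes from batch_id % r; the remaining factor
+                // batch_id / r selects this batch's (x, y) offset inside each
+                // _block_shape_x * _block_shape_y block of the output.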
+                const int   w     = batch_id % r;
+                const int   out_x = x * _block_shape_x + (batch_id / r) % _block_shape_x;
+                const int   out_y = y * _block_shape_y + (batch_id / r) / _block_shape_x;
+                Coordinates output_coords{ out_x, out_y, z, w };
+                memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
+            },
+            in);
+            ++batch_id;
+        }
+        while(window.slide_window_slice_3D(slice_in));
+    }
+    else
+    {
+        do
+        {
+            Iterator in(_input, slice_in);
+            execute_window_loop(slice_in, [&](const Coordinates & id)
+            {
+                const int z = id.x();
+                const int x = id.y();
+                const int y = id.z();
+
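+                // Same batch-to-block mapping as the NCHW loop; only the
+                // coordinate order differs, with channels in dimension 0.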
+                const int   w     = batch_id % r;
+                const int   out_x = x * _block_shape_x + (batch_id / r) % _block_shape_x;
+                const int   out_y = y * _block_shape_y + (batch_id / r) / _block_shape_x;
+                Coordinates output_coords{ z, out_x, out_y, w };
+                memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
+            },
+            in);
+            ++batch_id;
+        }
+        while(window.slide_window_slice_3D(slice_in));
+    }
+}
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
index ed83286..71312a9 100644
--- a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -106,7 +106,7 @@
     Iterator input2(_input2, window);
     Iterator output(_output, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         bitwise_and<uint8_t>(input1.ptr(), input2.ptr(), output.ptr());
     },
diff --git a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
index 08d7fe2..5791dcc 100644
--- a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -89,7 +89,7 @@
     Iterator input(_input, window);
     Iterator output(_output, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         bitwise_not_U8_U8(input.ptr(), output.ptr());
     },
diff --git a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
index 1b17cc2..8aed9bb 100644
--- a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -102,7 +102,7 @@
     Iterator input2(_input2, window);
     Iterator output(_output, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         bitwise_or_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr());
     },
diff --git a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
index 9451e8a..e2dcb95 100644
--- a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -98,7 +98,7 @@
     Iterator input2(_input2, window);
     Iterator output(_output, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         bitwise_xor_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr());
     },
diff --git a/src/core/NEON/kernels/NEBox3x3Kernel.cpp b/src/core/NEON/kernels/NEBox3x3Kernel.cpp
index 0c97005..7a53f93 100644
--- a/src/core/NEON/kernels/NEBox3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEBox3x3Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -49,7 +49,7 @@
 
     const float16x8_t oneovernine = vdupq_n_f16(1.0f / 9.0f);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
         const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
@@ -160,7 +160,7 @@
 
     const float32x4_t oneovernine = vdupq_n_f32(1.0f / 9.0f);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
         const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
diff --git a/src/core/NEON/kernels/NECannyEdgeKernel.cpp b/src/core/NEON/kernels/NECannyEdgeKernel.cpp
index fa51a7b..8d822bd 100644
--- a/src/core/NEON/kernels/NECannyEdgeKernel.cpp
+++ b/src/core/NEON/kernels/NECannyEdgeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -950,7 +950,7 @@
     Iterator magnitude(_magnitude, window);
     Iterator phase(_phase, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         (*_func)(gx.ptr(), gy.ptr(), magnitude.ptr(), phase.ptr());
     },
@@ -1034,7 +1034,7 @@
     const size_t input1_stride        = _magnitude->info()->strides_in_bytes()[1];
     const size_t input1_stride_ushort = input1_stride / data_size_from_type(_magnitude->info()->data_type());
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         (*_func)(magnitude.ptr(), phase.ptr(), output.ptr(), input1_stride_ushort, _lower_thr, _upper_thr);
     },
@@ -1113,7 +1113,7 @@
     const size_t input_stride  = _input->info()->strides_in_bytes()[1];
     const size_t output_stride = _output->info()->strides_in_bytes()[1];
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         edge_trace_U8_U8(input.ptr(), output.ptr(), input_stride, output_stride);
     },
diff --git a/src/core/NEON/kernels/NEChannelCombineKernel.cpp b/src/core/NEON/kernels/NEChannelCombineKernel.cpp
index 28fb4bd..539154d 100644
--- a/src/core/NEON/kernels/NEChannelCombineKernel.cpp
+++ b/src/core/NEON/kernels/NEChannelCombineKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -286,7 +286,7 @@
     Iterator p2(_planes[2], win);
     Iterator out(_output, win);
 
-    execute_window_loop(win, [&](const Coordinates & id)
+    execute_window_loop(win, [&](const Coordinates &)
     {
         const auto p0_ptr  = static_cast<uint8_t *>(p0.ptr());
         const auto p1_ptr  = static_cast<uint8_t *>(p1.ptr());
@@ -315,7 +315,7 @@
     Iterator p3(_planes[3], win);
     Iterator out(_output, win);
 
-    execute_window_loop(win, [&](const Coordinates & id)
+    execute_window_loop(win, [&](const Coordinates &)
     {
         const auto p0_ptr  = static_cast<uint8_t *>(p0.ptr());
         const auto p1_ptr  = static_cast<uint8_t *>(p1.ptr());
@@ -353,7 +353,7 @@
 
     constexpr auto shift = is_uyvy ? 1 : 0;
 
-    execute_window_loop(win, [&](const Coordinates & id)
+    execute_window_loop(win, [&](const Coordinates &)
     {
         const auto p0_ptr  = static_cast<uint8_t *>(p0.ptr());
         const auto p1_ptr  = static_cast<uint8_t *>(p1.ptr());
@@ -409,7 +409,7 @@
     // Increase step size after iterator is created to calculate stride correctly for multi channel format
     out_win.set_dimension_step(Window::DimX, out_win.x().step() * _x_subsampling[1]);
 
-    execute_window_loop(out_win, [&](const Coordinates & id)
+    execute_window_loop(out_win, [&](const Coordinates &)
     {
         const uint8x8x2_t pixels =
         {
@@ -444,7 +444,7 @@
     Iterator in(_planes[plane_id], tmp_win);
     Iterator out(_output_multi->plane(plane_id), tmp_win);
 
-    execute_window_loop(tmp_win, [&](const Coordinates & id)
+    execute_window_loop(tmp_win, [&](const Coordinates &)
     {
         const uint8x8_t pixels = vld1_u8(in.ptr());
 
diff --git a/src/core/NEON/kernels/NEChannelExtractKernel.cpp b/src/core/NEON/kernels/NEChannelExtractKernel.cpp
index 98b2f28..61e1304 100644
--- a/src/core/NEON/kernels/NEChannelExtractKernel.cpp
+++ b/src/core/NEON/kernels/NEChannelExtractKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -192,7 +192,7 @@
     Iterator in(_input, win);
     Iterator out(_output, win);
 
-    execute_window_loop(win, [&](const Coordinates & id)
+    execute_window_loop(win, [&](const Coordinates &)
     {
         const auto in_ptr  = static_cast<uint8_t *>(in.ptr());
         const auto out_ptr = static_cast<uint8_t *>(out.ptr());
@@ -207,7 +207,7 @@
     Iterator in(_input, win);
     Iterator out(_output, win);
 
-    execute_window_loop(win, [&](const Coordinates & id)
+    execute_window_loop(win, [&](const Coordinates &)
     {
         const auto in_ptr  = static_cast<uint8_t *>(in.ptr());
         const auto out_ptr = static_cast<uint8_t *>(out.ptr());
@@ -222,7 +222,7 @@
     Iterator in(_input, win);
     Iterator out(_output, win);
 
-    execute_window_loop(win, [&](const Coordinates & id)
+    execute_window_loop(win, [&](const Coordinates &)
     {
         const auto in_ptr  = static_cast<uint8_t *>(in.ptr());
         const auto out_ptr = static_cast<uint8_t *>(out.ptr());
@@ -242,7 +242,7 @@
     Iterator in(_input, win);
     Iterator out(_output, win_out);
 
-    execute_window_loop(win, [&](const Coordinates & id)
+    execute_window_loop(win, [&](const Coordinates &)
     {
         const auto in_ptr  = static_cast<uint8_t *>(in.ptr());
         const auto out_ptr = static_cast<uint8_t *>(out.ptr());
diff --git a/src/core/NEON/kernels/NEConvolutionKernel.cpp b/src/core/NEON/kernels/NEConvolutionKernel.cpp
index 0a10546..b154340 100644
--- a/src/core/NEON/kernels/NEConvolutionKernel.cpp
+++ b/src/core/NEON/kernels/NEConvolutionKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -317,7 +317,7 @@
 template <unsigned int matrix_size>
 BorderSize             NEConvolutionKernel<matrix_size>::border_size() const
 {
-    return BorderSize(matrix_size / 2);
+    return BorderSize{ matrix_size / 2 };
 }
 
 template <unsigned int matrix_size>
@@ -388,7 +388,7 @@
     const unsigned char *input_mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, 0));
     const unsigned char *input_low_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, 1));
 
-    execute_window_loop(win, [&](const Coordinates & id)
+    execute_window_loop(win, [&](const Coordinates &)
     {
         int32x4_t out  = vdupq_n_s32(0);
         int32x4_t out2 = vdupq_n_s32(0);
@@ -437,7 +437,7 @@
     const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 1));
     const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 2));
 
-    execute_window_loop(win, [&](const Coordinates & id)
+    execute_window_loop(win, [&](const Coordinates &)
     {
         int32x4_t out  = vdupq_n_s32(0);
         int32x4_t out2 = vdupq_n_s32(0);
@@ -496,7 +496,7 @@
     const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 2));
     const unsigned char *input_low3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 3));
 
-    execute_window_loop(win, [&](const Coordinates & id)
+    execute_window_loop(win, [&](const Coordinates &)
     {
         int32x4_t out  = vdupq_n_s32(0);
         int32x4_t out2 = vdupq_n_s32(0);
@@ -565,7 +565,7 @@
     const unsigned char *input_low3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 3));
     const unsigned char *input_low4_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 4));
 
-    execute_window_loop(win, [&](const Coordinates & id)
+    execute_window_loop(win, [&](const Coordinates &)
     {
         int32x4_t out  = vdupq_n_s32(0);
         int32x4_t out2 = vdupq_n_s32(0);
@@ -728,7 +728,7 @@
     Iterator input(_input, win_in);
     Iterator output(_output, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const uint8x16_t data = vld1q_u8(input.ptr());
 
@@ -761,7 +761,7 @@
     Iterator input(_input, win_in);
     Iterator output(_output, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const uint8x16_t data = vld1q_u8(input.ptr());
 
@@ -794,7 +794,7 @@
     Iterator input(_input, win_in);
     Iterator output(_output, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const uint8x16_t data = vld1q_u8(input.ptr());
 
@@ -840,7 +840,7 @@
     Iterator input(_input, win_in);
     Iterator output(_output, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const uint8x16_t data = vld1q_u8(input.ptr());
 
@@ -875,7 +875,7 @@
     Iterator input(_input, win_in);
     Iterator output(_output, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const uint8x16_t data = vld1q_u8(input.ptr());
 
@@ -910,7 +910,7 @@
     Iterator input(_input, win_in);
     Iterator output(_output, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const uint8x16_t data = vld1q_u8(input.ptr());
 
@@ -962,7 +962,7 @@
     Iterator input(_input, win_in);
     Iterator output(_output, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const uint8x16_t data = vld1q_u8(input.ptr());
 
@@ -999,7 +999,7 @@
     Iterator input(_input, win_in);
     Iterator output(_output, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const uint8x16_t data = vld1q_u8(input.ptr());
 
@@ -1036,7 +1036,7 @@
     Iterator input(_input, win_in);
     Iterator output(_output, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const uint8x16_t data = vld1q_u8(input.ptr());
 
@@ -1096,7 +1096,7 @@
 template <unsigned int matrix_size>
 BorderSize             NESeparableConvolutionVertKernel<matrix_size>::border_size() const
 {
-    return BorderSize(matrix_size / 2, 0);
+    return BorderSize{ matrix_size / 2, 0 };
 }
 
 template <unsigned int matrix_size>
@@ -1209,7 +1209,7 @@
         input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
     }
 
-    execute_window_loop(win, [&](const Coordinates & id)
+    execute_window_loop(win, [&](const Coordinates &)
     {
         uint16x8_t out0 = vdupq_n_u16(0);
         uint16x8_t out1 = vdupq_n_u16(0);
@@ -1275,7 +1275,7 @@
         input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
     }
 
-    execute_window_loop(win, [&](const Coordinates & id)
+    execute_window_loop(win, [&](const Coordinates &)
     {
         int16x8_t out0 = vdupq_n_s16(0);
         int16x8_t out1 = vdupq_n_s16(0);
@@ -1343,7 +1343,7 @@
 
     const int32x4_t zero = vdupq_n_s32(0);
 
-    execute_window_loop(win, [&](const Coordinates & id)
+    execute_window_loop(win, [&](const Coordinates &)
     {
         int32x4x2_t out0 =
         {
@@ -1576,7 +1576,7 @@
         input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i));
     }
 
-    execute_window_loop(win, [&](const Coordinates & id)
+    execute_window_loop(win, [&](const Coordinates &)
     {
         int32x4_t out  = vdupq_n_s32(0);
         int32x4_t out2 = vdupq_n_s32(0);
diff --git a/src/core/NEON/kernels/NECopyKernel.cpp b/src/core/NEON/kernels/NECopyKernel.cpp
index 20496ad..4722c05 100644
--- a/src/core/NEON/kernels/NECopyKernel.cpp
+++ b/src/core/NEON/kernels/NECopyKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,7 +50,6 @@
 Status NECopyKernel::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     return Status{};
 }
@@ -71,7 +70,7 @@
         Iterator input_it(_input, out_slice);
         Iterator output_it(_output, out_slice);
 
-        execute_window_loop(out_slice, [&](const Coordinates & id)
+        execute_window_loop(out_slice, [&](const Coordinates &)
         {
             memcpy(output_it.ptr(), input_it.ptr(), _output->info()->dimension(0) * _output->info()->element_size());
         },
diff --git a/src/core/NEON/kernels/NECropKernel.cpp b/src/core/NEON/kernels/NECropKernel.cpp
new file mode 100644
index 0000000..f16eb3e
--- /dev/null
+++ b/src/core/NEON/kernels/NECropKernel.cpp
@@ -0,0 +1,400 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NECropKernel.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Window.h"
+
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/helpers/bit_ops.h"
+#include "arm_compute/core/utils/helpers/tensor_transform.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include <map>
+
+namespace arm_compute
+{
+namespace
+{
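+// load_as_f32 loads four elements of type T and widens them to float32x4_t;
+// the unspecialized template only exists to trap unsupported types at runtime.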
+template <typename T>
+inline float32x4_t load_as_f32(T *ptr)
+{
+    ARM_COMPUTE_UNUSED(ptr);
+    ARM_COMPUTE_ERROR("Type not supported.");
+}
+
+template <>
+inline float32x4_t load_as_f32(float *ptr)
+{
+    return wrapper::vloadq(ptr);
+}
+
+template <>
+inline float32x4_t load_as_f32(int32_t *ptr)
+{
+    return vcvtq_f32_s32(wrapper::vloadq(ptr));
+}
+
+template <>
+inline float32x4_t load_as_f32(uint32_t *ptr)
+{
+    return vcvtq_f32_u32(wrapper::vloadq(ptr));
+}
+
+template <>
+inline float32x4_t load_as_f32(int16_t *ptr)
+{
+    return vcvtq_f32_s32(vmovl_s16(wrapper::vload(ptr)));
+}
+
+template <>
+inline float32x4_t load_as_f32(uint16_t *ptr)
+{
+    return vcvtq_f32_u32(vmovl_u16(wrapper::vload(ptr)));
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <>
+inline float32x4_t load_as_f32(float16_t *ptr)
+{
+    return vcvt_f32_f16(wrapper::vload(ptr));
+}
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+template <typename T, bool input_has_single_channel, bool is_width_flipped>
+inline void in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
+                                  int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit)
+{
+    // Reverse elements if width flipped.
+    if(is_width_flipped)
+    {
+        // Collapse first dimension if possible.
+        if(input_has_single_channel)
+        {
+            int32_t     x = output_width_start;
+            Coordinates negative_offset(input_offset);
+            negative_offset.set(1, negative_offset[1] - window_step_x + 1);
+            for(; x <= output_width_limit - window_step_x; x += window_step_x, negative_offset[1] -= window_step_x)
+            {
+                auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(negative_offset)));
+
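+                // vrev64 reverses the two floats within each 64-bit half; swapping
+                // the halves with vcombine then reverses the whole 4-float vector.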
+                in = wrapper::vrev64(in);
+                in = wrapper::vcombine(wrapper::vgethigh(in), wrapper::vgetlow(in));
+
+                wrapper::vstore(output_ptr + x, in);
+            }
+            input_offset[1] = negative_offset[1] + window_step_x - 1;
+            for(; x < output_width_limit; ++x, --input_offset[1])
+            {
+                *(output_ptr + x) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
+            }
+        }
+        else
+        {
+            for(int32_t x = output_width_start; x < output_width_limit; ++x, --input_offset[1])
+            {
+                input_offset.set(0, 0);
+                int32_t c = 0;
+                for(; c <= static_cast<int32_t>(input->info()->dimension(0)) - window_step_x; c += window_step_x, input_offset[0] += window_step_x)
+                {
+                    auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
+                    wrapper::vstore(output_ptr + x * output->info()->dimension(0) + c, in);
+                }
+                for(; c < static_cast<int32_t>(input->info()->dimension(0)); ++c, ++input_offset[0])
+                {
+                    *(output_ptr + x * output->info()->dimension(0) + c) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
+                }
+            }
+        }
+    }
+    else
+    {
+        // Use memcpy if the elements don't need converting to float.
+        if(std::is_same<T, float>::value)
+        {
+            memcpy(static_cast<void *>(output_ptr + output_width_start * output->info()->dimension(0)),
+                   reinterpret_cast<const void *>(input->ptr_to_element(input_offset)),
+                   (output_width_limit - output_width_start) * output->info()->dimension(0) * output->info()->element_size());
+        }
+        else
+        {
+            int32_t x                = 0;
+            int32_t limit            = (output_width_limit - output_width_start) * static_cast<int32_t>(output->info()->dimension(0));
+            float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0);
+            for(; x <= limit - window_step_x; x += window_step_x, input_offset[0] += window_step_x)
+            {
+                auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
+                wrapper::vstore(output_start_ptr + x, in);
+            }
+            for(; x < limit; ++x, ++input_offset[0])
+            {
+                *(output_start_ptr + x) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
+            }
+        }
+    }
+}
+
+inline void out_of_bounds_crop_window(const ITensor *output, float *output_ptr, float extrapolation_value,
+                                      int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit)
+{
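+    // Fill a run of output elements with the extrapolation value: vector stores
+    // first, then a scalar loop for the tail.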
+    auto    in               = wrapper::vdup_n(extrapolation_value, wrapper::traits::vector_128_tag());
+    int32_t x                = 0;
+    int32_t limit            = (output_width_limit - output_width_start) * static_cast<int32_t>(output->info()->dimension(0));
+    float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0);
+    for(; x <= limit - window_step_x; x += window_step_x)
+    {
+        wrapper::vstore(output_start_ptr + x, in);
+    }
+    for(; x < limit; ++x)
+    {
+        *(output_start_ptr + x) = extrapolation_value;
+    }
+}
+
+template <bool is_height_flipped, bool has_cols_in_bounds, bool has_cols_out_of_bounds_before, bool has_cols_out_of_bounds_after>
+inline void execute_window(const ITensor *input, const ITensor *output, Coordinates input_offset, float extrapolation_value,
+                           const std::array<uint32_t, 2> &rows_out_of_bounds, const std::array<uint32_t, 2> &cols_out_of_bounds, NECropKernel::InBoundsCropFunction *in_bounds_crop_function)
+{
+    // Output is always float.
+    const int window_step_x = 16 / sizeof(float);
+    auto     *output_ptr    = reinterpret_cast<float *>(output->buffer());
+    //  Output window:
+    //  --------------------------------
+    //  |          Out of bounds       |
+    //  |          rows before         |
+    //  |------------------------------|
+    //  | Out of | In         | Out of |
+    //  | bounds | bounds     | bounds |
+    //  | cols   | elements   | cols   |
+    //  | before | copied     | after  |
+    //  |        | from input |        |
+    //  --------------------------------
+    //  |        Out of bounds         |
+    //  |        rows after            |
+    //  |------------------------------|
+    // Fill all output rows that have no elements that are within the input bounds with the extrapolation value.
+    // First for the rows before the in bounds rows.
+    out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, rows_out_of_bounds[0] * output->info()->dimension(1));
+    output_ptr += rows_out_of_bounds[0] * output->info()->dimension(1) * output->info()->dimension(0);
+    // Iterate through each row that has any elements within the input bounds.
+    for(uint32_t row = rows_out_of_bounds[0]; static_cast<int32_t>(row) < static_cast<int32_t>(output->info()->dimension(2) - rows_out_of_bounds[1]);
+        ++row, is_height_flipped ? --input_offset[2] : ++input_offset[2])
+    {
+        // Fill all elements in the row that are out of bounds with the extrapolation value.
+        // First for the elements before the in bounds elements.
+        if(has_cols_out_of_bounds_before)
+        {
+            out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, cols_out_of_bounds[0]);
+        }
+        // Copy all elements within the input bounds from the input tensor.
+        if(has_cols_in_bounds)
+        {
+            (*in_bounds_crop_function)(input, output, output_ptr, input_offset, window_step_x, cols_out_of_bounds[0], output->info()->dimension(1) - cols_out_of_bounds[1]);
+        }
+        // Fill all elements after the in bounds elements with the extrapolation value.
+        if(has_cols_out_of_bounds_after)
+        {
+            out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, output->info()->dimension(1) - cols_out_of_bounds[1], output->info()->dimension(1));
+        }
+        output_ptr += output->info()->dimension(1) * output->info()->dimension(0);
+    }
+    // Fill all rows after the in bounds elements with the extrapolation value.
+    out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, rows_out_of_bounds[1] * output->info()->dimension(1));
+}
+} // namespace
+
+NECropKernel::NECropKernel()
+    : _input(nullptr), _crop_boxes(nullptr), _box_ind(nullptr), _output(nullptr), _start(), _end(), _crop_box_ind(0), _extrapolation_value(0), _rows_out_of_bounds(), _cols_out_of_bounds(),
+      _in_bounds_crop_functions(), _in_bounds_crop_function(nullptr), _crop_function(nullptr)
+{
+}
+
+void NECropKernel::configure(const ITensor *input, const ITensor *crop_boxes, const ITensor *box_ind, ITensor *output, uint32_t crop_box_ind, float extrapolation_value)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), crop_boxes->info(), box_ind->info(), output->info(), crop_box_ind, extrapolation_value));
+
+    _input               = input;
+    _crop_boxes          = crop_boxes;
+    _box_ind             = box_ind;
+    _output              = output;
+    _crop_box_ind        = crop_box_ind;
+    _extrapolation_value = extrapolation_value;
+
+    const static std::map<std::pair<DataType, bool>, std::pair<NECropKernel::InBoundsCropFunction *, NECropKernel::InBoundsCropFunction *>> in_map_function =
+    {
+        { { DataType::F32, false }, { &in_bounds_crop_window<float, false, false>, &in_bounds_crop_window<float, false, true> } },
+        { { DataType::F32, true }, { &in_bounds_crop_window<float, true, false>, &in_bounds_crop_window<float, true, true> } },
+        { { DataType::U16, false }, { &in_bounds_crop_window<uint16_t, false, false>, &in_bounds_crop_window<uint16_t, false, true> } },
+        { { DataType::U16, true }, { &in_bounds_crop_window<uint16_t, true, false>, &in_bounds_crop_window<uint16_t, true, true> } },
+        { { DataType::S16, false }, { &in_bounds_crop_window<int16_t, false, false>, &in_bounds_crop_window<int16_t, false, true> } },
+        { { DataType::S16, true }, { &in_bounds_crop_window<int16_t, true, false>, &in_bounds_crop_window<int16_t, true, true> } },
+        { { DataType::U32, false }, { &in_bounds_crop_window<uint32_t, false, false>, &in_bounds_crop_window<uint32_t, false, true> } },
+        { { DataType::U32, true }, { &in_bounds_crop_window<uint32_t, true, false>, &in_bounds_crop_window<uint32_t, true, true> } },
+        { { DataType::S32, false }, { &in_bounds_crop_window<int32_t, false, false>, &in_bounds_crop_window<int32_t, false, true> } },
+        { { DataType::S32, true }, { &in_bounds_crop_window<int32_t, true, false>, &in_bounds_crop_window<int32_t, true, true> } },
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        { { DataType::F16, false }, { &in_bounds_crop_window<float16_t, false, false>, &in_bounds_crop_window<float16_t, false, true> } },
+        { { DataType::F16, true }, { &in_bounds_crop_window<float16_t, true, false>, &in_bounds_crop_window<float16_t, true, true> } }
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+    };
+
+    auto in_it = in_map_function.find({ input->info()->data_type(), input->info()->dimension(0) == 1 });
+
+    if(in_it != in_map_function.end())
+    {
+        _in_bounds_crop_functions = in_it->second;
+    }
+}
+
+Status NECropKernel::validate(const ITensorInfo *input, const ITensorInfo *crop_boxes, const ITensorInfo *box_ind, const ITensorInfo *output, uint32_t crop_box_ind, float extrapolation_value)
+{
+    ARM_COMPUTE_UNUSED(extrapolation_value);
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::S16, DataType::F16, DataType::U32, DataType::S32, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().num_dimensions() > 4);
+    ARM_COMPUTE_RETURN_ERROR_ON(crop_boxes->tensor_shape()[0] != 4);
+    ARM_COMPUTE_RETURN_ERROR_ON(crop_boxes->tensor_shape()[1] != box_ind->tensor_shape()[0]);
+    ARM_COMPUTE_RETURN_ERROR_ON(crop_boxes->tensor_shape()[1] <= crop_box_ind);
+    ARM_COMPUTE_RETURN_ERROR_ON(box_ind->tensor_shape()[0] <= crop_box_ind);
+    if(output->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() != 3);
+        ARM_COMPUTE_RETURN_ERROR_ON(output->has_padding());
+    }
+    return Status{};
+}
+
+void NECropKernel::configure_output_shape()
+{
+    // _crop_box_ind is used to index _crop_boxes and retrieve the appropriate crop box.
+    // The crop box is specified by normalized coordinates [y0, x0, y1, x1].
+    const float x0 = *reinterpret_cast<const float *>(_crop_boxes->ptr_to_element(Coordinates(1, _crop_box_ind)));
+    const float y0 = *reinterpret_cast<const float *>(_crop_boxes->ptr_to_element(Coordinates(0, _crop_box_ind)));
+    const float x1 = *reinterpret_cast<const float *>(_crop_boxes->ptr_to_element(Coordinates(3, _crop_box_ind)));
+    const float y1 = *reinterpret_cast<const float *>(_crop_boxes->ptr_to_element(Coordinates(2, _crop_box_ind)));
+    // The normalized coordinates are scaled to retrieve the floating point image coordinates, which are rounded to integers.
+    _start = Coordinates(std::floor(x0 * (_input->info()->tensor_shape()[1] - 1) + 0.5f),
+                         std::floor(y0 * (_input->info()->tensor_shape()[2] - 1) + 0.5f));
+    _end = Coordinates(std::floor(x1 * (_input->info()->tensor_shape()[1] - 1) + 0.5f),
+                       std::floor(y1 * (_input->info()->tensor_shape()[2] - 1) + 0.5f));
+    const TensorShape out_shape(_input->info()->tensor_shape()[0], abs(_end[0] - _start[0]) + 1, abs(_end[1] - _start[1]) + 1);
+    _output->info()->set_tensor_shape(out_shape);
+
+    _in_bounds_crop_function = _start[0] <= _end[0] ? _in_bounds_crop_functions.first : _in_bounds_crop_functions.second;
+
+    bool is_width_flipped  = _end[0] < _start[0];
+    bool is_height_flipped = _end[1] < _start[1];
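+    // Count the leading ([0]) and trailing ([1]) output rows/columns that fall
+    // entirely outside the input, clamping each count to the output extent.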
+    if(is_height_flipped)
+    {
+        _rows_out_of_bounds[0] = _start[1] >= static_cast<int32_t>(_input->info()->dimension(2)) ? std::min(static_cast<uint32_t>(_start[1] - _input->info()->dimension(2) + 1),
+                                                                                                            static_cast<uint32_t>(_output->info()->dimension(2))) :
+                                 0;
+        _rows_out_of_bounds[1] = _end[1] < 0 ? std::min(static_cast<uint32_t>(-_end[1]),
+                                                        static_cast<uint32_t>(_output->info()->dimension(2))) :
+                                 0;
+    }
+    else
+    {
+        _rows_out_of_bounds[0] = _start[1] < 0 ? std::min(static_cast<uint32_t>(-_start[1]),
+                                                          static_cast<uint32_t>(_output->info()->dimension(2))) :
+                                 0;
+        _rows_out_of_bounds[1] = _end[1] >= static_cast<int32_t>(_input->info()->dimension(2)) ? std::min(static_cast<uint32_t>(_end[1] - _input->info()->dimension(2) + 1),
+                                                                                                          static_cast<uint32_t>(_output->info()->dimension(2))) :
+                                 0;
+    }
+    if(is_width_flipped)
+    {
+        _cols_out_of_bounds[0] = _start[0] >= static_cast<int32_t>(_input->info()->dimension(1)) ? std::min(static_cast<uint32_t>(_start[0] - _input->info()->dimension(1) + 1),
+                                                                                                            static_cast<uint32_t>(_output->info()->dimension(1))) :
+                                 0;
+        _cols_out_of_bounds[1] = _end[0] < 0 ? std::min(static_cast<uint32_t>(-_end[0]),
+                                                        static_cast<uint32_t>(_output->info()->dimension(1))) :
+                                 0;
+    }
+    else
+    {
+        _cols_out_of_bounds[0] = _start[0] < 0 ? std::min(static_cast<uint32_t>(-_start[0]),
+                                                          static_cast<uint32_t>(_output->info()->dimension(1))) :
+                                 0;
+        _cols_out_of_bounds[1] = _end[0] >= static_cast<int32_t>(_input->info()->dimension(1)) ? std::min(static_cast<uint32_t>(_end[0] - _input->info()->dimension(1) + 1),
+                                                                                                          static_cast<uint32_t>(_output->info()->dimension(1))) :
+                                 0;
+    }
+
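+    // Select the execute_window specialization matching (is_height_flipped,
+    // has_cols_in_bounds, has_cols_out_of_bounds_before, has_cols_out_of_bounds_after).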
+    const static std::map<std::tuple<bool, bool, bool, bool>, NECropKernel::CropFunction *> map_function =
+    {
+        { std::make_tuple(false, false, false, false), &execute_window<false, false, false, false> },
+        { std::make_tuple(false, false, false, true), &execute_window<false, false, false, true> },
+        { std::make_tuple(false, false, true, false), &execute_window<false, false, true, false> },
+        { std::make_tuple(false, false, true, true), &execute_window<false, false, true, true> },
+        { std::make_tuple(false, true, false, false), &execute_window<false, true, false, false> },
+        { std::make_tuple(false, true, false, true), &execute_window<false, true, false, true> },
+        { std::make_tuple(false, true, true, false), &execute_window<false, true, true, false> },
+        { std::make_tuple(false, true, true, true), &execute_window<false, true, true, true> },
+        { std::make_tuple(true, false, false, false), &execute_window<true, false, false, false> },
+        { std::make_tuple(true, false, false, true), &execute_window<true, false, false, true> },
+        { std::make_tuple(true, false, true, false), &execute_window<true, false, true, false> },
+        { std::make_tuple(true, false, true, true), &execute_window<true, false, true, true> },
+        { std::make_tuple(true, true, false, false), &execute_window<true, true, false, false> },
+        { std::make_tuple(true, true, false, true), &execute_window<true, true, false, true> },
+        { std::make_tuple(true, true, true, false), &execute_window<true, true, true, false> },
+        { std::make_tuple(true, true, true, true), &execute_window<true, true, true, true> },
+    };
+
+    auto it = map_function.find(std::make_tuple(is_height_flipped,
+                                                _cols_out_of_bounds[0] + _cols_out_of_bounds[1] < _output->info()->dimension(1),
+                                                _cols_out_of_bounds[0] > 0,
+                                                _cols_out_of_bounds[1] > 0));
+
+    if(it != map_function.end())
+    {
+        _crop_function = it->second;
+    }
+
+    INEKernel::configure(calculate_max_window(*_output->info()));
+}
+
+void NECropKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(window, info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+    ARM_COMPUTE_ERROR_ON(_input->info()->has_padding());
+    ARM_COMPUTE_ERROR_ON(_output->info()->has_padding());
+
+    uint32_t    batch_index = *(reinterpret_cast<int32_t *>(_box_ind->ptr_to_element(Coordinates(_crop_box_ind))));
+    Coordinates input_offset(0, _end[0] < _start[0] ? _start[0] - _cols_out_of_bounds[0] : _start[0] + _cols_out_of_bounds[0],
+                             _end[1] < _start[1] ? _start[1] - _rows_out_of_bounds[0] : _start[1] + _rows_out_of_bounds[0], batch_index);
+    (*_crop_function)(_input, _output, input_offset, _extrapolation_value, _rows_out_of_bounds, _cols_out_of_bounds, _in_bounds_crop_function);
+}
+} // namespace arm_compute
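
The out-of-bounds bookkeeping above is easier to follow in isolation. A minimal sketch, using a hypothetical standalone helper that is not part of the library: per axis, index 0 counts the output elements written before the first in-bounds input coordinate and index 1 those written after the last one, both clamped to the output extent. The booleans derived from these counts (together with the flip flags) then select the matching execute_window<> instantiation from the sixteen-entry map above.

    #include <algorithm>
    #include <array>
    #include <cstdint>

    // Hypothetical helper mirroring the per-axis logic above. start/end are the
    // (possibly flipped) crop coordinates, in_dim the input extent on that axis
    // and out_dim the output extent.
    std::array<uint32_t, 2> out_of_bounds(int32_t start, int32_t end, int32_t in_dim, uint32_t out_dim)
    {
        const bool flipped = end < start; // iteration runs from start down to end
        // Output elements written before the first in-bounds input coordinate:
        const uint32_t lead = flipped ? (start >= in_dim ? std::min(static_cast<uint32_t>(start - in_dim + 1), out_dim) : 0)
                                      : (start < 0 ? std::min(static_cast<uint32_t>(-start), out_dim) : 0);
        // Output elements written after the last in-bounds input coordinate:
        const uint32_t trail = flipped ? (end < 0 ? std::min(static_cast<uint32_t>(-end), out_dim) : 0)
                                       : (end >= in_dim ? std::min(static_cast<uint32_t>(end - in_dim + 1), out_dim) : 0);
        return { lead, trail };
    }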
diff --git a/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
index 8352c94..b360e9e 100644
--- a/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
@@ -42,18 +42,13 @@
 namespace
 {
 template <typename T>
-void depth_concat(const ITensor *in, ITensor *out, std::pair<int, int> start_xy, int depth_offset, const Window &window)
+void depth_concat(const ITensor *in, ITensor *out, int depth_offset, const Window &window)
 {
-    const int start_x = start_xy.first;
-    const int start_y = start_xy.second;
-
     // Offset input
-    const int input_offset_to_first_elements_in_bytes = in->info()->offset_first_element_in_bytes() - start_x * in->info()->strides_in_bytes()[0] - start_y * in->info()->strides_in_bytes()[1];
-    uint8_t *input_ptr                               = in->buffer() + input_offset_to_first_elements_in_bytes;
+    uint8_t *input_ptr = in->buffer() + in->info()->offset_first_element_in_bytes();
 
     // Offset output
-    const unsigned int output_offset_to_first_elements_in_bytes = out->info()->offset_first_element_in_bytes() + depth_offset * out->info()->strides_in_bytes()[2];
-    uint8_t           *output_ptr                               = out->buffer() + output_offset_to_first_elements_in_bytes;
+    uint8_t *output_ptr = out->buffer() + out->info()->offset_first_element_in_bytes() + depth_offset * out->info()->strides_in_bytes()[2];
 
     Iterator input(in, window);
     Iterator output(out, window);
@@ -88,19 +83,13 @@
 {
     ARM_COMPUTE_UNUSED(depth_offset);
 
-    // Configure kernel window
-    const int left_right = (output->dimension(0) - input->dimension(0)) / 2;
-    const int top_bottom = (output->dimension(1) - input->dimension(1)) / 2;
-
     const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
-    const unsigned int num_elems_read_per_iteration      = 16 / input->element_size();
-    const unsigned int num_rows_read_per_iteration       = 1;
 
     // The window is computed on the output, with its Z extent clamped below to the input depth, since all input depths are copied
     Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
     win.set(Window::DimZ, Window::Dimension(0, input->tensor_shape().z(), 1));
 
-    AccessWindowRectangle  input_access(input, -left_right, -top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration);
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
     AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
     bool                   window_changed = update_window_and_padding(win, input_access, output_access);
     output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
@@ -116,30 +105,20 @@
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
 
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimX) != output->dimension(Window::DimX));
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimY) != output->dimension(Window::DimY));
     ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) + depth_offset > output->dimension(2));
-    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) > output->dimension(0));
-    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) > output->dimension(1));
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(3, input, output);
 
-    // The gaps between the two lowest dimensions of input and output need to be divisible by 2
-    // Otherwise it is not clear how the padding should be added onto the input tensor
-    ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) - input->dimension(0)) % 2);
-    ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(1) - input->dimension(1)) % 2);
-
     return Status{};
 }
 } // namespace
 
 NEDepthConcatenateLayerKernel::NEDepthConcatenateLayerKernel()
-    : _func(nullptr), _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0), _depth_offset(0)
+    : _func(nullptr), _input(nullptr), _output(nullptr), _depth_offset(0)
 {
 }
 
-BorderSize NEDepthConcatenateLayerKernel::border_size() const
-{
-    return BorderSize(_top_bottom, _left_right);
-}
-
 void NEDepthConcatenateLayerKernel::configure(const ITensor *input, unsigned int depth_offset, ITensor *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -149,8 +128,6 @@
     _input        = input;
     _output       = output;
     _depth_offset = depth_offset;
-    _left_right   = (output->info()->dimension(0) - input->info()->dimension(0)) / 2;
-    _top_bottom   = (output->info()->dimension(1) - input->info()->dimension(1)) / 2;
 
     switch(input->info()->data_type())
     {
@@ -190,5 +167,5 @@
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
     ARM_COMPUTE_ERROR_ON(_func == nullptr);
 
-    (*_func)(_input, _output, std::make_pair(_left_right, _top_bottom), _depth_offset, window);
+    (*_func)(_input, _output, _depth_offset, window);
 }
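
With the X/Y offsetting removed (the kernel now requires matching X/Y dimensions, as validated above), placing an input inside the concatenated output reduces to a single stride multiply. A minimal sketch of the pointer arithmetic, assuming byte strides:

    #include <cstddef>
    #include <cstdint>

    // Destination of an input tensor inside the concatenated output: the whole
    // input is copied verbatim, starting depth_offset planes along Z.
    inline uint8_t *concat_dst(uint8_t *out_buffer, size_t offset_first_element_in_bytes,
                               size_t stride_z_in_bytes, unsigned int depth_offset)
    {
        return out_buffer + offset_first_element_in_bytes + depth_offset * stride_z_in_bytes;
    }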
diff --git a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
index 5433755..cbc90a0 100644
--- a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -148,7 +148,7 @@
                     const float32x4_t scale  = vdupq_n_f32(_input->info()->quantization_info().scale);
                     const int32x4_t   offset = vdupq_n_s32(_input->info()->quantization_info().offset);
 
-                    execute_window_loop(window, [&](const Coordinates & id)
+                    execute_window_loop(window, [&](const Coordinates &)
                     {
                         const uint8x16_t   texels_u8 = vld1q_u8(input.ptr());
                         const uint16x8x2_t texels_u16 =
@@ -184,7 +184,7 @@
                     const float16x8_t scale  = vdupq_n_f16(static_cast<float16_t>(_input->info()->quantization_info().scale));
                     const int16x8_t   offset = vdupq_n_s16(static_cast<int16_t>(_input->info()->quantization_info().offset));
 
-                    execute_window_loop(window, [&](const Coordinates & id)
+                    execute_window_loop(window, [&](const Coordinates &)
                     {
                         const uint8x16_t  texels_u8 = vld1q_u8(input.ptr());
                         const int16x8x2_t texels_s16 =
@@ -216,7 +216,7 @@
                 case DataType::S16:
                 {
                     /* Up-conversion U8 -> S16 */
-                    execute_window_loop(window, [&](const Coordinates & id)
+                    execute_window_loop(window, [&](const Coordinates &)
                     {
                         const uint8x16_t texels_u8 = vld1q_u8(input.ptr());
 
@@ -237,7 +237,7 @@
                 case DataType::S32:
                 {
                     /* Up-conversion U8 -> S32 */
-                    execute_window_loop(window, [&](const Coordinates & id)
+                    execute_window_loop(window, [&](const Coordinates &)
                     {
                         const uint8x16_t texels_u8 = vld1q_u8(input.ptr());
 
@@ -260,7 +260,7 @@
                 case DataType::U16:
                 {
                     /* Up-conversion U8 -> U16 */
-                    execute_window_loop(window, [&](const Coordinates & id)
+                    execute_window_loop(window, [&](const Coordinates &)
                     {
                         const uint8x16_t texels_u8 = vld1q_u8(input.ptr());
 
@@ -294,7 +294,7 @@
                     /* Down-conversion S16 -> U8 */
                     if(ConvertPolicy::SATURATE == _policy)
                     {
-                        execute_window_loop(window, [&](const Coordinates & id)
+                        execute_window_loop(window, [&](const Coordinates &)
                         {
                             const int16x8x2_t texels =
                             {
@@ -310,7 +310,7 @@
                     }
                     else
                     {
-                        execute_window_loop(window, [&](const Coordinates & id)
+                        execute_window_loop(window, [&](const Coordinates &)
                         {
                             const int16x8x2_t texels =
                             {
@@ -332,7 +332,7 @@
                     const int32x4_t b = vdupq_n_s32(_shift);
 
                     /* Up-conversion S16 -> S32 */
-                    execute_window_loop(window, [&](const Coordinates & id)
+                    execute_window_loop(window, [&](const Coordinates &)
                     {
                         const int16x8x2_t texels =
                         {
@@ -376,7 +376,7 @@
                     /* Down-conversion U16 -> U8 */
                     if(ConvertPolicy::SATURATE == _policy)
                     {
-                        execute_window_loop(window, [&](const Coordinates & id)
+                        execute_window_loop(window, [&](const Coordinates &)
                         {
                             const uint16x8x2_t texels =
                             {
@@ -392,7 +392,7 @@
                     }
                     else
                     {
-                        execute_window_loop(window, [&](const Coordinates & id)
+                        execute_window_loop(window, [&](const Coordinates &)
                         {
                             const uint16x8x2_t texels =
                             {
@@ -413,7 +413,7 @@
                     const int32x4_t b = vdupq_n_s32(_shift);
 
                     /* Up-conversion U16 -> U32 */
-                    execute_window_loop(window, [&](const Coordinates & id)
+                    execute_window_loop(window, [&](const Coordinates &)
                     {
                         const uint16x8x2_t texels =
                         {
@@ -504,7 +504,7 @@
                     const int32x4_t   zero_val_vec = vdupq_n_s32(0);
 
                     /* Down-conversion F32 -> QASYMM8 */
-                    execute_window_loop(window, [&](const Coordinates & id)
+                    execute_window_loop(window, [&](const Coordinates &)
                     {
                         const float32x4x4_t texels =
                         {
@@ -535,7 +535,7 @@
                     const float32x4_t scale = vdupq_n_f32(1.f / (1 << _shift));
 
                     /* Down-conversion F32 -> F16 */
-                    execute_window_loop(window, [&](const Coordinates & id)
+                    execute_window_loop(window, [&](const Coordinates &)
                     {
                         const float32x4x4_t texels =
                         {
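
The change repeated throughout this file drops the name of the unused Coordinates argument; an unnamed parameter keeps the signature execute_window_loop expects while silencing unused-parameter warnings. A minimal standalone illustration of the idiom:

    #include <cassert>
    #include <functional>

    static void invoke(const std::function<void(int)> &fn) { fn(42); }

    int main()
    {
        int calls = 0;
        invoke([&](int) { ++calls; }); // parameter intentionally unnamed: it is never read
        assert(calls == 1);
        return 0;
    }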
diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
index 6071153..fdafc2d 100644
--- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -40,11 +40,8 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "support/ToolchainSupport.h"
 
-using namespace arm_compute;
-using namespace arm_compute::detail;
-using namespace arm_compute::misc::shape_calculator;
-using namespace depthwise;
-
+namespace arm_compute
+{
 namespace
 {
 template <typename T1, typename T2, unsigned int stridex>
@@ -52,7 +49,7 @@
 {
 public:
     static void convolve(const Window &window, unsigned int num_elems_written_per_iteration,
-                         const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
+                         const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation)
     {
         const int input_offset   = -input->info()->quantization_info().offset;
         const int weights_offset = -weights->info()->quantization_info().offset;
@@ -60,12 +57,13 @@
         const int          input_stride_x  = input->info()->strides_in_bytes().x();
         const int          input_stride_y  = input->info()->strides_in_bytes().y();
         const int          input_stride_z  = input->info()->strides_in_bytes().z();
+        const int          input_stride_w  = input->info()->strides_in_bytes()[3];
         const int          output_stride_y = output->info()->strides_in_bytes().y();
         const int          kernel_stride_y = weights->info()->strides_in_bytes().y();
         const int          kernel_stride_z = weights->info()->strides_in_bytes().z();
         const int          output_w        = output->info()->dimension(0);
         const int          output_h        = output->info()->dimension(1);
-        const int          delta_input     = get_input_num_elems_processed<stridex>(num_elems_written_per_iteration);
+        const int          delta_input     = detail::get_input_num_elems_processed<stridex>(num_elems_written_per_iteration);
         const unsigned int conv_stride_y   = std::get<1>(conv_info.stride());
         const unsigned int conv_pad_x      = conv_info.pad_left();
         const unsigned int conv_pad_y      = conv_info.pad_top();
@@ -77,9 +75,10 @@
 
         // setup input window for the iterator
         Window window_in = window;
-        // we just want execute_window_loop to iterate over the dimensions > 2, so we set the first 2 dimensions to 0
+        // Input offsets are computed manually from the loop coordinates, so collapse the first three input dimensions
         window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
         window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+        window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
 
         Window window_k = calculate_max_window(*weights->info(), Steps(1u));
 
@@ -94,58 +93,67 @@
             int ih = 0;
             int oh = 0;
 
-            const uint8_t *input_ptr        = in.ptr() - conv_pad_x * input_stride_x - conv_pad_y * input_stride_y - (id.z() - id.z() / depth_multiplier) * input_stride_z;
+            const uint8_t *input_ptr        = in.ptr() - conv_pad_x * input_stride_x - conv_pad_y * input_stride_y + (id.z() / depth_multiplier) * input_stride_z + input_stride_w * id[3];
             const uint8_t *ptr_weights_base = weights_ptr + id.z() * kernel_stride_z;
 
             const auto ptr_weights_r0 = reinterpret_cast<const T1 *>(ptr_weights_base);
             const auto ptr_weights_r1 = reinterpret_cast<const T1 *>(ptr_weights_base + kernel_stride_y);
             const auto ptr_weights_r2 = reinterpret_cast<const T1 *>(ptr_weights_base + kernel_stride_y * 2);
-            const auto vw_r0          = load_matrix_row(ptr_weights_r0, weights_offset);
-            const auto vw_r1          = load_matrix_row(ptr_weights_r1, weights_offset);
-            const auto vw_r2          = load_matrix_row(ptr_weights_r2, weights_offset);
+            const auto vw_r0          = detail::load_matrix_row(ptr_weights_r0, weights_offset);
+            const auto vw_r1          = detail::load_matrix_row(ptr_weights_r1, weights_offset);
+            const auto vw_r2          = detail::load_matrix_row(ptr_weights_r2, weights_offset);
 
             for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
             {
                 auto in_top = reinterpret_cast<const T1 *>(input_ptr + (ih + 0) * input_stride_y);
-                auto in_mid = reinterpret_cast<const T1 *>(input_ptr + (ih + 1) * input_stride_y);
-                auto in_low = reinterpret_cast<const T1 *>(input_ptr + (ih + 2) * input_stride_y);
-                auto p_out  = reinterpret_cast<T2 *>(out.ptr() + oh * output_stride_y);
+                auto in_mid = reinterpret_cast<const T1 *>(input_ptr + (ih + dilation.y()) * input_stride_y);
+                auto in_low = reinterpret_cast<const T1 *>(input_ptr + (ih + 2 * dilation.y()) * input_stride_y); // T1: e.g. uint8_t for QASYMM8
+                auto p_out  = reinterpret_cast<T2 *>(out.ptr() + oh * output_stride_y);                           // T2: e.g. int32_t for QASYMM8
 
                 for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
                     in_top += delta_input, in_mid += delta_input, in_low += delta_input,
                     p_out += num_elems_written_per_iteration)
                 {
-                    auto vres = convolve_3x3<stridex>(in_top, in_mid, in_low, vw_r0, vw_r1, vw_r2, input_offset);
-                    store_results<stridex>(p_out, vres);
+                    if(dilation == Size2D(1U, 1U))
+                    {
+                        auto vres = detail::convolve_3x3<stridex>(in_top, in_mid, in_low, vw_r0, vw_r1, vw_r2, input_offset);
+                        detail::store_results<stridex>(p_out, vres);
+                    }
+                    else
+                    {
+                        auto vres = detail::convolve_3x3_dilation<stridex>(in_top, in_mid, in_low, vw_r0, vw_r1, vw_r2, dilation.x(), input_offset);
+                        detail::store_results<stridex>(p_out, vres);
+                    }
                 }
             }
         },
-        in, out);
+        out);
     }
 };
 
 template <typename T1, typename T2>
 inline void convolve_3x3(const Window &window, unsigned int num_elems_written_per_iteration,
-                         const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
+                         const ITensor *input, const ITensor *weights, ITensor *output,
+                         const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation)
 {
     const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
     switch(conv_stride_x)
     {
         case 1:
-            convolver_3x3<T1, T2, 1>::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info, depth_multiplier);
+            convolver_3x3<T1, T2, 1>::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info, depth_multiplier, dilation);
             break;
         case 2:
-            convolver_3x3<T1, T2, 2>::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info, depth_multiplier);
+            convolver_3x3<T1, T2, 2>::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info, depth_multiplier, dilation);
             break;
         case 3:
-            convolver_3x3<T1, T2, 3>::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info, depth_multiplier);
+            convolver_3x3<T1, T2, 3>::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info, depth_multiplier, dilation);
             break;
         default:
             ARM_COMPUTE_ERROR("Not implemented");
     }
 }
 
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, bool is_optimized)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
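
In the hunk above, vertical dilation only changes where the middle and bottom tap rows are read from: rows ih, ih + d and ih + 2d instead of three consecutive rows. A minimal sketch of that addressing, assuming byte strides:

    #include <cstdint>

    // Row base addresses for the three taps of a 3x3 filter at input row ih
    // (ih = oh * conv_stride_y), where d is dilation.y().
    inline void tap_rows(const uint8_t *input_ptr, int ih, int input_stride_y, int d,
                         const uint8_t *&top, const uint8_t *&mid, const uint8_t *&low)
    {
        top = input_ptr + (ih + 0) * input_stride_y;
        mid = input_ptr + (ih + d) * input_stride_y;
        low = input_ptr + (ih + 2 * d) * input_stride_y;
    }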
@@ -156,15 +164,11 @@
     const unsigned int height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
 
     ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != 3 || weights->dimension(height_idx) != 3);
-
-    if(!is_optimized)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().first < 1 || conv_info.stride().first > 3);
-    }
+    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().first < 1 || conv_info.stride().first > 3);
 
     if(output->total_size() != 0)
     {
-        const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+        const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
 
         if(is_data_type_quantized_asymmetric(input->data_type()))
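
compute_depthwise_convolution_shape with a dilation argument follows the usual convolution arithmetic: the kernel's effective extent grows to k + (k - 1)(d - 1) before the stride division. A small self-contained check of that formula:

    #include <cassert>

    inline int conv_out_dim(int in, int k, int stride, int pad_begin, int pad_end, int d)
    {
        const int k_eff = k + (k - 1) * (d - 1); // dilated kernel extent
        return (in + pad_begin + pad_end - k_eff) / stride + 1;
    }

    int main()
    {
        assert(conv_out_dim(10, 3, 1, 0, 0, 1) == 8); // plain 3x3: (10 - 3) + 1
        assert(conv_out_dim(10, 3, 1, 0, 0, 2) == 6); // dilated: k_eff = 5, (10 - 5) + 1
        return 0;
    }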
@@ -180,95 +184,63 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, bool is_optimized,
-                                                        IDepthwiseConvolution *convolver = nullptr)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
+                                                        const Size2D &dilation)
 {
     Window win;
     bool   window_changed = false;
 
-    if(is_optimized)
+    // Get convolved dimensions
+    const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
+    const DataType    output_dt    = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
+
+    // Output auto inizialitation if not yet initialized
+    auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_data_type(output_dt));
+
+    // Configure kernel window (generic)
+    const unsigned int conv_stride_x = conv_info.stride().first;
+    const unsigned int conv_stride_y = conv_info.stride().second;
+    const unsigned int conv_pad_top  = conv_info.pad_top();
+    const unsigned int conv_pad_left = conv_info.pad_left();
+
+    unsigned int num_elems_written_per_iteration = 16 >> conv_stride_x;
+    unsigned int num_elems_read_per_iteration    = 0;
+
+    switch(input->data_type())
     {
-        if(convolver != nullptr)
-        {
-            auto win_last = convolver->get_window();
-            win.set(Window::DimX, Window::Dimension(0, win_last, 1));
-
-            // Auto-configure output
-            bool        same_padding = conv_info.has_padding();
-            TensorShape output_shape{ input->tensor_shape() };
-
-            output_shape.set(1, convolver->output_size(output_shape.y(), same_padding)); // Set width
-            output_shape.set(2, convolver->output_size(output_shape.z(), same_padding)); // Set height
-
-            const DataType output_dt = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
-
-            // Output auto inizialitation if not yet initialized
-            auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_data_type(output_dt));
-
-            // Configure window (optimised)
-            // Set padding in channels
-            const int num_channels = weights->dimension(0);
-            if((num_channels >= 128) && (num_channels % 16 == 0))
-            {
-                input->extend_padding(PaddingSize(0, 4, 0, 0));
-                weights->extend_padding(PaddingSize(0, 4, 0, 0));
-                output->extend_padding(PaddingSize(0, 4, 0, 0));
-            }
-        }
-    }
-    else
-    {
-        // Get convolved dimensions
-        const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
-        const DataType    output_dt    = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
-
-        // Output auto inizialitation if not yet initialized
-        auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_data_type(output_dt));
-
-        // Configure kernel window (generic)
-        const unsigned int conv_stride_x = conv_info.stride().first;
-        const unsigned int conv_stride_y = conv_info.stride().second;
-        const unsigned int conv_pad_top  = conv_info.pad_top();
-        const unsigned int conv_pad_left = conv_info.pad_left();
-
-        unsigned int num_elems_written_per_iteration = 16 >> conv_stride_x;
-        unsigned int num_elems_read_per_iteration    = 0;
-
-        switch(input->data_type())
-        {
-            case DataType::QASYMM8:
-                num_elems_read_per_iteration = 16;
-                break;
+        case DataType::QASYMM8:
+            num_elems_read_per_iteration = 16 + 15 * (dilation.x() - 1);
+            break;
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-            case DataType::F16:
-                num_elems_read_per_iteration = 24;
-                break;
+        case DataType::F16:
+            num_elems_written_per_iteration = 32 >> conv_stride_x;
+            num_elems_read_per_iteration    = 24 + 23 * (dilation.x() - 1);
+            break;
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-            case DataType::F32:
-                num_elems_read_per_iteration = 12;
-                break;
-            default:
-                ARM_COMPUTE_ERROR("Data type not supported.");
-        }
-
-        // Configure kernel window
-        win = calculate_max_window(*output, Steps(num_elems_written_per_iteration));
-
-        AccessWindowRectangle  input_access(input, -conv_pad_left, -conv_pad_top, num_elems_read_per_iteration, 3, conv_stride_x, conv_stride_y);
-        AccessWindowStatic     weights_access(weights, 0, 0, 3, 3);
-        AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration);
-
-        window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
-        output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+        case DataType::F32:
+            num_elems_read_per_iteration = 12 + 11 * (dilation.x() - 1);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Data type not supported.");
     }
 
+    // Configure kernel window
+    win = calculate_max_window(*output, Steps(num_elems_written_per_iteration));
+
+    AccessWindowRectangle  input_access(input, -conv_pad_left, -conv_pad_top, num_elems_read_per_iteration, 3 + 2 * (dilation.y() - 1), conv_stride_x, conv_stride_y);
+    AccessWindowStatic     weights_access(weights, 0, 0, 3, 3);
+    AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration);
+
+    window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, win);
 }
 } // namespace
 
 NEDepthwiseConvolutionLayer3x3Kernel::NEDepthwiseConvolutionLayer3x3Kernel()
-    : _border_size(0), _input(), _output(), _weights(), _conv_info(), _convolver(nullptr), _num_elems_written_per_iteration(0), _run_optimized(false), _depth_multiplier(1)
+    : _border_size(0), _input(), _output(), _weights(), _conv_info(), _num_elems_written_per_iteration(0), _depth_multiplier(1), _dilation()
 {
 }
 
@@ -278,33 +250,40 @@
 }
 
 void NEDepthwiseConvolutionLayer3x3Kernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
-                                                     DataLayout data_layout)
+                                                     const Size2D &dilation)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), output->info(), conv_info, depth_multiplier, dilation));
 
     _input            = input;
     _output           = output;
     _weights          = weights;
     _conv_info        = conv_info;
     _depth_multiplier = depth_multiplier;
-    _convolver        = nullptr;
-
-    _run_optimized = NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(input->info()->tensor_shape(),
-                                                                                           conv_info,
-                                                                                           input->info()->data_type(), depth_multiplier,
-                                                                                           data_layout);
-
-    (_run_optimized) ? configure_optimized() : configure_generic();
+    switch(input->info()->data_type())
+    {
+        case DataType::QASYMM8:
+        case DataType::F32:
+            _num_elems_written_per_iteration = 16 >> _conv_info.stride().first;
+            break;
+        case DataType::F16:
+            _num_elems_written_per_iteration = 32 >> _conv_info.stride().first;
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Data type not supported.");
+    }
+    _border_size    = BorderSize(_conv_info.pad_top(), _conv_info.pad_right(), _conv_info.pad_bottom(), _conv_info.pad_left());
+    _dilation       = dilation;
+    auto win_config = validate_and_configure_window(_input->info(), _weights->info(), _output->info(), _conv_info, _depth_multiplier, dilation);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
 }
 
-Status NEDepthwiseConvolutionLayer3x3Kernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
+Status NEDepthwiseConvolutionLayer3x3Kernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
+                                                      const Size2D &dilation)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-
-    bool is_optimized = NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(input->tensor_shape(), conv_info, input->data_type(), depth_multiplier, input->data_layout());
-
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, output, conv_info, depth_multiplier, is_optimized));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), output->clone().get(), conv_info, depth_multiplier, is_optimized).first);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, output, conv_info, depth_multiplier, dilation));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), output->clone().get(), conv_info, depth_multiplier, dilation).first);
     return Status{};
 }
 
@@ -313,213 +292,23 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_UNUSED(info);
 
-    (_run_optimized) ? run_optimized(window, info) : run_generic(window, info);
-}
-
-bool NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(TensorShape input_shape, PadStrideInfo conv_info, DataType dt, unsigned int depth_multiplier, DataLayout data_layout)
-{
-    // Reshape input shape if in NHWC format
-    TensorShape in_shape{ input_shape };
-    if(data_layout == DataLayout::NHWC)
-    {
-        in_shape.set(Window::DimX, input_shape.y());
-        in_shape.set(Window::DimY, input_shape.z());
-        in_shape.set(Window::DimZ, input_shape.x());
-    }
-
-    // Check supported data type
-    bool supported_datatype = is_data_type_float(dt) || is_data_type_quantized(dt);
-
-    // Check for supported strides
-    const auto &strides           = conv_info.stride();
-    bool        supported_strides = (strides.first == strides.second) && ((strides.first == 1) || (strides.first == 2));
-
-    // Check for supported padding
-    const auto    pad_top           = conv_info.pad_top();
-    const auto    pad_right         = conv_info.pad_right();
-    const auto    pad_bottom        = conv_info.pad_bottom();
-    const auto    pad_left          = conv_info.pad_left();
-    PadStrideInfo same_pad          = calculate_same_pad(in_shape, TensorShape(3U, 3U), conv_info);
-    bool          is_same_padding   = (pad_top == same_pad.pad_top()) && (pad_right == same_pad.pad_right()) && (pad_bottom == same_pad.pad_bottom()) && (pad_left == same_pad.pad_left());
-    bool          is_valid_padding  = (pad_top == 0) && (pad_right == 0) && (pad_bottom == 0) && (pad_left == 0);
-    bool          supported_padding = is_same_padding || is_valid_padding;
-
-    return supported_datatype && supported_strides && supported_padding && (depth_multiplier == 1);
-}
-
-void NEDepthwiseConvolutionLayer3x3Kernel::generate_convolver()
-{
-    ARM_COMPUTE_ERROR_ON_CPU_F16_UNSUPPORTED(_input);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(_input, _weights);
-    ARM_COMPUTE_ERROR_ON(_weights->info()->dimension(1) != 3 || _weights->info()->dimension(2) != 3);
-
-    _convolver = create_convolver_object(_conv_info, _weights, _input, _output, true);
-    if(_convolver)
-    {
-        _convolver->set_offsets(-_input->info()->quantization_info().offset, -_weights->info()->quantization_info().offset);
-    }
-}
-
-void NEDepthwiseConvolutionLayer3x3Kernel::configure_generic()
-{
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(_input->info(), _weights->info(), _output->info(), _conv_info, _depth_multiplier, _run_optimized));
-
-    _num_elems_written_per_iteration = 16 >> _conv_info.stride().first;
-    _border_size                     = BorderSize(_conv_info.pad_top(), _conv_info.pad_right(), _conv_info.pad_bottom(), _conv_info.pad_left());
-
-    auto win_config = validate_and_configure_window(_input->info(), _weights->info(), _output->info(), _conv_info, _depth_multiplier, false);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    INEKernel::configure(win_config.second);
-}
-
-void NEDepthwiseConvolutionLayer3x3Kernel::configure_optimized()
-{
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(_input->info(), _weights->info(), _output->info(), _conv_info, _depth_multiplier, _run_optimized));
-
-    _border_size = BorderSize(0, 0);
-    _convolver   = create_convolver_object(_conv_info, _weights, _input, _output);
-
-    auto win_config = validate_and_configure_window(_input->info(), _weights->info(), _output->info(), _conv_info, _depth_multiplier, true, _convolver.get());
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    INEKernel::configure(win_config.second);
-}
-
-void NEDepthwiseConvolutionLayer3x3Kernel::run_generic(const Window &window, const ThreadInfo &info)
-{
     ARM_COMPUTE_UNUSED(info);
 
     switch(_input->info()->data_type())
     {
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F16:
-            convolve_3x3<float16_t, float16_t>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier);
+            convolve_3x3<float16_t, float16_t>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier, _dilation);
             break;
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F32:
-            convolve_3x3<float, float>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier);
+            convolve_3x3<float, float>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier, _dilation);
             break;
         case DataType::QASYMM8:
-            convolve_3x3<uint8_t, int32_t>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier);
+            convolve_3x3<uint8_t, int32_t>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier, _dilation);
             break;
         default:
             ARM_COMPUTE_ERROR("Not implemented");
     }
 }
-
-void NEDepthwiseConvolutionLayer3x3Kernel::run_optimized(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON(!_convolver);
-
-    const size_t start = window.x().start();
-    const size_t end   = window.x().end();
-    _convolver->run(start, end);
-}
-
-std::unique_ptr<depthwise::IDepthwiseConvolution> NEDepthwiseConvolutionLayer3x3Kernel::create_convolver_object(PadStrideInfo  conv_info,
-                                                                                                                const ITensor *w,
-                                                                                                                const ITensor *in,
-                                                                                                                ITensor       *out,
-                                                                                                                bool           setup_strides)
-{
-    const DataType    dt                  = in->info()->data_type();
-    const TensorShape shape               = in->info()->tensor_shape();
-    const int         in_rows             = shape.z();
-    const int         in_cols             = shape.y();
-    const int         n_batches           = shape[3];
-    const int         n_channels          = shape.x();
-    const bool        padding_same        = conv_info.has_padding();
-    const int         weight_col_stride   = (setup_strides) ? w->info()->strides_in_bytes().y() / w->info()->element_size() : 0;
-    const int         weight_row_stride   = (setup_strides) ? w->info()->strides_in_bytes().z() / w->info()->element_size() : 0;
-    const int         input_col_stride    = (setup_strides) ? in->info()->strides_in_bytes().y() / in->info()->element_size() : 0;
-    const int         input_row_stride    = (setup_strides) ? in->info()->strides_in_bytes().z() / in->info()->element_size() : 0;
-    const int         input_batch_stride  = (setup_strides) ? in->info()->strides_in_bytes()[3] / in->info()->element_size() : 0;
-    const int         output_col_stride   = (setup_strides) ? out->info()->strides_in_bytes().y() / out->info()->element_size() : 0;
-    const int         output_row_stride   = (setup_strides) ? out->info()->strides_in_bytes().z() / out->info()->element_size() : 0;
-    const int         output_batch_stride = (setup_strides) ? out->info()->strides_in_bytes()[3] / out->info()->element_size() : 0;
-
-    const auto stride_x = conv_info.stride().first;
-    switch(dt)
-    {
-        case DataType::QASYMM8:
-        {
-            switch(stride_x)
-            {
-                case 1:
-                    return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<4, 4, 3, 3, 1, 1, uint8_t, int32_t>>(
-                               n_batches, in_rows, in_cols, n_channels, padding_same,
-                               reinterpret_cast<const uint8_t *>(w->ptr_to_element(Coordinates())),
-                               in->ptr_to_element(Coordinates()),
-                               reinterpret_cast<int32_t *>(out->ptr_to_element(Coordinates())), weight_col_stride,
-                               weight_row_stride, input_col_stride, input_row_stride, input_batch_stride,
-                               output_col_stride, output_row_stride, output_batch_stride);
-                case 2:
-                    return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<4, 4, 3, 3, 2, 2, uint8_t, int32_t>>(
-                               n_batches, in_rows, in_cols, n_channels, padding_same,
-                               reinterpret_cast<const uint8_t *>(w->ptr_to_element(Coordinates())),
-                               in->ptr_to_element(Coordinates()),
-                               reinterpret_cast<int32_t *>(out->ptr_to_element(Coordinates())), weight_col_stride,
-                               weight_row_stride, input_col_stride, input_row_stride, input_batch_stride,
-                               output_col_stride, output_row_stride, output_batch_stride);
-                default:
-                    return nullptr;
-            }
-            break;
-        }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        case DataType::F16:
-        {
-            switch(stride_x)
-            {
-                case 1:
-                    return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<4, 4, 3, 3, 1, 1, float16_t, float16_t>>(
-                               n_batches, in_rows, in_cols, n_channels, padding_same,
-                               reinterpret_cast<const float16_t *>(w->ptr_to_element(Coordinates())),
-                               reinterpret_cast<float16_t *>(in->ptr_to_element(Coordinates())),
-                               reinterpret_cast<float16_t *>(out->ptr_to_element(Coordinates())), weight_col_stride,
-                               weight_row_stride, input_col_stride, input_row_stride, input_batch_stride,
-                               output_col_stride, output_row_stride, output_batch_stride);
-                case 2:
-                    return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<4, 4, 3, 3, 2, 2, float16_t, float16_t>>(
-                               n_batches, in_rows, in_cols, n_channels, padding_same,
-                               reinterpret_cast<const float16_t *>(w->ptr_to_element(Coordinates())),
-                               reinterpret_cast<float16_t *>(in->ptr_to_element(Coordinates())),
-                               reinterpret_cast<float16_t *>(out->ptr_to_element(Coordinates())), weight_col_stride,
-                               weight_row_stride, input_col_stride, input_row_stride, input_batch_stride,
-                               output_col_stride, output_row_stride, output_batch_stride);
-                default:
-                    return nullptr;
-            }
-            break;
-        }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        case DataType::F32:
-        {
-            switch(stride_x)
-            {
-                case 1:
-                    return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float>>(
-                               n_batches, in_rows, in_cols, n_channels, padding_same,
-                               reinterpret_cast<const float *>(w->ptr_to_element(Coordinates())),
-                               reinterpret_cast<float *>(in->ptr_to_element(Coordinates())),
-                               reinterpret_cast<float *>(out->ptr_to_element(Coordinates())), weight_col_stride,
-                               weight_row_stride, input_col_stride, input_row_stride, input_batch_stride,
-                               output_col_stride, output_row_stride, output_batch_stride);
-                case 2:
-                    return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float>>(
-                               n_batches, in_rows, in_cols, n_channels, padding_same,
-                               reinterpret_cast<const float *>(w->ptr_to_element(Coordinates())),
-                               reinterpret_cast<float *>(in->ptr_to_element(Coordinates())),
-                               reinterpret_cast<float *>(out->ptr_to_element(Coordinates())), weight_col_stride,
-                               weight_row_stride, input_col_stride, input_row_stride, input_batch_stride,
-                               output_col_stride, output_row_stride, output_batch_stride);
-                default:
-                    return nullptr;
-            }
-            break;
-        }
-        default:
-            return nullptr;
-    }
-}
\ No newline at end of file
+} // namespace arm_compute
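
For reference, a minimal configuration sketch under the new interface; the shapes and the F32 data type are hypothetical, and the output is left empty so configure() can auto-initialise it from the computed depthwise shape:

    #include "arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        // 16x16 input with 8 channels, one 3x3 filter per channel (hypothetical shapes)
        Tensor input, weights, output;
        input.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));
        weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 8U), 1, DataType::F32));

        NEDepthwiseConvolutionLayer3x3Kernel kernel;
        kernel.configure(&input, &weights, &output, PadStrideInfo(1, 1, 1, 1),
                         1 /* depth_multiplier */, Size2D(2U, 2U) /* dilation */);

        input.allocator()->allocate();
        weights.allocator()->allocate();
        output.allocator()->allocate();
        return 0;
    }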
diff --git a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
index 62373e3..88f8b31 100644
--- a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
@@ -38,7 +38,8 @@
 
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier, const Size2D &dilation)
 {
     ARM_COMPUTE_UNUSED(conv_info);
     //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
@@ -48,6 +49,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(2) * depth_multiplier) != output->dimension(2));
     ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != (kernel_dims.width * kernel_dims.height + ((has_bias) ? 1 : 0)));
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
 
     return Status{};
 }
@@ -84,7 +86,7 @@
     Iterator out(_output, window_out);
 
     const int full_length   = input_w + pad_left + pad_right;
-    const int max_initial_x = stride_x * (((full_length - _kernel_dims.width) / stride_x) + 1);
+    const int max_initial_x = stride_x * (((full_length - (_kernel_dims.width + (_kernel_dims.width - 1) * (_dilation.x() - 1))) / stride_x) + 1);
 
     // Define pad value
     auto zero = static_cast<T>(0);
@@ -103,12 +105,12 @@
         // Get pointers
         const uint8_t *const input_ptr  = in.ptr() + id.z() / _depth_multiplier * input_stride_z;
         auto                 output_ptr = reinterpret_cast<T *>(out.ptr());
-        const int            height     = src_y + _kernel_dims.height;
-        const int            width      = src_x + _kernel_dims.width;
+        const int            height     = src_y + (_kernel_dims.height + (_kernel_dims.height - 1) * (_dilation.y() - 1));
+        const int            width      = src_x + (_kernel_dims.width + (_kernel_dims.width - 1) * (_dilation.x() - 1));
 
-        for(int y = src_y; y < height; ++y)
+        for(int y = src_y; y < height; y += _dilation.y())
         {
-            for(int x = src_x; x < width; ++x, ++output_ptr)
+            for(int x = src_x; x < width; x += _dilation.x(), ++output_ptr)
             {
                 if(x < 0 || x >= input_w || y < 0 || y >= input_h)
                 {
@@ -130,15 +132,16 @@
 }
 
 NEDepthwiseIm2ColKernel::NEDepthwiseIm2ColKernel()
-    : _func(nullptr), _input(nullptr), _output(nullptr), _kernel_dims(), _conv_info(), _has_bias(), _depth_multiplier(1)
+    : _func(nullptr), _input(nullptr), _output(nullptr), _kernel_dims(), _conv_info(), _has_bias(), _depth_multiplier(1), _dilation()
 {
 }
 
-void NEDepthwiseIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier)
+void NEDepthwiseIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier,
+                                        const Size2D &dilation)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, depth_multiplier));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, depth_multiplier, dilation));
 
     _input            = input;
     _output           = output;
@@ -146,6 +149,7 @@
     _conv_info        = conv_info;
     _has_bias         = has_bias;
     _depth_multiplier = depth_multiplier;
+    _dilation         = dilation;
 
     // Configure kernel window
     Window win = calculate_max_window(*input->info(), Steps());
@@ -172,10 +176,11 @@
     INEKernel::configure(win);
 }
 
-Status NEDepthwiseIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier)
+Status NEDepthwiseIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier,
+                                         const Size2D &dilation)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, depth_multiplier));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, depth_multiplier, dilation));
     return Status{};
 }
 
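The im2col loop above walks the kernel window with the dilation as its step, widening the patch to the effective kernel extent and zero-padding out-of-range taps. A minimal scalar sketch of that gather (the QASYMM8 path would substitute the quantized pad value for the zero):

    #include <vector>

    std::vector<float> gather_patch(const float *in, int in_w, int in_h,
                                    int src_x, int src_y, int kw, int kh, int dx, int dy)
    {
        std::vector<float> patch;
        const int width  = src_x + (kw + (kw - 1) * (dx - 1)); // effective kernel extent
        const int height = src_y + (kh + (kh - 1) * (dy - 1));
        for(int y = src_y; y < height; y += dy)
        {
            for(int x = src_x; x < width; x += dx)
            {
                const bool out_of_range = (x < 0 || x >= in_w || y < 0 || y >= in_h);
                patch.push_back(out_of_range ? 0.0f : in[y * in_w + x]);
            }
        }
        return patch;
    }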
diff --git a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
index 47c895c..1520225 100644
--- a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,83 +24,143 @@
 #include "arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h"
 
 #include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 
 #include <arm_neon.h>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, min_max);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() < 3);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8);
 
     if(output->tensor_shape().total_size() > 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
     }
 
     return Status{};
 }
 
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *min_max)
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
 {
+    // Configure kernel window
+    Window win = calculate_max_window(*input, Steps());
+
     // Output tensor auto initialization if not yet initialized
     auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32);
 
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
+    // NEDequantizationLayerKernel doesn't need padding, so update_window_and_padding() can be skipped
+    Coordinates coord;
+    coord.set_num_dimensions(output->num_dimensions());
+    output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
 
-    // Configure window
-    Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-    AccessWindowStatic     min_max_access(min_max, 0, 0, 2, min_max->dimension(1));
+    return std::make_tuple(Status{}, win);
+}
 
-    // Update window and padding
-    bool window_changed = update_window_and_padding(win, input_access, output_access, min_max_access);
+template <typename T>
+inline void store_result(T *ptr, const float32x4x4_t &v)
+{
+    ARM_COMPUTE_UNUSED(ptr, v);
+}
 
-    output_access.set_valid_region(win, input->valid_region());
+template <>
+inline void store_result<float>(float *ptr, const float32x4x4_t &v)
+{
+    wrapper::vstore(ptr, v.val[0]);
+    wrapper::vstore(ptr + 4, v.val[1]);
+    wrapper::vstore(ptr + 8, v.val[2]);
+    wrapper::vstore(ptr + 12, v.val[3]);
+}
 
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_tuple(err, win);
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <>
+inline void store_result<float16_t>(float16_t *ptr, const float32x4x4_t &v)
+{
+    wrapper::vstore(ptr, vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1])));
+    wrapper::vstore(ptr + 8, vcombine_f16(vcvt_f16_f32(v.val[2]), vcvt_f16_f32(v.val[3])));
+}
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+template <typename T>
+void run_dequantization(const ITensor *input, ITensor *output, const Window &window)
+{
+    const QuantizationInfo &qinfo = input->info()->quantization_info();
+
+    const int  window_step_x  = 16;
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    // Collapse window and reset first dimension to handle tail calculations manually
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    // Create iterators
+    Iterator in(input, win_collapsed);
+    Iterator out(output, win_collapsed);
+
+    execute_window_loop(win_collapsed, [&](const Coordinates &)
+    {
+        const auto in_ptr  = reinterpret_cast<const uint8_t *>(in.ptr());
+        const auto out_ptr = reinterpret_cast<T *>(out.ptr());
+
+        int x = window_start_x;
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        {
+            const auto vin  = wrapper::vloadq(in_ptr + x);
+            const auto vdeq = vdequantize(vin, qinfo);
+
+            store_result<T>(out_ptr + x, vdeq);
+        }
+
+        // Compute left-over elements
+        for(; x < window_end_x; ++x)
+        {
+            uint8_t val    = *(in_ptr + x);
+            *(out_ptr + x) = static_cast<T>(qinfo.dequantize(val));
+        }
+    },
+    in, out);
 }
 } // namespace
 
 NEDequantizationLayerKernel::NEDequantizationLayerKernel()
-    : _input(nullptr), _output(nullptr), _min_max(nullptr)
+    : _input(nullptr), _output(nullptr)
 {
 }
 
-void NEDequantizationLayerKernel::configure(const ITensor *input, ITensor *output, const ITensor *min_max)
+void NEDequantizationLayerKernel::configure(const ITensor *input, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, min_max);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), min_max->info()));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
 
-    _input   = input;
-    _output  = output;
-    _min_max = min_max;
+    _input  = input;
+    _output = output;
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), output->info(), min_max->info());
+    auto win_config = validate_and_configure_window(input->info(), output->info());
 
     ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
 
     INEKernel::configure(std::get<1>(win_config));
 }
 
-Status NEDequantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+Status NEDequantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, min_max));
-    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), min_max->clone().get())));
-
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
     return Status{};
 }
 
@@ -110,53 +170,18 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    Window window_input_output(window);
-    window_input_output.set(3, Window::Dimension(0, 1, 1));
-
-    Window window_min_max;
-    window_min_max.use_tensor_dimensions(_min_max->info()->tensor_shape());
-    window_min_max.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator input(_input, window_input_output);
-    Iterator output(_output, window_input_output);
-    Iterator min_max(_min_max, window_min_max);
-
-    execute_window_loop(window_min_max, [&](const Coordinates & id_batch)
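+    // Dispatch on the output type; both paths share the vectorised loop in run_dequantization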
+    switch(_output->info()->data_type())
     {
-        // Get the min and max
-        const float min = *(reinterpret_cast<const float *>(min_max.ptr()) + 0);
-        const float max = *(reinterpret_cast<const float *>(min_max.ptr()) + 1);
-
-        const float32x4_t vmin    = vdupq_n_f32(min);
-        const float       range   = max - min;
-        const float32x4_t scaling = vdupq_n_f32(range / 255.0f);
-
-        // Uniformly map values to range 8bit integers, i.e. [min, max] -> [0, 255]
-        execute_window_loop(window_input_output, [&](const Coordinates & id)
-        {
-            // Get the input values
-            const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr() + id_batch[1] * _input->info()->strides_in_bytes()[3]);
-
-            const uint8x8_t  val_u8       = vld1_u8(input_ptr);
-            const uint16x8_t val_u16      = vmovl_u8(val_u8);
-            const uint32x4_t val_u32_low  = vmovl_u16(vget_low_u16(val_u16));
-            const uint32x4_t val_u32_high = vmovl_u16(vget_high_u16(val_u16));
-            float32x4_t      val_low      = vcvtq_f32_u32(val_u32_low);
-            float32x4_t      val_high     = vcvtq_f32_u32(val_u32_high);
-
-            // Dequantize -> (q / 255.0 * range) + min
-            val_low  = vmulq_f32(val_low, scaling);
-            val_high = vmulq_f32(val_high, scaling);
-            val_low  = vaddq_f32(val_low, vmin);
-            val_high = vaddq_f32(val_high, vmin);
-
-            const float32x4x2_t dequantized = vuzpq_f32(val_low, val_high);
-
-            // Store the dequantized values
-            auto output_ptr = reinterpret_cast<float *>(output.ptr() + id_batch[1] * _output->info()->strides_in_bytes()[3]);
-            vst2q_f32(output_ptr, dequantized);
-        },
-        input, output);
-    },
-    min_max);
-}
\ No newline at end of file
+        case DataType::F32:
+            run_dequantization<float>(_input, _output, window);
+            break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+            run_dequantization<float16_t>(_input, _output, window);
+            break;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+        default:
+            ARM_COMPUTE_ERROR("Unsupported data type.");
+    }
+}
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/NEON/kernels/NEDerivativeKernel.cpp b/src/core/NEON/kernels/NEDerivativeKernel.cpp
index cfed324..1d7237a 100644
--- a/src/core/NEON/kernels/NEDerivativeKernel.cpp
+++ b/src/core/NEON/kernels/NEDerivativeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -124,7 +124,7 @@
     Iterator out_x(_output_x, window);
 
     /* Apply 1-D centered point discrete derivative mask ([-1 0 1]) along the X direction */
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         /* Load left and right data */
         const uint8x16_t l_data = vld1q_u8(in.ptr() - 1);
@@ -153,7 +153,7 @@
     const size_t stride = _input->info()->strides_in_bytes()[1];
 
     /* Apply 1-D centered point discrete derivative mask ([-1 0 1]^T) along the Y direction */
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         /* Load top and bottom data */
         const uint8x16_t t_data = vld1q_u8(in.ptr() - stride);
@@ -183,7 +183,7 @@
     const size_t stride = _input->info()->strides_in_bytes()[1];
 
     /* Apply 1-D centered point discrete derivative masks ([-1 0 1] and [-1 0 1]^T) along the X and Y directions */
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         /* Load top, bottom, left and right data */
         const uint8x16_t t_data = vld1q_u8(in.ptr() - stride);
diff --git a/src/core/NEON/kernels/NEDilateKernel.cpp b/src/core/NEON/kernels/NEDilateKernel.cpp
index 3ee00a4..e761815 100644
--- a/src/core/NEON/kernels/NEDilateKernel.cpp
+++ b/src/core/NEON/kernels/NEDilateKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -94,7 +94,8 @@
         uint8x8_t bot_high_data = vget_high_u8(bot_data);
         uint8x8_t bot_low_data  = vget_low_u8(bot_data);
 
-        uint8x8_t p0, p1;
+        uint8x8_t p0;
+        uint8x8_t p1;
 
         p0 = top_low_data;
         p1 = vext_u8(top_low_data, top_high_data, 1);
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
index 162c4b1..d557cfa 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -192,12 +192,12 @@
 
         execute_window_loop(window_out, [&](const Coordinates & id)
         {
-            const uint8_t *input_ptr                       = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y;
-            uint8_t       *out_ptr                         = out.ptr();
-            int            ih                              = 0;
-            int            oh                              = 0;
-            float32x4_t    accum0[small_tensor_size_optim] = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) };
-            float32x4_t    accum1[small_tensor_size_optim] = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) };
+            const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y;
+            uint8_t       *out_ptr   = out.ptr();
+            int            ih        = 0;
+            int            oh        = 0;
+            std::array<float32x4_t, 8> accum0 = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) };
+            std::array<float32x4_t, 8> accum1 = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) };
             for(int oz = 0; oz < range_z; ++oz)
             {
                 accum0[0] = accum0[1] = accum0[2] = accum0[3] = accum0[4] = accum0[5] = accum0[6] = accum0[7] = vdupq_n_f32(0.f);
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
index 09836f1..7e11393 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -266,7 +266,7 @@
 
     if(in_place) // In place accumulate
     {
-        execute_window_loop(window, [&](const Coordinates & id)
+        execute_window_loop(window, [&](const Coordinates &)
         {
             // Get bias and pointer to input
             const auto in_ptr   = reinterpret_cast<T1 *>(in.ptr());
@@ -287,7 +287,7 @@
     else // Out of place accumulate
     {
         Iterator out(output, window);
-        execute_window_loop(window, [&](const Coordinates & id)
+        execute_window_loop(window, [&](const Coordinates &)
         {
             // Get bias and pointer to input
             const auto in_ptr   = reinterpret_cast<T1 *>(in.ptr());
@@ -363,7 +363,7 @@
 
     Iterator in(input, window);
     Iterator out(output, window);
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         // Get bias and pointer to input
         const auto  in_ptr = reinterpret_cast<int32_t *>(in.ptr());
@@ -399,7 +399,7 @@
     Iterator bi(bias, window_bias);
 
     Iterator out(output, window);
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         // Get bias and pointer to input
         const auto in_ptr   = reinterpret_cast<int32_t *>(in.ptr());
@@ -433,7 +433,7 @@
 
     Iterator in(input, window);
     Iterator out(output, window);
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         // Get pointer to input
         const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr());
diff --git a/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp b/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp
index aa458c2..6b87ea0 100644
--- a/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp
@@ -506,7 +506,7 @@
         Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
         Iterator output(out, win);
 
-        execute_window_loop(win, [&](const Coordinates & id)
+        execute_window_loop(win, [&](const Coordinates &)
         {
             auto                  output_ptr              = reinterpret_cast<OutputScalarType *>(output.ptr());
             const auto            non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
@@ -531,7 +531,7 @@
         Iterator input2(in2, input2_win);
         Iterator output(out, win);
 
-        execute_window_loop(win, [&](const Coordinates & id)
+        execute_window_loop(win, [&](const Coordinates &)
         {
             auto       output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
             const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
@@ -599,7 +599,7 @@
         Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
         Iterator output(out, win);
 
-        execute_window_loop(win, [&](const Coordinates & id)
+        execute_window_loop(win, [&](const Coordinates &)
         {
             const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
             const auto output_ptr              = reinterpret_cast<uint8_t *>(output.ptr());
@@ -640,7 +640,7 @@
         Iterator input2(in2, input2_win);
         Iterator output(out, win);
 
-        execute_window_loop(win, [&](const Coordinates & id)
+        execute_window_loop(win, [&](const Coordinates &)
         {
             const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
             const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
diff --git a/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp b/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp
index 7ecc4d1..34696d8 100644
--- a/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp
+++ b/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp
@@ -87,7 +87,7 @@
     Iterator input(in, win);
     Iterator output(out, win);
 
-    execute_window_loop(win, [&](const Coordinates & id)
+    execute_window_loop(win, [&](const Coordinates &)
     {
         auto       output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
         const auto input_ptr  = reinterpret_cast<const ScalarType *>(input.ptr());
diff --git a/src/core/NEON/kernels/NEErodeKernel.cpp b/src/core/NEON/kernels/NEErodeKernel.cpp
index 88c20f8..2a538ec 100644
--- a/src/core/NEON/kernels/NEErodeKernel.cpp
+++ b/src/core/NEON/kernels/NEErodeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -94,7 +94,8 @@
         uint8x8_t bot_high_data = vget_high_u8(bot_data);
         uint8x8_t bot_low_data  = vget_low_u8(bot_data);
 
-        uint8x8_t p0, p1;
+        uint8x8_t p0;
+        uint8x8_t p1;
 
         p0 = top_low_data;
         p1 = vext_u8(top_low_data, top_high_data, 1);
diff --git a/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp b/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp
new file mode 100644
index 0000000..cf77345
--- /dev/null
+++ b/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(idx, 1, DataType::U32);
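+    // Only axes 0 and 1 are supported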
+    ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[config.axis] != idx->tensor_shape().x());
+
+    // Checks performed when output is configured
+    if((output != nullptr) && (output->total_size() != 0))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 2);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+{
+    ARM_COMPUTE_UNUSED(idx, config);
+
+    auto_init_if_empty(*output, input->clone()->set_num_channels(2));
+
+    Window win = calculate_max_window(*input, Steps());
+    input->set_valid_region(ValidRegion(Coordinates(), input->tensor_shape()));
+
+    return std::make_pair(Status{}, win);
+}
+} // namespace
+
+NEFFTDigitReverseKernel::NEFFTDigitReverseKernel()
+    : _func(nullptr), _input(nullptr), _output(nullptr), _idx(nullptr)
+{
+}
+
+void NEFFTDigitReverseKernel::configure(const ITensor *input, ITensor *output, const ITensor *idx, const FFTDigitReverseKernelInfo &config)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, idx);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), idx->info(), config));
+
+    _input  = input;
+    _output = output;
+    _idx    = idx;
+
+    const size_t axis             = config.axis;
+    const bool   is_conj          = config.conjugate;
+    const bool   is_input_complex = (input->info()->num_channels() == 2);
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), output->info(), idx->info(), config);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+
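+    // Bind the kernel specialisation at configure time so that run() is a single indirect call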
+    if(axis == 0)
+    {
+        if(is_input_complex)
+        {
+            if(is_conj)
+            {
+                _func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_0<true, true>;
+            }
+            else
+            {
+                _func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_0<true, false>;
+            }
+        }
+        else
+        {
+            _func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_0<false, false>;
+        }
+    }
+    else if(axis == 1)
+    {
+        if(is_input_complex)
+        {
+            if(is_conj)
+            {
+                _func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_1<true, true>;
+            }
+            else
+            {
+                _func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_1<true, false>;
+            }
+        }
+        else
+        {
+            _func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_1<false, false>;
+        }
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Not supported");
+    }
+}
+
+Status NEFFTDigitReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, idx, config));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first);
+    return Status{};
+}
+
+template <bool is_input_complex, bool is_conj>
+void NEFFTDigitReverseKernel::digit_reverse_kernel_axis_0(const Window &window)
+{
+    const size_t N = _input->info()->dimension(0);
+
+    // Copy the look-up buffer to a local array
+    std::vector<unsigned int> buffer_idx(N);
+    std::copy_n(reinterpret_cast<unsigned int *>(_idx->buffer()), N, buffer_idx.data());
+
+    // Input/output iterators
+    Window slice = window;
+    slice.set(Window::DimX, Window::Dimension(0, 1, 1));
+    Iterator in(_input, slice);
+    Iterator out(_output, slice);
+
+    // Row buffers
+    std::vector<float> buffer_row_out(2 * N);
+    std::vector<float> buffer_row_in(2 * N);
+
+    execute_window_loop(slice, [&](const Coordinates &)
+    {
+        if(is_input_complex)
+        {
+            // Load
+            memcpy(buffer_row_in.data(), reinterpret_cast<float *>(in.ptr()), 2 * N * sizeof(float));
+
+            // Shuffle
+            for(size_t x = 0; x < 2 * N; x += 2)
+            {
+                size_t idx            = buffer_idx[x / 2];
+                buffer_row_out[x]     = buffer_row_in[2 * idx];
+                buffer_row_out[x + 1] = (is_conj ? -buffer_row_in[2 * idx + 1] : buffer_row_in[2 * idx + 1]);
+            }
+        }
+        else
+        {
+            // Load
+            memcpy(buffer_row_in.data(), reinterpret_cast<float *>(in.ptr()), N * sizeof(float));
+
+            // Shuffle
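+            // Real input: only the even (real) slots are written; the odd (imaginary)
+            // slots keep their zero initial value from the vector's value-initialisation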
+            for(size_t x = 0; x < N; ++x)
+            {
+                size_t idx            = buffer_idx[x];
+                buffer_row_out[2 * x] = buffer_row_in[idx];
+            }
+        }
+
+        // Copy back
+        memcpy(reinterpret_cast<float *>(out.ptr()), buffer_row_out.data(), 2 * N * sizeof(float));
+    },
+    in, out);
+}
+
+template <bool is_input_complex, bool is_conj>
+void NEFFTDigitReverseKernel::digit_reverse_kernel_axis_1(const Window &window)
+{
+    const size_t Nx = _input->info()->dimension(0);
+    const size_t Ny = _input->info()->dimension(1);
+
+    // Copy the look-up buffer to a local array
+    std::vector<unsigned int> buffer_idx(Ny);
+    std::copy_n(reinterpret_cast<unsigned int *>(_idx->buffer()), Ny, buffer_idx.data());
+
+    // Output iterator
+    Window slice = window;
+    slice.set(Window::DimX, Window::Dimension(0, 1, 1));
+    Iterator out(_output, slice);
+
+    // Row buffer
+    std::vector<float> buffer_row(Nx);
+
+    // Strides
+    const size_t stride_z = _input->info()->strides_in_bytes()[2];
+    const size_t stride_w = _input->info()->strides_in_bytes()[3];
+
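+    // Each output row id.y() is copied from the digit-reversed input row buffer_idx[id.y()] of the same plane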
+    execute_window_loop(slice, [&](const Coordinates & id)
+    {
+        auto        *out_ptr    = reinterpret_cast<float *>(out.ptr());
+        auto        *in_ptr     = reinterpret_cast<float *>(_input->buffer() + id.z() * stride_z + id[3] * stride_w);
+        const size_t y_shuffled = buffer_idx[id.y()];
+
+        if(is_input_complex)
+        {
+            // Shuffle the entire row into the output
+            memcpy(out_ptr, in_ptr + 2 * Nx * y_shuffled, 2 * Nx * sizeof(float));
+
+            // Conjugate if necessary
+            if(is_conj)
+            {
+                for(size_t x = 0; x < 2 * Nx; x += 2)
+                {
+                    out_ptr[x + 1] = -out_ptr[x + 1];
+                }
+            }
+        }
+        else
+        {
+            // Shuffle the entire row into the buffer
+            memcpy(buffer_row.data(), in_ptr + Nx * y_shuffled, Nx * sizeof(float));
+
+            // Copy the buffer to the output, with a zero imaginary part
+            for(size_t x = 0; x < 2 * Nx; x += 2)
+            {
+                out_ptr[x] = buffer_row[x / 2];
+            }
+        }
+    },
+    out);
+}
+
+void NEFFTDigitReverseKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    ARM_COMPUTE_UNUSED(info);
+    (this->*_func)(window);
+}
+
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp
new file mode 100644
index 0000000..148bbe9
--- /dev/null
+++ b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp
@@ -0,0 +1,1080 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cmath>
+#include <complex>
+#include <map>
+
+#include "arm_compute/core/NEON/wrapper/traits.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+
+namespace arm_compute
+{
+namespace
+{
+// PI constant (from cmath)
+constexpr float kPi = float(M_PI);
+
+// Constant used in the fft_3 kernel: sqrt(3) / 2
+constexpr float kSqrt3Div2 = 0.866025403784438f;
+
+// Constants used in the fft_5 kernel
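+// kW5_0 = cos(2*pi/5), kW5_1 = sin(2*pi/5), kW5_2 = -cos(4*pi/5), kW5_3 = sin(4*pi/5)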
+constexpr float kW5_0 = 0.30901699437494f;
+constexpr float kW5_1 = 0.95105651629515f;
+constexpr float kW5_2 = 0.80901699437494f;
+constexpr float kW5_3 = 0.58778525229247f;
+
+// Constants used in the fft_7 kernel
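+// kW7_0 = cos(2*pi/7), kW7_1 = sin(2*pi/7), kW7_2 = -cos(4*pi/7), kW7_3 = sin(4*pi/7),
+// kW7_4 = -cos(6*pi/7), kW7_5 = sin(6*pi/7)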
+constexpr float kW7_0 = 0.62348980185873f;
+constexpr float kW7_1 = 0.78183148246802f;
+constexpr float kW7_2 = 0.22252093395631f;
+constexpr float kW7_3 = 0.97492791218182f;
+constexpr float kW7_4 = 0.90096886790241f;
+constexpr float kW7_5 = 0.43388373911755f;
+
+// Constant used in the fft_8 kernel: sqrt(2) / 2
+constexpr float kSqrt2Div2 = 0.707106781186548f;
+
+float32x2_t c_mul_neon(float32x2_t a, float32x2_t b)
+{
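+    // Complex multiply: res = { a.r * b.r - a.i * b.i, a.r * b.i + a.i * b.r },
+    // computed as a.r * b + a.i * (reverse(b) * { -1, 1 })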
+    using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type;
+
+    const float32x2_t mask = { -1.0f, 1.0f };
+    const float32x2_t tmp0 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
+    const float32x2_t tmp1 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
+
+    float32x2_t res = wrapper::vmul(tmp0, b);
+
+    b   = wrapper::vrev64(b);
+    b   = wrapper::vmul(b, mask);
+    res = wrapper::vmla(res, tmp1, b);
+
+    return res;
+}
+
+float32x2_t c_mul_neon_img(float32x2_t a, float img_constant)
+{
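+    // Multiply a by the purely imaginary number (i * img_constant)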
+    const float a_r = wrapper::vgetlane(a, 0);
+    const float a_i = wrapper::vgetlane(a, 1);
+
+    const auto out = wrapper::vmul(float32x2_t{ -a_i, a_r }, float32x2_t{ img_constant, img_constant });
+    return out;
+}
+
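+// Balanced (tree-shaped) sums keep the vector-add dependency chains short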
+float32x2_t reduce_sum_5(float32x2_t a, float32x2_t b, float32x2_t c, float32x2_t d, float32x2_t e)
+{
+    const auto t0 = wrapper::vadd(a, b);
+    const auto t1 = wrapper::vadd(c, d);
+    const auto t2 = wrapper::vadd(t0, t1);
+    return wrapper::vadd(t2, e);
+}
+
+float32x2_t reduce_sum_7(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7)
+{
+    const auto t0  = wrapper::vadd(x1, x2);
+    const auto t1  = wrapper::vadd(x3, x4);
+    const auto t2  = wrapper::vadd(x5, x6);
+    const auto t00 = wrapper::vadd(t0, t1);
+    const auto t01 = wrapper::vadd(t2, x7);
+
+    return wrapper::vadd(t00, t01);
+}
+
+float32x2_t reduce_sum_8(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7, float32x2_t x8)
+{
+    const auto t0  = wrapper::vadd(x1, x2);
+    const auto t1  = wrapper::vadd(x3, x4);
+    const auto t2  = wrapper::vadd(x5, x6);
+    const auto t3  = wrapper::vadd(x7, x8);
+    const auto t00 = wrapper::vadd(t0, t1);
+    const auto t01 = wrapper::vadd(t2, t3);
+
+    return wrapper::vadd(t00, t01);
+}
+
+void fft_2(float32x2_t &x, float32x2_t &y, float32x2_t &w)
+{
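+    // Radix-2 butterfly with the twiddle applied to the second input: x' = x + w*y, y' = x - w*y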
+    float32x2_t a = x;
+    float32x2_t b = c_mul_neon(w, y);
+
+    x = wrapper::vadd(a, b);
+    y = wrapper::vsub(a, b);
+}
+
+void fft_3(float32x2_t &x, float32x2_t &y, float32x2_t &z, const float32x2_t &w, const float32x2_t &w2)
+{
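+    // Radix-3 butterfly on the twiddled inputs: x = a + b + c, and
+    // y/z = a - (b + c)/2 -/+ i*(sqrt(3)/2)*(b - c)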
+    float32x2_t a = x;
+    float32x2_t b = c_mul_neon(w, y);
+    float32x2_t c = c_mul_neon(w2, z);
+
+    x = wrapper::vadd(a, b);
+    x = wrapper::vadd(x, c);
+
+    const auto v1 = wrapper::vmul(float32x2_t{ 0.5f, 0.5f }, wrapper::vadd(b, c));
+    const auto v2 = c_mul_neon(float32x2_t{ 0.f, -kSqrt3Div2 }, wrapper::vsub(b, c));
+
+    y = z = wrapper::vsub(a, v1);
+    y     = wrapper::vadd(y, v2);
+    z     = wrapper::vsub(z, v2);
+}
+
+void fft_4(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3)
+{
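+    // Radix-4 butterfly; the internal DFT-4 rotations (powers of -i) are applied
+    // via c_mul_neon_img and negation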
+    float32x2_t a = x1;
+    float32x2_t b = c_mul_neon(w, x2);
+    float32x2_t c = c_mul_neon(w2, x3);
+    float32x2_t d = c_mul_neon(w3, x4);
+
+    const auto x11 = wrapper::vadd(a, b);
+    const auto x12 = wrapper::vadd(c, d);
+    x1             = wrapper::vadd(x11, x12);
+
+    const auto x21 = wrapper::vadd(a, c_mul_neon_img(b, -1));
+    const auto x22 = wrapper::vadd(wrapper::vneg(c), c_mul_neon_img(d, 1.f));
+    x2             = wrapper::vadd(x21, x22);
+
+    const auto x31 = wrapper::vadd(a, wrapper::vneg(b));
+    const auto x32 = wrapper::vadd(c, wrapper::vneg(d));
+    x3             = wrapper::vadd(x31, x32);
+
+    const auto x41 = wrapper::vadd(a, c_mul_neon_img(b, 1));
+    const auto x42 = wrapper::vadd(wrapper::vneg(c), c_mul_neon_img(d, -1));
+    x4             = wrapper::vadd(x41, x42);
+}
+
+void fft_5(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3, const float32x2_t &w4)
+{
+    const auto a = x1;
+    const auto b = c_mul_neon(w, x2);
+    const auto c = c_mul_neon(w2, x3);
+    const auto d = c_mul_neon(w3, x4);
+    const auto e = c_mul_neon(w4, x5);
+
+    const auto b0 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, b);
+    const auto b1 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, b);
+    const auto b2 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, b);
+    const auto b3 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, b);
+
+    const auto c0 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, c);
+    const auto c1 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, c);
+    const auto c2 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, c);
+    const auto c3 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, c);
+
+    const auto d0 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, d);
+    const auto d1 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, d);
+    const auto d2 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, d);
+    const auto d3 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, d);
+
+    const auto e0 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, e);
+    const auto e1 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, e);
+    const auto e2 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, e);
+    const auto e3 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, e);
+
+    x1 = reduce_sum_5(a, b, c, d, e);
+    x2 = reduce_sum_5(a, b0, c0, d0, e0);
+    x3 = reduce_sum_5(a, b1, c1, d1, e1);
+    x4 = reduce_sum_5(a, b2, c2, d2, e2);
+    x5 = reduce_sum_5(a, b3, c3, d3, e3);
+}
+
+void fft_7(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, float32x2_t &x6, float32x2_t &x7, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3,
+           const float32x2_t &w4,
+           const float32x2_t &w5, const float32x2_t &w6)
+{
+    const auto a = x1;
+    const auto b = c_mul_neon(w, x2);
+    const auto c = c_mul_neon(w2, x3);
+    const auto d = c_mul_neon(w3, x4);
+    const auto e = c_mul_neon(w4, x5);
+    const auto f = c_mul_neon(w5, x6);
+    const auto g = c_mul_neon(w6, x7);
+
+    const auto b0 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, b);
+    const auto b1 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, b);
+    const auto b2 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, b);
+    const auto b3 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, b);
+    const auto b4 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, b);
+    const auto b5 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, b);
+
+    const auto c0 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, c);
+    const auto c1 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, c);
+    const auto c2 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, c);
+    const auto c3 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, c);
+    const auto c4 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, c);
+    const auto c5 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, c);
+
+    const auto d0 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, d);
+    const auto d1 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, d);
+    const auto d2 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, d);
+    const auto d3 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, d);
+    const auto d4 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, d);
+    const auto d5 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, d);
+
+    const auto e0 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, e);
+    const auto e1 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, e);
+    const auto e2 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, e);
+    const auto e3 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, e);
+    const auto e4 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, e);
+    const auto e5 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, e);
+
+    const auto f0 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, f);
+    const auto f1 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, f);
+    const auto f2 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, f);
+    const auto f3 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, f);
+    const auto f4 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, f);
+    const auto f5 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, f);
+
+    const auto g0 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, g);
+    const auto g1 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, g);
+    const auto g2 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, g);
+    const auto g3 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, g);
+    const auto g4 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, g);
+    const auto g5 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, g);
+
+    x1 = reduce_sum_7(a, b, c, d, e, f, g);
+    x2 = reduce_sum_7(a, b0, c0, d0, e0, f0, g0);
+    x3 = reduce_sum_7(a, b1, c1, d1, e1, f1, g1);
+    x4 = reduce_sum_7(a, b2, c2, d2, e2, f2, g2);
+    x5 = reduce_sum_7(a, b3, c3, d3, e3, f3, g3);
+    x6 = reduce_sum_7(a, b4, c4, d4, e4, f4, g4);
+    x7 = reduce_sum_7(a, b5, c5, d5, e5, f5, g5);
+}
+
+void fft_8(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, float32x2_t &x6, float32x2_t &x7, float32x2_t &x8, const float32x2_t &w, const float32x2_t &w2,
+           const float32x2_t &w3,
+           const float32x2_t &w4, const float32x2_t &w5, const float32x2_t &w6,
+           const float32x2_t &w7)
+{
+    const auto a = x1;
+    const auto b = c_mul_neon(w, x2);
+    const auto c = c_mul_neon(w2, x3);
+    const auto d = c_mul_neon(w3, x4);
+    const auto e = c_mul_neon(w4, x5);
+    const auto f = c_mul_neon(w5, x6);
+    const auto g = c_mul_neon(w6, x7);
+    const auto h = c_mul_neon(w7, x8);
+
+    const auto b0 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, b);
+    const auto b1 = c_mul_neon(float32x2_t{ 0, -1 }, b);
+    const auto b2 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, b);
+    const auto b3 = c_mul_neon(float32x2_t{ -1, 0 }, b);
+    const auto b4 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, b);
+    const auto b5 = c_mul_neon(float32x2_t{ 0, 1 }, b);
+    const auto b6 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, b);
+
+    const auto c0 = c_mul_neon(float32x2_t{ 0, -1 }, c);
+    const auto c1 = c_mul_neon(float32x2_t{ -1, 0 }, c);
+    const auto c2 = c_mul_neon(float32x2_t{ 0, 1 }, c);
+    const auto c3 = c_mul_neon(float32x2_t{ 1, 0 }, c);
+    const auto c4 = c_mul_neon(float32x2_t{ 0, -1 }, c);
+    const auto c5 = c_mul_neon(float32x2_t{ -1, 0 }, c);
+    const auto c6 = c_mul_neon(float32x2_t{ 0, 1 }, c);
+
+    const auto d0 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, d);
+    const auto d1 = c_mul_neon(float32x2_t{ 0, 1 }, d);
+    const auto d2 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, d);
+    const auto d3 = c_mul_neon(float32x2_t{ -1, 0 }, d);
+    const auto d4 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, d);
+    const auto d5 = c_mul_neon(float32x2_t{ 0, -1 }, d);
+    const auto d6 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, d);
+
+    const auto e0 = c_mul_neon(float32x2_t{ -1, 0 }, e);
+    const auto e1 = c_mul_neon(float32x2_t{ 1, 0 }, e);
+    const auto e2 = c_mul_neon(float32x2_t{ -1, 0 }, e);
+    const auto e3 = c_mul_neon(float32x2_t{ 1, 0 }, e);
+    const auto e4 = c_mul_neon(float32x2_t{ -1, 0 }, e);
+    const auto e5 = c_mul_neon(float32x2_t{ 1, 0 }, e);
+    const auto e6 = c_mul_neon(float32x2_t{ -1, 0 }, e);
+
+    const auto f0 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, f);
+    const auto f1 = c_mul_neon(float32x2_t{ 0, -1 }, f);
+    const auto f2 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, f);
+    const auto f3 = c_mul_neon(float32x2_t{ -1, 0 }, f);
+    const auto f4 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, f);
+    const auto f5 = c_mul_neon(float32x2_t{ 0, 1 }, f);
+    const auto f6 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, f);
+
+    const auto g0 = c_mul_neon(float32x2_t{ 0, 1 }, g);
+    const auto g1 = c_mul_neon(float32x2_t{ -1, 0 }, g);
+    const auto g2 = c_mul_neon(float32x2_t{ 0, -1 }, g);
+    const auto g3 = c_mul_neon(float32x2_t{ 1, 0 }, g);
+    const auto g4 = c_mul_neon(float32x2_t{ 0, 1 }, g);
+    const auto g5 = c_mul_neon(float32x2_t{ -1, 0 }, g);
+    const auto g6 = c_mul_neon(float32x2_t{ 0, -1 }, g);
+
+    const auto h0 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, h);
+    const auto h1 = c_mul_neon(float32x2_t{ 0, 1 }, h);
+    const auto h2 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, h);
+    const auto h3 = c_mul_neon(float32x2_t{ -1, 0 }, h);
+    const auto h4 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, h);
+    const auto h5 = c_mul_neon(float32x2_t{ 0, -1 }, h);
+    const auto h6 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, h);
+
+    x1 = reduce_sum_8(a, b, c, d, e, f, g, h);
+    x2 = reduce_sum_8(a, b0, c0, d0, e0, f0, g0, h0);
+    x3 = reduce_sum_8(a, b1, c1, d1, e1, f1, g1, h1);
+    x4 = reduce_sum_8(a, b2, c2, d2, e2, f2, g2, h2);
+    x5 = reduce_sum_8(a, b3, c3, d3, e3, f3, g3, h3);
+    x6 = reduce_sum_8(a, b4, c4, d4, e4, f4, g4, h4);
+    x7 = reduce_sum_8(a, b5, c5, d5, e5, f5, g5, h5);
+    x8 = reduce_sum_8(a, b6, c6, d6, e6, f6, g6, h6);
+}
+
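+// Stage kernels: each butterfly combines `radix` complex values spaced Nx complex
+// elements apart, with NxRadix = radix * Nx and N the total length along the FFT axis.
+// w starts at 1 and advances by w_m once per butterfly group j, so w = w_m^j; all
+// pointer offsets count interleaved floats (two per complex value).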
+template <bool first_stage>
+void fft_radix_2_axes_0(float *X, float *x, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+{
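+    // first_stage implies Nx == 1: consecutive butterfly inputs are adjacent complex
+    // values, so two of them can be moved with a single float32x4_t load/store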
+    float32x2_t w{ 1.0f, 0.0f };
+    for(unsigned int j = 0; j < Nx; j++)
+    {
+        for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+        {
+            auto a = float32x2_t{ 0, 0 };
+            auto b = float32x2_t{ 0, 0 };
+
+            // Load inputs
+            if(first_stage)
+            {
+                const auto ab = wrapper::vloadq(x + k);
+                a             = wrapper::vgetlow(ab);
+                b             = wrapper::vgethigh(ab);
+            }
+            else
+            {
+                a = wrapper::vload(x + k);
+                b = wrapper::vload(x + k + 2 * Nx);
+            }
+
+            // Base-case prime transform
+            fft_2(a, b, w);
+
+            // Write outputs
+            if(first_stage)
+            {
+                wrapper::vstore(X + k, wrapper::vcombine(a, b));
+            }
+            else
+            {
+                wrapper::vstore(X + k, a);
+                wrapper::vstore(X + k + 2 * Nx, b);
+            }
+        }
+
+        w = c_mul_neon(w, w_m);
+    }
+}
+
+void fft_radix_2_axes_1(float *X, float *x, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int M, unsigned int N)
+{
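+    // Axis-1 variant: elements along the transform axis are one row (M complex values)
+    // apart, hence the M * offset addressing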
+    float32x2_t w{ 1.0f, 0.0f };
+    for(unsigned int j = 0; j < Nx; j++)
+    {
+        for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+        {
+            // Load inputs
+            float32x2_t a = wrapper::vload(x + M * k);
+            float32x2_t b = wrapper::vload(x + M * (k + 2 * Nx));
+
+            // Base-case prime transform
+            fft_2(a, b, w);
+
+            // Write outputs
+            wrapper::vstore(X + M * k, a);
+            wrapper::vstore(X + M * (k + 2 * Nx), b);
+        }
+
+        w = c_mul_neon(w, w_m);
+    }
+}
+
+template <bool first_stage>
+void fft_radix_3_axes_0(float *X, float *x, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+{
+    float32x2_t w{ 1.0f, 0.0f };
+    for(unsigned int j = 0; j < Nx; j++)
+    {
+        const auto w2 = c_mul_neon(w, w);
+
+        for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+        {
+            // Load inputs
+            float32x2_t a = { 0, 0 };
+            float32x2_t b = { 0, 0 };
+            float32x2_t c = { 0, 0 };
+            if(first_stage)
+            {
+                const auto ab = wrapper::vloadq(x + k);
+                a             = wrapper::vgetlow(ab);
+                b             = wrapper::vgethigh(ab);
+            }
+            else
+            {
+                a = wrapper::vload(x + k);
+                b = wrapper::vload(x + k + 2 * Nx);
+            }
+            c = wrapper::vload(x + k + 4 * Nx);
+
+            // Base-case prime transform
+            fft_3(a, b, c, w, w2);
+
+            if(first_stage)
+            {
+                wrapper::vstore(X + k, wrapper::vcombine(a, b));
+            }
+            else
+            {
+                wrapper::vstore(X + k, a);
+                wrapper::vstore(X + k + 2 * Nx, b);
+            }
+            wrapper::vstore(X + k + 4 * Nx, c);
+        }
+        w = c_mul_neon(w, w_m);
+    }
+}
+
+void fft_radix_3_axes_1(float *X, float *x, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int M, unsigned int N)
+{
+    float32x2_t w{ 1.0f, 0.0f };
+    for(unsigned int j = 0; j < Nx; j++)
+    {
+        const auto w2 = c_mul_neon(w, w);
+
+        for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+        {
+            // Load inputs
+            float32x2_t a = wrapper::vload(x + M * k);
+            float32x2_t b = wrapper::vload(x + M * (k + 2 * Nx));
+            float32x2_t c = wrapper::vload(x + M * (k + 4 * Nx));
+
+            // Base-case prime transform
+            fft_3(a, b, c, w, w2);
+
+            // Store the output
+            wrapper::vstore(X + M * k, a);
+            wrapper::vstore(X + M * (k + 2 * Nx), b);
+            wrapper::vstore(X + M * (k + 4 * Nx), c);
+        }
+        w = c_mul_neon(w, w_m);
+    }
+}
+
+template <bool first_stage>
+void fft_radix_4_axes_0(float *X, float *x, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+{
+    float32x2_t w{ 1.0f, 0.0f };
+    for(unsigned int j = 0; j < Nx; j++)
+    {
+        const auto w2 = c_mul_neon(w, w);
+        const auto w3 = c_mul_neon(w2, w);
+
+        for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+        {
+            float32x2_t a = { 0, 0 };
+            float32x2_t b = { 0, 0 };
+            float32x2_t c = { 0, 0 };
+            float32x2_t d = { 0, 0 };
+            if(first_stage)
+            {
+                const auto ab = wrapper::vloadq(x + k);
+                const auto cd = wrapper::vloadq(x + k + 4 * Nx);
+                a             = wrapper::vgetlow(ab);
+                b             = wrapper::vgethigh(ab);
+                c             = wrapper::vgetlow(cd);
+                d             = wrapper::vgethigh(cd);
+            }
+            else
+            {
+                // Load inputs
+                a = wrapper::vload(x + k);
+                b = wrapper::vload(x + k + 2 * Nx);
+                c = wrapper::vload(x + k + 4 * Nx);
+                d = wrapper::vload(x + k + 6 * Nx);
+            }
+
+            // Base-case prime transform
+            fft_4(a, b, c, d, w, w2, w3);
+
+            if(first_stage)
+            {
+                wrapper::vstore(X + k, wrapper::vcombine(a, b));
+                wrapper::vstore(X + k + 4 * Nx, wrapper::vcombine(c, d));
+            }
+            else
+            {
+                wrapper::vstore(X + k, a);
+                wrapper::vstore(X + k + 2 * Nx, b);
+                wrapper::vstore(X + k + 4 * Nx, c);
+                wrapper::vstore(X + k + 6 * Nx, d);
+            }
+        }
+
+        w = c_mul_neon(w, w_m);
+    }
+}
+
+void fft_radix_4_axes_1(float *X, float *x, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int M, unsigned int N)
+{
+    float32x2_t w{ 1.0f, 0.0f };
+    for(unsigned int j = 0; j < Nx; j++)
+    {
+        const auto w2 = c_mul_neon(w, w);
+        const auto w3 = c_mul_neon(w2, w);
+
+        for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+        {
+            // Load inputs
+            float32x2_t a = wrapper::vload(x + M * k);
+            float32x2_t b = wrapper::vload(x + M * (k + 2 * Nx));
+            float32x2_t c = wrapper::vload(x + M * (k + 4 * Nx));
+            float32x2_t d = wrapper::vload(x + M * (k + 6 * Nx));
+
+            // Base-case prime transform
+            fft_4(a, b, c, d, w, w2, w3);
+
+            wrapper::vstore(X + M * k, a);
+            wrapper::vstore(X + M * (k + 2 * Nx), b);
+            wrapper::vstore(X + M * (k + 4 * Nx), c);
+            wrapper::vstore(X + M * (k + 6 * Nx), d);
+        }
+
+        w = c_mul_neon(w, w_m);
+    }
+}
+
+template <bool first_stage>
+void fft_radix_5_axes_0(float *X, float *x, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+{
+    float32x2_t w{ 1.0f, 0.0f };
+    for(unsigned int j = 0; j < Nx; j++)
+    {
+        const float32x2_t w2 = c_mul_neon(w, w);
+        const float32x2_t w3 = c_mul_neon(w2, w);
+        const float32x2_t w4 = c_mul_neon(w3, w);
+
+        for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+        {
+            float32x2_t a = { 0, 0 };
+            float32x2_t b = { 0, 0 };
+            float32x2_t c = { 0, 0 };
+            float32x2_t d = { 0, 0 };
+            float32x2_t e = { 0, 0 };
+
+            // Load inputs
+            if(first_stage)
+            {
+                const auto ab = wrapper::vloadq(x + k);
+                const auto cd = wrapper::vloadq(x + k + 4 * Nx);
+
+                a = wrapper::vgetlow(ab);
+                b = wrapper::vgethigh(ab);
+                c = wrapper::vgetlow(cd);
+                d = wrapper::vgethigh(cd);
+            }
+            else
+            {
+                a = wrapper::vload(x + k);
+                b = wrapper::vload(x + k + 2 * Nx);
+                c = wrapper::vload(x + k + 4 * Nx);
+                d = wrapper::vload(x + k + 6 * Nx);
+            }
+            e = wrapper::vload(x + k + 8 * Nx);
+
+            // Base-case prime transform
+            fft_5(a, b, c, d, e, w, w2, w3, w4);
+
+            // Store outputs
+            if(first_stage)
+            {
+                wrapper::vstore(X + k, wrapper::vcombine(a, b));
+                wrapper::vstore(X + k + 4 * Nx, wrapper::vcombine(c, d));
+            }
+            else
+            {
+                wrapper::vstore(X + k, a);
+                wrapper::vstore(X + k + 2 * Nx, b);
+                wrapper::vstore(X + k + 4 * Nx, c);
+                wrapper::vstore(X + k + 6 * Nx, d);
+            }
+            wrapper::vstore(X + k + 8 * Nx, e);
+        }
+
+        w = c_mul_neon(w, w_m);
+    }
+}
+
+void fft_radix_5_axes_1(float *X, float *x, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int M, unsigned int N)
+{
+    float32x2_t w{ 1.0f, 0.0f };
+    for(unsigned int j = 0; j < Nx; j++)
+    {
+        const float32x2_t w2 = c_mul_neon(w, w);
+        const float32x2_t w3 = c_mul_neon(w2, w);
+        const float32x2_t w4 = c_mul_neon(w3, w);
+
+        for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+        {
+            // Load inputs
+            float32x2_t a = wrapper::vload(x + M * k);
+            float32x2_t b = wrapper::vload(x + M * (k + 2 * Nx));
+            float32x2_t c = wrapper::vload(x + M * (k + 4 * Nx));
+            float32x2_t d = wrapper::vload(x + M * (k + 6 * Nx));
+            float32x2_t e = wrapper::vload(x + M * (k + 8 * Nx));
+
+            // Base-case prime transform
+            fft_5(a, b, c, d, e, w, w2, w3, w4);
+
+            // Store outputs
+            wrapper::vstore(X + M * k, a);
+            wrapper::vstore(X + M * (k + 2 * Nx), b);
+            wrapper::vstore(X + M * (k + 4 * Nx), c);
+            wrapper::vstore(X + M * (k + 6 * Nx), d);
+            wrapper::vstore(X + M * (k + 8 * Nx), e);
+        }
+
+        w = c_mul_neon(w, w_m);
+    }
+}
+
+template <bool first_stage>
+void fft_radix_7_axes_0(float *X, float *x, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+{
+    float32x2_t w{ 1.0f, 0.0f };
+    for(unsigned int j = 0; j < Nx; j++)
+    {
+        const float32x2_t w2 = c_mul_neon(w, w);
+        const float32x2_t w3 = c_mul_neon(w2, w);
+        const float32x2_t w4 = c_mul_neon(w3, w);
+        const float32x2_t w5 = c_mul_neon(w4, w);
+        const float32x2_t w6 = c_mul_neon(w5, w);
+
+        for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+        {
+            float32x2_t a = { 0, 0 };
+            float32x2_t b = { 0, 0 };
+            float32x2_t c = { 0, 0 };
+            float32x2_t d = { 0, 0 };
+            float32x2_t e = { 0, 0 };
+            float32x2_t f = { 0, 0 };
+            float32x2_t g = { 0, 0 };
+
+            // Load inputs
+            if(first_stage)
+            {
+                const auto ab = wrapper::vloadq(x + k);
+                const auto cd = wrapper::vloadq(x + k + 4 * Nx);
+                const auto ef = wrapper::vloadq(x + k + 8 * Nx);
+
+                a = wrapper::vgetlow(ab);
+                b = wrapper::vgethigh(ab);
+                c = wrapper::vgetlow(cd);
+                d = wrapper::vgethigh(cd);
+                e = wrapper::vgetlow(ef);
+                f = wrapper::vgethigh(ef);
+            }
+            else
+            {
+                a = wrapper::vload(x + k);
+                b = wrapper::vload(x + k + 2 * Nx);
+                c = wrapper::vload(x + k + 4 * Nx);
+                d = wrapper::vload(x + k + 6 * Nx);
+                e = wrapper::vload(x + k + 8 * Nx);
+                f = wrapper::vload(x + k + 10 * Nx);
+            }
+            g = wrapper::vload(x + k + 12 * Nx);
+
+            // Base-case prime transform
+            fft_7(a, b, c, d, e, f, g, w, w2, w3, w4, w5, w6);
+
+            if(first_stage)
+            {
+                wrapper::vstore(X + k, wrapper::vcombine(a, b));
+                wrapper::vstore(X + k + 4 * Nx, wrapper::vcombine(c, d));
+                wrapper::vstore(X + k + 8 * Nx, wrapper::vcombine(e, f));
+            }
+            else
+            {
+                wrapper::vstore(X + k, a);
+                wrapper::vstore(X + k + 2 * Nx, b);
+                wrapper::vstore(X + k + 4 * Nx, c);
+                wrapper::vstore(X + k + 6 * Nx, d);
+                wrapper::vstore(X + k + 8 * Nx, e);
+                wrapper::vstore(X + k + 10 * Nx, f);
+            }
+            wrapper::vstore(X + k + 12 * Nx, g);
+        }
+
+        w = c_mul_neon(w, w_m);
+    }
+}
+
+void fft_radix_7_axes_1(float *X, float *x, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int M, unsigned int N)
+{
+    float32x2_t w{ 1.0f, 0.0f };
+    for(unsigned int j = 0; j < Nx; j++)
+    {
+        const float32x2_t w2 = c_mul_neon(w, w);
+        const float32x2_t w3 = c_mul_neon(w2, w);
+        const float32x2_t w4 = c_mul_neon(w3, w);
+        const float32x2_t w5 = c_mul_neon(w4, w);
+        const float32x2_t w6 = c_mul_neon(w5, w);
+
+        for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+        {
+            // Load inputs
+            float32x2_t a = wrapper::vload(x + M * k);
+            float32x2_t b = wrapper::vload(x + M * (k + 2 * Nx));
+            float32x2_t c = wrapper::vload(x + M * (k + 4 * Nx));
+            float32x2_t d = wrapper::vload(x + M * (k + 6 * Nx));
+            float32x2_t e = wrapper::vload(x + M * (k + 8 * Nx));
+            float32x2_t f = wrapper::vload(x + M * (k + 10 * Nx));
+            float32x2_t g = wrapper::vload(x + M * (k + 12 * Nx));
+
+            // Base-case prime transform
+            fft_7(a, b, c, d, e, f, g, w, w2, w3, w4, w5, w6);
+
+            // Store outputs
+            wrapper::vstore(X + M * k, a);
+            wrapper::vstore(X + M * (k + 2 * Nx), b);
+            wrapper::vstore(X + M * (k + 4 * Nx), c);
+            wrapper::vstore(X + M * (k + 6 * Nx), d);
+            wrapper::vstore(X + M * (k + 8 * Nx), e);
+            wrapper::vstore(X + M * (k + 10 * Nx), f);
+            wrapper::vstore(X + M * (k + 12 * Nx), g);
+        }
+
+        w = c_mul_neon(w, w_m);
+    }
+}
+
+template <bool first_stage>
+void fft_radix_8_axes_0(float *X, float *x, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+{
+    float32x2_t w{ 1.0f, 0.0f };
+    for(unsigned int j = 0; j < Nx; j++)
+    {
+        const float32x2_t w2 = c_mul_neon(w, w);
+        const float32x2_t w3 = c_mul_neon(w2, w);
+        const float32x2_t w4 = c_mul_neon(w3, w);
+        const float32x2_t w5 = c_mul_neon(w4, w);
+        const float32x2_t w6 = c_mul_neon(w5, w);
+        const float32x2_t w7 = c_mul_neon(w6, w);
+
+        for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+        {
+            float32x2_t a = { 0, 0 };
+            float32x2_t b = { 0, 0 };
+            float32x2_t c = { 0, 0 };
+            float32x2_t d = { 0, 0 };
+            float32x2_t e = { 0, 0 };
+            float32x2_t f = { 0, 0 };
+            float32x2_t g = { 0, 0 };
+            float32x2_t h = { 0, 0 };
+
+            // Load inputs
+            if(first_stage)
+            {
+                const auto ab = wrapper::vloadq(x + k);
+                const auto cd = wrapper::vloadq(x + k + 4 * Nx);
+                const auto ef = wrapper::vloadq(x + k + 8 * Nx);
+                const auto gh = wrapper::vloadq(x + k + 12 * Nx);
+
+                a = wrapper::vgetlow(ab);
+                b = wrapper::vgethigh(ab);
+                c = wrapper::vgetlow(cd);
+                d = wrapper::vgethigh(cd);
+                e = wrapper::vgetlow(ef);
+                f = wrapper::vgethigh(ef);
+                g = wrapper::vgetlow(gh);
+                h = wrapper::vgethigh(gh);
+            }
+            else
+            {
+                a = wrapper::vload(x + k);
+                b = wrapper::vload(x + k + 2 * Nx);
+                c = wrapper::vload(x + k + 4 * Nx);
+                d = wrapper::vload(x + k + 6 * Nx);
+                e = wrapper::vload(x + k + 8 * Nx);
+                f = wrapper::vload(x + k + 10 * Nx);
+                g = wrapper::vload(x + k + 12 * Nx);
+                h = wrapper::vload(x + k + 14 * Nx);
+            }
+
+            // Base-case prime transform (with twiddle factors applied)
+            fft_8(a, b, c, d, e, f, g, h, w, w2, w3, w4, w5, w6, w7);
+
+            // Store outputs
+            if(first_stage)
+            {
+                wrapper::vstore(X + k, wrapper::vcombine(a, b));
+                wrapper::vstore(X + k + 4 * Nx, wrapper::vcombine(c, d));
+                wrapper::vstore(X + k + 8 * Nx, wrapper::vcombine(e, f));
+                wrapper::vstore(X + k + 12 * Nx, wrapper::vcombine(g, h));
+            }
+            else
+            {
+                wrapper::vstore(X + k, a);
+                wrapper::vstore(X + k + 2 * Nx, b);
+                wrapper::vstore(X + k + 4 * Nx, c);
+                wrapper::vstore(X + k + 6 * Nx, d);
+                wrapper::vstore(X + k + 8 * Nx, e);
+                wrapper::vstore(X + k + 10 * Nx, f);
+                wrapper::vstore(X + k + 12 * Nx, g);
+                wrapper::vstore(X + k + 14 * Nx, h);
+            }
+        }
+
+        w = c_mul_neon(w, w_m);
+    }
+}
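
The first_stage specialisation above works because at the first radix stage Nx is 1, so consecutive butterfly inputs sit in adjacent memory and two complex values fit in one 128-bit load. A minimal stand-alone sketch of that equivalence, using the raw NEON intrinsics that the wrapper:: helpers map onto (illustrative, not part of the kernel):

```cpp
#include <arm_neon.h>

// With Nx == 1 the offsets 2*Nx, 4*Nx, ... address adjacent complex values,
// so one q-register load replaces two d-register loads.
void load_pair(const float *x, float32x2_t &a, float32x2_t &b)
{
    const float32x4_t ab = vld1q_f32(x); // { re0, im0, re1, im1 }
    a = vget_low_f32(ab);                // first complex value
    b = vget_high_f32(ab);               // second complex value
}
```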
+
+void fft_radix_8_axes_1(float *X, float *x, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int M, unsigned int N)
+{
+    float32x2_t w{ 1.0f, 0.0f };
+    for(unsigned int j = 0; j < Nx; j++)
+    {
+        const float32x2_t w2 = c_mul_neon(w, w);
+        const float32x2_t w3 = c_mul_neon(w2, w);
+        const float32x2_t w4 = c_mul_neon(w3, w);
+        const float32x2_t w5 = c_mul_neon(w4, w);
+        const float32x2_t w6 = c_mul_neon(w5, w);
+        const float32x2_t w7 = c_mul_neon(w6, w);
+
+        for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+        {
+            // Load inputs
+            float32x2_t a = wrapper::vload(x + M * k);
+            float32x2_t b = wrapper::vload(x + M * (k + 2 * Nx));
+            float32x2_t c = wrapper::vload(x + M * (k + 4 * Nx));
+            float32x2_t d = wrapper::vload(x + M * (k + 6 * Nx));
+            float32x2_t e = wrapper::vload(x + M * (k + 8 * Nx));
+            float32x2_t f = wrapper::vload(x + M * (k + 10 * Nx));
+            float32x2_t g = wrapper::vload(x + M * (k + 12 * Nx));
+            float32x2_t h = wrapper::vload(x + M * (k + 14 * Nx));
+
+            // Base-case prime transform
+            fft_8(a, b, c, d, e, f, g, h, w, w2, w3, w4, w5, w6, w7);
+
+            // Store outputs
+            wrapper::vstore(X + M * k, a);
+            wrapper::vstore(X + M * (k + 2 * Nx), b);
+            wrapper::vstore(X + M * (k + 4 * Nx), c);
+            wrapper::vstore(X + M * (k + 6 * Nx), d);
+            wrapper::vstore(X + M * (k + 8 * Nx), e);
+            wrapper::vstore(X + M * (k + 10 * Nx), f);
+            wrapper::vstore(X + M * (k + 12 * Nx), g);
+            wrapper::vstore(X + M * (k + 14 * Nx), h);
+        }
+
+        w = c_mul_neon(w, w_m);
+    }
+}
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(config.axis > 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(NEFFTRadixStageKernel::supported_radix().count(config.radix) == 0);
+
+    // Checks performed when output is configured
+    if((output != nullptr) && (output->total_size() != 0))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+{
+    ARM_COMPUTE_UNUSED(config);
+
+    if(output != nullptr)
+    {
+        auto_init_if_empty(*output, *input);
+    }
+
+    Window win = calculate_max_window(*input, Steps());
+    if(output != nullptr)
+    {
+        output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+    }
+
+    return std::make_pair(Status{}, win);
+}
+} // namespace
+
+NEFFTRadixStageKernel::NEFFTRadixStageKernel()
+    : _input(nullptr), _output(nullptr), _run_in_place(false), _Nx(0), _axis(0), _radix(0), _func_0(), _func_1()
+{
+}
+
+void NEFFTRadixStageKernel::set_radix_stage_axis0(const FFTRadixStageKernelInfo &config)
+{
+    // FFT table axis 0: [radix, first_stage]
+    static std::map<unsigned int, std::map<bool, FFTFunctionPointerAxis0>> fft_table_axis0;
+
+    if(fft_table_axis0.empty())
+    {
+        fft_table_axis0[2][false] = &fft_radix_2_axes_0<false>;
+        fft_table_axis0[3][false] = &fft_radix_3_axes_0<false>;
+        fft_table_axis0[4][false] = &fft_radix_4_axes_0<false>;
+        fft_table_axis0[5][false] = &fft_radix_5_axes_0<false>;
+        fft_table_axis0[7][false] = &fft_radix_7_axes_0<false>;
+        fft_table_axis0[8][false] = &fft_radix_8_axes_0<false>;
+
+        fft_table_axis0[2][true] = &fft_radix_2_axes_0<true>;
+        fft_table_axis0[3][true] = &fft_radix_3_axes_0<true>;
+        fft_table_axis0[4][true] = &fft_radix_4_axes_0<true>;
+        fft_table_axis0[5][true] = &fft_radix_5_axes_0<true>;
+        fft_table_axis0[7][true] = &fft_radix_7_axes_0<true>;
+        fft_table_axis0[8][true] = &fft_radix_8_axes_0<true>;
+    }
+
+    _func_0 = fft_table_axis0[config.radix][config.is_first_stage];
+}
+
+void NEFFTRadixStageKernel::set_radix_stage_axis1(const FFTRadixStageKernelInfo &config)
+{
+    // FFT table axis 1: [radix]
+    static std::map<unsigned int, FFTFunctionPointerAxis1> fft_table_axis1;
+
+    if(fft_table_axis1.empty())
+    {
+        fft_table_axis1[2] = &fft_radix_2_axes_1;
+        fft_table_axis1[3] = &fft_radix_3_axes_1;
+        fft_table_axis1[4] = &fft_radix_4_axes_1;
+        fft_table_axis1[5] = &fft_radix_5_axes_1;
+        fft_table_axis1[7] = &fft_radix_7_axes_1;
+        fft_table_axis1[8] = &fft_radix_8_axes_1;
+    }
+
+    _func_1 = fft_table_axis1[config.radix];
+}
+
+void NEFFTRadixStageKernel::configure(ITensor *input, ITensor *output, const FFTRadixStageKernelInfo &config)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+    // Output auto-initialization if not yet initialized
+    if(output != nullptr)
+    {
+        auto_init_if_empty(*output->info(), *input->info()->clone());
+    }
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config));
+
+    _input        = input;
+    _output       = output;
+    _run_in_place = (output == nullptr) || (output == input);
+    _Nx           = config.Nx;
+    _axis         = config.axis;
+    _radix        = config.radix;
+
+    switch(config.axis)
+    {
+        case 0:
+            set_radix_stage_axis0(config);
+            break;
+        case 1:
+            set_radix_stage_axis1(config);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Axis not supported");
+            break;
+    }
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info(), config);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
+
+Status NEFFTRadixStageKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+{
+    const bool run_in_place = (output == nullptr) || (output == input);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, config));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
+                                                              (run_in_place) ? nullptr : output->clone().get(),
+                                                              config)
+                                .first);
+
+    return Status{};
+}
+
+std::set<unsigned int> NEFFTRadixStageKernel::supported_radix()
+{
+    return std::set<unsigned int> { 2, 3, 4, 5, 7, 8 };
+}
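
supported_radix() advertises the butterfly sizes the kernel implements; the surrounding FFT function has to factor the transform length into a sequence of these radix stages. A hedged sketch of such a decomposition, greedy from the largest radix (the in-tree scheduler may differ):

```cpp
#include <set>
#include <vector>

std::vector<unsigned int> decompose(unsigned int n, const std::set<unsigned int> &radixes)
{
    std::vector<unsigned int> stages;
    for(auto r = radixes.rbegin(); r != radixes.rend(); ++r) // largest radix first
    {
        while(n % *r == 0)
        {
            stages.push_back(*r);
            n /= *r;
        }
    }
    return (n == 1) ? stages : std::vector<unsigned int>{}; // empty => length unsupported
}
// decompose(56, { 2, 3, 4, 5, 7, 8 }) yields { 8, 7 }: a radix-8 stage then a radix-7 stage.
```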
+
+void NEFFTRadixStageKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    ARM_COMPUTE_UNUSED(info);
+
+    Window input_window = window;
+    input_window.set(_axis, 0);
+
+    Iterator in(_input, input_window);
+    Iterator out(_run_in_place ? _input : _output, input_window);
+
+    // Precompute FFT constants
+    const unsigned int NxRadix = _radix * _Nx;
+    const float        alpha   = 2.0f * kPi / float(NxRadix);
+    const float32x2_t  w_m{ cosf(alpha), -sinf(alpha) };
+
+    if(_axis == 0)
+    {
+        const unsigned int N = _input->info()->dimension(0);
+        execute_window_loop(input_window, [&](const Coordinates &)
+        {
+            _func_0(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, NxRadix, w_m, N);
+        },
+        in, out);
+    }
+    else
+    {
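+        // Along axis 1 the stride between consecutive elements is dimension(0) and the
+        // transform length is dimension(1); they map to the stage functions' (M, N)
+        // parameters, hence the swapped argument order in the call below.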
+        const unsigned int N = _input->info()->dimension(0);
+        const unsigned int M = _input->info()->dimension(1);
+        execute_window_loop(input_window, [&](const Coordinates &)
+        {
+            _func_1(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, NxRadix, w_m, N, M);
+        },
+        in, out);
+    }
+
+}
+} // namespace arm_compute
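
run() seeds each stage with w_m = (cos a, -sin a), a = 2*pi/(radix*Nx), and the stage functions advance the twiddle with w = c_mul_neon(w, w_m), i.e. w_j = e^(-2*pi*i*j/(radix*Nx)). c_mul_neon itself is defined earlier in this file; a stand-alone scalar-lane sketch of the arithmetic it performs (illustrative, assuming the usual complex-multiply definition):

```cpp
#include <arm_neon.h>

// (ar + i*ai) * (br + i*bi) = (ar*br - ai*bi) + i*(ar*bi + ai*br)
float32x2_t c_mul_ref(float32x2_t a, float32x2_t b)
{
    const float ar = vget_lane_f32(a, 0), ai = vget_lane_f32(a, 1);
    const float br = vget_lane_f32(b, 0), bi = vget_lane_f32(b, 1);
    return float32x2_t{ ar * br - ai * bi, ar * bi + ai * br };
}
```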
diff --git a/src/core/NEON/kernels/NEFFTScaleKernel.cpp b/src/core/NEON/kernels/NEFFTScaleKernel.cpp
new file mode 100644
index 0000000..56703ba
--- /dev/null
+++ b/src/core/NEON/kernels/NEFFTScaleKernel.cpp
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEFFTScaleKernel.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace
+{
+void scale_complex(float *c_in, float *c_out, bool is_conjugate, float scale)
+{
+    const auto a = wrapper::vload(c_in);
+    auto       b = wrapper::vdiv(a, float32x2_t{ scale, scale });
+    if(is_conjugate)
+    {
+        const float img_part = wrapper::vgetlane(b, 1);
+        b                    = wrapper::vsetlane(-img_part, b, 1);
+    }
+
+    wrapper::vstore(c_out, b);
+}
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F32);
+
+    // Checks performed when output is configured
+    if((output != nullptr) && (output->total_size() != 0))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+    // Configure kernel window
+    Window win = calculate_max_window(*input, Steps());
+
+    if(output != nullptr)
+    {
+        // Output auto-initialization if not yet initialized
+        auto_init_if_empty(*output, *input->clone());
+
+        // NEFFTScaleKernel doesn't need padding so update_window_and_padding() can be skipped
+        Coordinates coord;
+        coord.set_num_dimensions(output->num_dimensions());
+        output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
+    }
+
+    return std::make_pair(Status{}, win);
+}
+} // namespace
+
+NEFFTScaleKernel::NEFFTScaleKernel()
+    : _input(nullptr), _output(nullptr), _scale(), _run_in_place(false), _is_conj(false)
+{
+}
+
+void NEFFTScaleKernel::configure(ITensor *input, ITensor *output, const FFTScaleKernelInfo &config)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr));
+
+    _input        = input;
+    _output       = output;
+    _run_in_place = (output == nullptr) || (output == input);
+    _is_conj      = config.conjugate;
+    _scale        = config.scale;
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), _run_in_place ? nullptr : output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
+
+Status NEFFTScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const FFTScaleKernelInfo &config)
+{
+    ARM_COMPUTE_UNUSED(config);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+
+    return Status{};
+}
+
+void NEFFTScaleKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    ARM_COMPUTE_UNUSED(info);
+
+    Window input_window = window;
+    input_window.set(Window::DimX, 0);
+
+    Iterator in(_input, input_window);
+    Iterator out(_run_in_place ? _input : _output, input_window);
+
+    execute_window_loop(window, [&](const Coordinates &)
+    {
+        scale_complex(reinterpret_cast<float *>(in.ptr()), reinterpret_cast<float *>(out.ptr()), _is_conj, _scale);
+    },
+    in, out);
+}
+} // namespace arm_compute
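
scale_complex() divides each complex element by the scale and optionally negates its imaginary part. That combination is exactly what the standard inverse-FFT identity ifft(X) = conj(fft(conj(X))) / N needs as its final step; a hedged reference in std::complex terms (not the kernel itself):

```cpp
#include <complex>
#include <vector>

// Final step of ifft(X) = conj(fft(conj(X))) / N: per element, divide by the
// scale and conjugate, the same work scale_complex() performs on NEON lanes.
std::vector<std::complex<float>> finish_inverse(std::vector<std::complex<float>> v, float scale)
{
    for(auto &c : v)
    {
        c = std::conj(c) / scale;
    }
    return v;
}
```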
diff --git a/src/core/NEON/kernels/NEFastCornersKernel.cpp b/src/core/NEON/kernels/NEFastCornersKernel.cpp
index 919efd2..81bcc8b 100644
--- a/src/core/NEON/kernels/NEFastCornersKernel.cpp
+++ b/src/core/NEON/kernels/NEFastCornersKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -49,32 +49,30 @@
 {
     ARM_COMPUTE_ERROR_ON(k >= PERMUTATIONS);
 
-    static const uint8_t permutations_table[PERMUTATIONS][PERM_SIZE]
-    {
-        { 0, 1, 2, 3, 4, 5, 6, 7, 8, 255, 255, 255, 255, 255, 255, 255 },
-        { 15, 0, 1, 2, 3, 4, 5, 6, 7, 255, 255, 255, 255, 255, 255, 255 },
-        { 14, 15, 0, 1, 2, 3, 4, 5, 6, 255, 255, 255, 255, 255, 255, 255 },
-        { 13, 14, 15, 0, 1, 2, 3, 4, 5, 255, 255, 255, 255, 255, 255, 255 },
-        { 12, 13, 14, 15, 0, 1, 2, 3, 4, 255, 255, 255, 255, 255, 255, 255 },
-        { 11, 12, 13, 14, 15, 0, 1, 2, 3, 255, 255, 255, 255, 255, 255, 255 },
-        { 10, 11, 12, 13, 14, 15, 0, 1, 2, 255, 255, 255, 255, 255, 255, 255 },
-        { 9, 10, 11, 12, 13, 14, 15, 0, 1, 255, 255, 255, 255, 255, 255, 255 },
-        { 8, 9, 10, 11, 12, 13, 14, 15, 0, 255, 255, 255, 255, 255, 255, 255 },
-        { 7, 8, 9, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255 },
-        { 6, 7, 8, 9, 10, 11, 12, 13, 14, 255, 255, 255, 255, 255, 255, 255 },
-        { 5, 6, 7, 8, 9, 10, 11, 12, 13, 255, 255, 255, 255, 255, 255, 255 },
-        { 4, 5, 6, 7, 8, 9, 10, 11, 12, 255, 255, 255, 255, 255, 255, 255 },
-        { 3, 4, 5, 6, 7, 8, 9, 10, 11, 255, 255, 255, 255, 255, 255, 255 },
-        { 2, 3, 4, 5, 6, 7, 8, 9, 10, 255, 255, 255, 255, 255, 255, 255 },
-        { 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255, 255 }
+    static const std::array<std::array<uint8_t, PERM_SIZE>, PERMUTATIONS> permutations_table{ { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 255, 255, 255, 255, 255, 255, 255 },
+            { 15, 0, 1, 2, 3, 4, 5, 6, 7, 255, 255, 255, 255, 255, 255, 255 },
+            { 14, 15, 0, 1, 2, 3, 4, 5, 6, 255, 255, 255, 255, 255, 255, 255 },
+            { 13, 14, 15, 0, 1, 2, 3, 4, 5, 255, 255, 255, 255, 255, 255, 255 },
+            { 12, 13, 14, 15, 0, 1, 2, 3, 4, 255, 255, 255, 255, 255, 255, 255 },
+            { 11, 12, 13, 14, 15, 0, 1, 2, 3, 255, 255, 255, 255, 255, 255, 255 },
+            { 10, 11, 12, 13, 14, 15, 0, 1, 2, 255, 255, 255, 255, 255, 255, 255 },
+            { 9, 10, 11, 12, 13, 14, 15, 0, 1, 255, 255, 255, 255, 255, 255, 255 },
+            { 8, 9, 10, 11, 12, 13, 14, 15, 0, 255, 255, 255, 255, 255, 255, 255 },
+            { 7, 8, 9, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255 },
+            { 6, 7, 8, 9, 10, 11, 12, 13, 14, 255, 255, 255, 255, 255, 255, 255 },
+            { 5, 6, 7, 8, 9, 10, 11, 12, 13, 255, 255, 255, 255, 255, 255, 255 },
+            { 4, 5, 6, 7, 8, 9, 10, 11, 12, 255, 255, 255, 255, 255, 255, 255 },
+            { 3, 4, 5, 6, 7, 8, 9, 10, 11, 255, 255, 255, 255, 255, 255, 255 },
+            { 2, 3, 4, 5, 6, 7, 8, 9, 10, 255, 255, 255, 255, 255, 255, 255 },
+            { 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255, 255 }
-    };
+        } };
 
     const uint8x8x2_t index =
     {
         {
-            vld1_u8(permutations_table[k]),
-            vld1_u8(permutations_table[k] + 8)
+            vld1_u8(permutations_table[k].data()),
+            vld1_u8(permutations_table[k].data() + 8)
         }
     };
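
For reference, each row of permutations_table is one of the 16 rotations of 9 contiguous indices on the 16-point Bresenham circle, padded with 255 so the vtbl lookups return 0 for the unused lanes. A hedged generator that reproduces the rows above (illustrative only):

```cpp
#include <array>
#include <cstdint>

std::array<uint8_t, 16> make_permutation(unsigned int k)
{
    std::array<uint8_t, 16> row{};
    row.fill(255);
    for(unsigned int i = 0; i < 9; ++i)
    {
        row[i] = static_cast<uint8_t>((16 - k + i) % 16); // rotate the 9-texel arc start by k
    }
    return row;
}
// make_permutation(1) == { 15, 0, 1, 2, 3, 4, 5, 6, 7, 255, ... }
```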
 
@@ -112,7 +110,7 @@
         . . 9 8 7 . . .
 
     */
-    static const uint8_t top_right[8] =
+    static const std::array<uint8_t, 8> top_right =
     {
         /* The register r.val[0] will be used to retrieve these texels:
         . . . 0 1 . . .
@@ -130,7 +128,7 @@
         255
     };
 
-    static const uint8_t bottom_right[8] =
+    static const std::array<uint8_t, 8> bottom_right =
     {
         /* The register r.val[1] will be used to retrieve these texels:
         . . . . . . 5 .
@@ -147,7 +145,7 @@
         20 /* low table, third row, elem 5, value 7 in the diagram above*/
     };
 
-    static const uint8_t top_left[8] =
+    static const std::array<uint8_t, 8> top_left =
     {
         /* The register r.val[2] will be used to retrieve these texels:
         . . F . . . . .
@@ -165,7 +163,7 @@
         2 /* top table, first row, elem 3, value F in the diagram above*/
     };
 
-    static const uint8_t bottom_left[8] =
+    static const std::array<uint8_t, 8> bottom_left =
     {
         /* The register r.val[3] will be used to retrieve these texels:
         B . . . . . . .
@@ -185,10 +183,10 @@
     const uint8x8x4_t reg =
     {
         {
-            vld1_u8(top_right),
-            vld1_u8(bottom_right),
-            vld1_u8(top_left),
-            vld1_u8(bottom_left)
+            vld1_u8(top_right.data()),
+            vld1_u8(bottom_right.data()),
+            vld1_u8(top_left.data()),
+            vld1_u8(bottom_left.data())
         }
     };
 
@@ -268,7 +266,7 @@
     return is_permutation_brighter(permutation, pg) || is_permutation_darker(permutation, pl);
 }
 
-inline bool point_is_fast_corner(uint8_t p, uint8_t threshold, const uint8x8x2_t &tbl_circle_texels, uint8x8x2_t perm_indices[PERMUTATIONS])
+inline bool point_is_fast_corner(uint8_t p, uint8_t threshold, const uint8x8x2_t &tbl_circle_texels, std::array<uint8x8x2_t, PERMUTATIONS> &perm_indices)
 {
     /*
         This function determines whether the point 'p' is a corner.
@@ -287,7 +285,7 @@
     return corner_detected;
 }
 
-inline uint8x8x2_t create_circle_tbl(const uint8_t *const __restrict buffer[7], size_t in_offset, const uint8x8x4_t &circle_index_r)
+inline uint8x8x2_t create_circle_tbl(const std::array<uint8_t *const __restrict, 7> &buffer, size_t in_offset, const uint8x8x4_t &circle_index_r)
 {
     /*
         This function builds a LUT holding the 16 texels in the Brensenham circle radius 3.
@@ -329,7 +327,7 @@
     return tbl_circle_texels;
 }
 
-inline uint8_t get_point_score(uint8_t p, uint8_t tolerance, const uint8x8x2_t &tbl_circle, uint8x8x2_t perm_indices[PERMUTATIONS])
+inline uint8_t get_point_score(uint8_t p, uint8_t tolerance, const uint8x8x2_t &tbl_circle, std::array<uint8x8x2_t, PERMUTATIONS> &perm_indices)
 {
     uint8_t b = 255;
     uint8_t a = tolerance;
@@ -411,7 +409,7 @@
     Iterator in(_input, window);
     Iterator out(_output, window);
 
-    const uint8_t *const __restrict in_row[7] =
+    const std::array<uint8_t *const __restrict, 7> in_row
     {
         _input->ptr_to_element(Coordinates(-3, -3)),
         _input->ptr_to_element(Coordinates(-3, -2)),
@@ -429,7 +427,7 @@
         return p_is_in_ab && q_is_in_ab;
     };
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const size_t  in_offset = in.offset();
         const uint8_t p0        = *in.ptr();
@@ -455,11 +453,11 @@
                 /* at this stage we use the full test with the 16 permutations to classify the point as corner or not */
                 const uint8x8x2_t tbl_circle_texel = create_circle_tbl(in_row, in_offset, circle_index_r);
 
-                if(point_is_fast_corner(p0, _threshold, tbl_circle_texel, perm_index.data()))
+                if(point_is_fast_corner(p0, _threshold, tbl_circle_texel, perm_index))
                 {
                     if(_non_max_suppression)
                     {
-                        score = get_point_score(p0, _threshold, tbl_circle_texel, perm_index.data());
+                        score = get_point_score(p0, _threshold, tbl_circle_texel, perm_index);
                     }
                     else
                     {
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp
index f4046e0..4127dc8 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp
@@ -168,7 +168,7 @@
 
     Iterator vertical_it(_tensor, vertical);
 
-    execute_window_loop(vertical, [&](const Coordinates & id)
+    execute_window_loop(vertical, [&](const Coordinates &)
     {
         uint8_t *base_addr = start_valid_region + vertical_it.offset();
         // Fill left and right borders
@@ -188,7 +188,7 @@
     Iterator plane_it(_tensor, window);
 
     // Iterate over all XY planes
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         uint8_t *base_addr = start_valid_region + plane_it.offset();
         // Top border
@@ -224,7 +224,7 @@
 
     Iterator vertical_it(_tensor, vertical);
 
-    execute_window_loop(vertical, [&](const Coordinates & id)
+    execute_window_loop(vertical, [&](const Coordinates &)
     {
         uint8_t *base_addr = start_valid_region + vertical_it.offset();
         // Fill left and right borders
@@ -244,7 +244,7 @@
     Iterator plane_it(_tensor, window);
 
     // Iterate over all XY planes
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         uint8_t *base_addr = start_valid_region + plane_it.offset();
         // Top border
diff --git a/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp b/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp
index d1cff6f..50060b2 100644
--- a/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -111,7 +111,7 @@
 
     Iterator vertical_it(_tensor, vertical);
 
-    execute_window_loop(vertical, [&](const Coordinates & id)
+    execute_window_loop(vertical, [&](const Coordinates &)
     {
         std::fill_n(reinterpret_cast<T *>(vertical_it.ptr()), _border_size.left, constant_border_value);
         std::fill_n(reinterpret_cast<T *>(vertical_it.ptr()) + width - _border_size.right, _border_size.right, constant_border_value);
@@ -122,7 +122,7 @@
     // All values are set at once
     Iterator horizontal_it(_tensor, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         for(size_t i = 0; i < _border_size.top; ++i)
         {
diff --git a/src/core/NEON/kernels/NEFloorKernel.cpp b/src/core/NEON/kernels/NEFloorKernel.cpp
index 6551d9e..43554a0 100644
--- a/src/core/NEON/kernels/NEFloorKernel.cpp
+++ b/src/core/NEON/kernels/NEFloorKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -111,7 +111,7 @@
 
     if(data_type == DataType::F32)
     {
-        execute_window_loop(window, [&](const Coordinates & id)
+        execute_window_loop(window, [&](const Coordinates &)
         {
             const float32x4_t res = vfloorq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())));
             vst1q_f32(reinterpret_cast<float *>(output.ptr()), res);
@@ -121,7 +121,7 @@
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     else if(data_type == DataType::F16)
     {
-        execute_window_loop(window, [&](const Coordinates & id)
+        execute_window_loop(window, [&](const Coordinates &)
         {
             const float16x8_t res = vfloorq_f16(vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr())));
             vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), res);
diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
index 7769d9e..c9299831 100644
--- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
@@ -126,7 +126,7 @@
     win_out.set_dimension_step(Window::DimX, 16);
     Iterator out(output, win_out);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const uint16x4x4_t data =
         {
@@ -154,7 +154,7 @@
     win_out.set_dimension_step(Window::DimX, 16);
     Iterator out(output, win_out);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const uint32x4x4_t data =
         {
diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
index a100cd2..b561d1e 100644
--- a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -479,7 +479,7 @@
 
 void inline matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, size_t out_stride, const Window &window)
 {
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const uint8_t *mtx_a0 = ina.ptr();
         const uint8_t *mtx_b0 = inb.ptr();
@@ -599,7 +599,7 @@
     // The implementation assumes that the matrix A and Matrix B have been reshaped respectively with NEGEMMInterleave4x4 and NEGEMMTranspose1xW
     // The reshaping of the matrices helps to have a cache friendly implementation and helps to avoid the data re-arrangements needed for computing 16x4 elements per iteration
     // All the values needed for computing a single 4x4 block will be read from consecutive memory positions
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         auto *mtx_a0 = reinterpret_cast<const int8_t *>(ina.ptr());
         auto *mtx_b0 = reinterpret_cast<const int8_t *>(inb.ptr());
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
index 33a5b4a..2293926 100644
--- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -106,20 +106,17 @@
     Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration));
 
     AccessWindowHorizontal mm_result_access(mm_result, 0, num_elems_processed_per_iteration);
-    window_changed = window_changed || update_window_and_padding(win,
-                                                                 mm_result_access);
+    window_changed = window_changed || update_window_and_padding(win, mm_result_access);
 
     if(a_offset != 0)
     {
         AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration);
-        window_changed = window_changed || update_window_and_padding(win,
-                                                                     vector_sum_col_access);
+        window_changed = window_changed || update_window_and_padding(win, vector_sum_col_access);
     }
     if(b_offset != 0)
     {
         AccessWindowStatic vector_sum_row_access(vector_sum_row, 0, 0, vector_sum_row->dimension(0), 0); // NOLINT
-        window_changed = window_changed || update_window_and_padding(win,
-                                                                     vector_sum_row_access);
+        window_changed = window_changed || update_window_and_padding(win, vector_sum_row_access);
     }
 
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
new file mode 100644
index 0000000..46e53ce
--- /dev/null
+++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
@@ -0,0 +1,651 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+#include <map>
+
+namespace arm_compute
+{
+class Coordinates;
+
+namespace
+{
+inline int32x4x4_t load_results_input(const Iterator &mm_result_it, int32_t x)
+{
+    return
+    {
+        {
+            vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 0),
+            vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 4),
+            vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 8),
+            vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 12)
+        }
+    };
+}
+
+inline int32x4x4_t load(const int32_t *ptr, int32_t x)
+{
+    return
+    {
+        {
+            vld1q_s32(ptr + x + 0),
+            vld1q_s32(ptr + x + 4),
+            vld1q_s32(ptr + x + 8),
+            vld1q_s32(ptr + x + 12)
+        }
+    };
+}
+
+inline int32x4x4_t get_a_offset(const int32_t *vector_sum_col_ptr, int32_t a_offset, int32_t x)
+{
+    int32x4x4_t a_offset_term_s32 = load(vector_sum_col_ptr, x);
+
+    a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset);
+    a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset);
+    a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset);
+    a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset);
+    return a_offset_term_s32;
+}
+
+inline int32x4_t get_b_offset(const int32_t *vector_sum_row_ptr, int32_t b_offset)
+{
+    int32x4_t b_offset_term_s32 = vld1q_dup_s32(vector_sum_row_ptr);
+    b_offset_term_s32           = vmulq_n_s32(b_offset_term_s32, b_offset);
+    return b_offset_term_s32;
+}
+
+inline int32x4x4_t get_k_offset(int32_t k_offset)
+{
+    return
+    {
+        {
+            vdupq_n_s32(k_offset),
+            vdupq_n_s32(k_offset),
+            vdupq_n_s32(k_offset),
+            vdupq_n_s32(k_offset)
+        }
+    };
+}
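
These helpers assemble the standard gemmlowp offset correction: expanding the quantized product shows why per-column sums of B, per-row sums of A and a constant K * a_offset * b_offset term are added to mm_result. A scalar reference of what the vector code below computes for one element (for illustration):

```cpp
#include <cstdint>

// out(i, j) = mm_result(i, j)
//           + a_offset * vector_sum_col[j]   // sum over k of B(k, j)
//           + b_offset * vector_sum_row[i]   // sum over k of A(i, k)
//           + k_offset                       // k_offset = K * a_offset * b_offset
int32_t offset_contribution_ref(int32_t mm_result, int32_t sum_col_j, int32_t sum_row_i,
                                int32_t a_offset, int32_t b_offset, int32_t k_offset)
{
    return mm_result + a_offset * sum_col_j + b_offset * sum_row_i + k_offset;
}
```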
+
+template <bool is_bounded_relu>
+inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8)
+{
+    const static int32x4_t zero_s32 = vdupq_n_s32(0);
+
+    // Shift final result (negative value shift right)
+    in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32);
+    in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32);
+    in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32);
+    in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32);
+
+    // Saturate negative values
+    in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32);
+    in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32);
+    in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32);
+    in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
+
+    // Convert S32 to S16
+    const int16x8x2_t in_s16 =
+    {
+        {
+            vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+            vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
+        }
+    };
+
+    // Convert S16 to U8
+    uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1]));
+
+    if(is_bounded_relu)
+    {
+        out_u8 = vmaxq_u8(out_u8, min_u8);
+        out_u8 = vminq_u8(out_u8, max_u8);
+    }
+
+    return out_u8;
+}
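
A scalar sketch of the same QUANTIZE_DOWN path, one value at a time (illustrative; note that result_shift_s32 holds the negated shift, so vshlq_s32 above performs a right shift):

```cpp
#include <algorithm>
#include <cstdint>

uint8_t quantize_down_ref(int32_t v, int32_t shift, bool bounded_relu, uint8_t lo, uint8_t hi)
{
    v = v >> shift;                                                // shift final result right
    v = std::max<int32_t>(v, 0);                                   // saturate negative values
    uint8_t out = static_cast<uint8_t>(std::min<int32_t>(v, 255)); // S32 -> U8 with saturation
    if(bounded_relu)
    {
        out = std::max(out, lo); // clamp to [lo, hi]
        out = std::min(out, hi);
    }
    return out;
}
```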
+
+inline Window get_win_vector_sum(const Window &window)
+{
+    Window win_vector_sum(window);
+    win_vector_sum.set(Window::DimY, Window::Dimension(0, 0, 0));
+    win_vector_sum.set(Window::DimZ, Window::Dimension(0, 0, 0));
+    return win_vector_sum;
+}
+
+inline Iterator get_vector_sum_col_it(const Window &window, const ITensor *vector_sum_col)
+{
+    Iterator vector_sum_col_it(vector_sum_col, get_win_vector_sum(window));
+    return vector_sum_col_it;
+}
+
+inline Iterator get_vector_sum_row_it(const Window &window, const ITensor *vector_sum_row)
+{
+    Window win_vector_sum_row = get_win_vector_sum(window);
+    win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
+    Iterator vector_sum_row_it(vector_sum_row, win_vector_sum_row);
+    return vector_sum_row_it;
+}
+
+inline Iterator get_bias_it(const Window &window, const ITensor *bias)
+{
+    Window win_bias(window);
+    win_bias.set(Window::DimY, Window::Dimension(0, 1, 1));
+    win_bias.set(Window::DimZ, Window::Dimension(0, 1, 1));
+    Iterator bias_it(bias, win_bias);
+    return bias_it;
+}
+
+inline int32x4x4_t add_s32(int32x4x4_t a, int32x4_t b)
+{
+    return
+    {
+        {
+            vaddq_s32(a.val[0], b),
+            vaddq_s32(a.val[1], b),
+            vaddq_s32(a.val[2], b),
+            vaddq_s32(a.val[3], b)
+        }
+    };
+}
+
+inline int32x4x4_t add_s32(int32x4x4_t a, int32x4x4_t b)
+{
+    return
+    {
+        {
+            vaddq_s32(a.val[0], b.val[0]),
+            vaddq_s32(a.val[1], b.val[1]),
+            vaddq_s32(a.val[2], b.val[2]),
+            vaddq_s32(a.val[3], b.val[3])
+        }
+    };
+}
+
+inline int32x4x4_t mul_s32(int32x4x4_t &a, int32_t mul_scalar)
+{
+    return
+    {
+        {
+            vmulq_n_s32(a.val[0], mul_scalar),
+            vmulq_n_s32(a.val[1], mul_scalar),
+            vmulq_n_s32(a.val[2], mul_scalar),
+            vmulq_n_s32(a.val[3], mul_scalar)
+        }
+    };
+}
+
+template <bool has_a_offset, bool has_b_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point>
+inline void run_offset_contribution_output_stage_window(const int32_t *vector_sum_col_ptr, const int32_t *vector_sum_row_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it,
+                                                        const int32x4_t result_offset_s32, const int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8,
+                                                        int32_t a_offset, int32_t b_offset, int32_t k_offset,
+                                                        GEMMLowpOutputStageInfo output_stage, int window_step_x, int window_start_x, int window_end_x)
+{
+    int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 };
+    if(!is_fixed_point)
+    {
+        // Combine quantization offset with other offsets.
+        offset_term_s32 = add_s32(offset_term_s32, result_offset_s32);
+    }
+    if(has_a_offset && has_b_offset)
+    {
+        offset_term_s32 = add_s32(offset_term_s32, get_k_offset(k_offset));
+    }
+    if(has_b_offset)
+    {
+        offset_term_s32 = add_s32(offset_term_s32, get_b_offset(vector_sum_row_ptr, b_offset));
+    }
+
+    int x = window_start_x;
+    for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    {
+        int32x4x4_t in_s32 = load_results_input(mm_result_it, x);
+
+        if(has_a_offset)
+        {
+            in_s32 = add_s32(in_s32, get_a_offset(vector_sum_col_ptr, a_offset, x));
+        }
+        if(has_bias)
+        {
+            in_s32 = add_s32(in_s32, load(bias_ptr, x));
+        }
+        if(!is_fixed_point || has_b_offset)
+        {
+            in_s32 = add_s32(in_s32, offset_term_s32);
+        }
+        if(!is_fixed_point)
+        {
+            in_s32 = mul_s32(in_s32, output_stage.gemmlowp_multiplier);
+        }
+
+        if(is_fixed_point)
+        {
+            vst1q_u8(out_it.ptr() + x, finalize_quantization<is_bounded_relu>(in_s32, output_stage.gemmlowp_multiplier, output_stage.gemmlowp_shift, result_offset_s32, min_u8, max_u8));
+        }
+        else
+        {
+            vst1q_u8(out_it.ptr() + x, finalize_quantization_floating_point<is_bounded_relu>(in_s32, result_shift_s32, min_u8, max_u8));
+        }
+    }
+    // Compute left-over elements
+    for(; x < window_end_x; ++x)
+    {
+        int32_t in_value = *(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0);
+
+        if(has_a_offset)
+        {
+            in_value += (*(vector_sum_col_ptr + x) * a_offset);
+        }
+        if(has_bias)
+        {
+            in_value += *(bias_ptr + x);
+        }
+
+        if(is_fixed_point)
+        {
+            // Finalize and store the result
+            *(out_it.ptr() + x) = finalize_quantization<is_bounded_relu>(in_value, output_stage.gemmlowp_multiplier, output_stage.gemmlowp_shift,
+                                                                         output_stage.gemmlowp_offset, static_cast<uint8_t>(output_stage.gemmlowp_min_bound), static_cast<uint8_t>(output_stage.gemmlowp_max_bound));
+        }
+        else
+        {
+            // Finalize quantization
+            in_value = (in_value * output_stage.gemmlowp_multiplier) >> output_stage.gemmlowp_shift;
+
+            // Bound and store the result
+            if(is_bounded_relu)
+            {
+                in_value = static_cast<uint8_t>(std::max<int32_t>(output_stage.gemmlowp_min_bound, std::min<int32_t>(output_stage.gemmlowp_max_bound, in_value)));
+            }
+            *(out_it.ptr() + x) = static_cast<uint8_t>(std::max<int32_t>(0, std::min<int32_t>(255, in_value)));
+        }
+    }
+}
+
+template <bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point>
+void run_offset_contribution_output_stage(const Window &window,
+                                          const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output,
+                                          int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col,
+                                          GEMMLowpOutputStageInfo output_stage)
+{
+    const int height_input = is_gemm3d ? mm_result->info()->dimension(1) : 0;
+    const int depth_input  = is_gemm3d ? mm_result->info()->dimension(2) : 1;
+
+    const int32x4_t  result_offset_s32 = vdupq_n_s32(output_stage.gemmlowp_offset);
+    const int32x4_t  result_shift_s32  = vdupq_n_s32(is_fixed_point ? output_stage.gemmlowp_shift : -output_stage.gemmlowp_shift);
+    const uint8x16_t min_u8            = vdupq_n_u8(static_cast<uint8_t>(output_stage.gemmlowp_min_bound));
+    const uint8x16_t max_u8            = vdupq_n_u8(static_cast<uint8_t>(output_stage.gemmlowp_max_bound));
+
+    const int  window_step_x  = 16;
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    Window win(window);
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Window collapsed_window = win.collapse_if_possible(win, Window::DimZ);
+
+    Iterator mm_result_it(mm_result, win);
+    Iterator out_it(output, win);
+
+    if((a_offset != 0) && (b_offset != 0))
+    {
+        ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col);
+        ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row);
+
+        Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col);
+        Iterator vector_sum_row_it = get_vector_sum_row_it(collapsed_window, vector_sum_row);
+
+        const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y();
+
+        // Offset in case vector_sum_col is batched
+        const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0;
+
+        if(bias != nullptr)
+        {
+            Iterator bias_it = get_bias_it(collapsed_window, bias);
+            execute_window_loop(collapsed_window, [&](const Coordinates & id)
+            {
+                const int  batch_id           = id.z() / depth_input;
+                const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
+                const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y)
+                                                + id.y() + (id.z() % depth_input) * height_input;
+                run_offset_contribution_output_stage_window<true, true, true, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it,
+                                                                                                               out_it,
+                                                                                                               result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
+                                                                                                               output_stage, window_step_x, window_start_x, window_end_x);
+            },
+            vector_sum_col_it, vector_sum_row_it, bias_it, mm_result_it, out_it);
+        }
+        else
+        {
+            execute_window_loop(collapsed_window, [&](const Coordinates & id)
+            {
+                const int  batch_id           = id.z() / depth_input;
+                const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
+                const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y)
+                                                + id.y() + (id.z() % depth_input) * height_input;
+                run_offset_contribution_output_stage_window<true, true, false, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it,
+                                                                                                                result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
+                                                                                                                output_stage, window_step_x, window_start_x, window_end_x);
+            },
+            vector_sum_col_it, vector_sum_row_it, mm_result_it, out_it);
+        }
+    }
+    else if((a_offset == 0) && (b_offset != 0))
+    {
+        ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row);
+
+        Iterator vector_sum_row_it = get_vector_sum_row_it(collapsed_window, vector_sum_row);
+
+        const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y();
+
+        if(bias != nullptr)
+        {
+            Iterator bias_it = get_bias_it(collapsed_window, bias);
+            execute_window_loop(collapsed_window, [&](const Coordinates & id)
+            {
+                const int  batch_id           = id.z() / depth_input;
+                const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y)
+                                                + id.y() + (id.z() % depth_input) * height_input;
+                run_offset_contribution_output_stage_window<false, true, true, is_bounded_relu, is_fixed_point>(nullptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
+                                                                                                                result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
+                                                                                                                output_stage, window_step_x, window_start_x, window_end_x);
+            },
+            vector_sum_row_it, bias_it, mm_result_it, out_it);
+        }
+        else
+        {
+            execute_window_loop(collapsed_window, [&](const Coordinates & id)
+            {
+                const int  batch_id           = id.z() / depth_input;
+                const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y)
+                                                + id.y() + (id.z() % depth_input) * height_input;
+                run_offset_contribution_output_stage_window<false, true, false, is_bounded_relu, is_fixed_point>(nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it,
+                                                                                                                 result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
+                                                                                                                 output_stage, window_step_x, window_start_x, window_end_x);
+            },
+            vector_sum_row_it, mm_result_it, out_it);
+        }
+    }
+    else if((a_offset != 0) && (b_offset == 0))
+    {
+        ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col);
+
+        Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col);
+
+        // Offset in case vector_sum_col is batched
+        const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0;
+
+        if(bias != nullptr)
+        {
+            Iterator bias_it = get_bias_it(collapsed_window, bias);
+            execute_window_loop(collapsed_window, [&](const Coordinates & id)
+            {
+                const int  batch_id           = id.z() / depth_input;
+                const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
+                run_offset_contribution_output_stage_window<true, false, true, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
+                                                                                                                result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
+                                                                                                                output_stage, window_step_x, window_start_x, window_end_x);
+            },
+            vector_sum_col_it, bias_it, mm_result_it, out_it);
+        }
+        else
+        {
+            execute_window_loop(collapsed_window, [&](const Coordinates & id)
+            {
+                const int  batch_id           = id.z() / depth_input;
+                const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
+                run_offset_contribution_output_stage_window<true, false, false, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it,
+                                                                                                                 result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
+                                                                                                                 output_stage, window_step_x, window_start_x, window_end_x);
+            },
+            vector_sum_col_it, mm_result_it, out_it);
+        }
+    }
+    else
+    {
+        if(bias != nullptr)
+        {
+            Iterator bias_it = get_bias_it(collapsed_window, bias);
+            execute_window_loop(collapsed_window, [&](const Coordinates &)
+            {
+                run_offset_contribution_output_stage_window<false, false, true, is_bounded_relu, is_fixed_point>(nullptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
+                                                                                                                 result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
+                                                                                                                 output_stage, window_step_x, window_start_x, window_end_x);
+            },
+            bias_it, mm_result_it, out_it);
+        }
+        else
+        {
+            execute_window_loop(collapsed_window, [&](const Coordinates &)
+            {
+                run_offset_contribution_output_stage_window<false, false, false, is_bounded_relu, is_fixed_point>(nullptr, nullptr, nullptr, mm_result_it, out_it,
+                                                                                                                  result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
+                                                                                                                  output_stage, window_step_x, window_start_x, window_end_x);
+            },
+            mm_result_it, out_it);
+        }
+        return;
+    }
+}
+
+Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output,
+                          int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_max_bound > 255);
+    ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound < 0 || output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
+    ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN && output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+
+    if(bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0));
+    }
+
+    // If a_offset == 0, vector_sum_col can be a nullptr
+    if(a_offset != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
+        ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
+    }
+
+    // If b_offset == 0, vector_sum_row can be a nullptr
+    if(b_offset != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
+
+        // Check if input is a 3D reinterpretation
+        const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+
+        // Validate input
+        ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2)));
+        ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
+
+        TensorShape output_shape = output->tensor_shape();
+        if(output_shape.num_dimensions() > 1)
+        {
+            const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
+
+            TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
+            vector_sum_row_shape.collapse_from(1);
+            output_shape.collapse_from(output_batch_idx);
+
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
+                                            "mm_result tensor must have the same number of batches of output tensor");
+
+            if(a_offset != 0)
+            {
+                TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
+                vector_sum_col_shape.collapse_from(1);
+
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
+                                                "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
+            }
+        }
+    }
+
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *mm_result, ITensorInfo *output)
+{
+    // Output auto-initialization if not yet initialized
+    auto_init_if_empty(*output, mm_result->clone()->set_data_type(DataType::QASYMM8));
+
+    // Configure kernel window
+    Window win = calculate_max_window(*mm_result, Steps());
+
+    // Note: This kernel processes 16 elements per iteration, but the scalar left-over
+    // loop guarantees that no read or write goes out of bounds. For this reason,
+    // num_elems_processed_per_iteration is 1 and update_window_and_padding() can be skipped.
+    Coordinates coord;
+    coord.set_num_dimensions(output->num_dimensions());
+    output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
+
+    return std::make_pair(Status{}, win);
+}
+
+NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageFunction
+get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row, GEMMLowpOutputStageInfo output_stage)
+{
+    static std::map<uint8_t, NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageFunction> map_function =
+    {
+        { 0, &run_offset_contribution_output_stage<false, false, false> },
+        { 1, &run_offset_contribution_output_stage<true, false, false> },
+        { 2, &run_offset_contribution_output_stage<false, true, false> },
+        { 3, &run_offset_contribution_output_stage<true, true, false> },
+        { 4, &run_offset_contribution_output_stage<false, false, true> },
+        { 5, &run_offset_contribution_output_stage<true, false, true> },
+        { 6, &run_offset_contribution_output_stage<false, true, true> },
+        { 7, &run_offset_contribution_output_stage<true, true, true> }
+    };
+
+    // Check if input is a 3D reinterpretation
+    const bool reinterpret_as_3d = vector_sum_row != nullptr
+                                   && mm_result->info()->num_dimensions() > 1
+                                   && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x();
+
+    // Check if we need to clamp the result using min and max
+    const bool is_bounded_relu = ((output_stage.gemmlowp_min_bound != output_stage.gemmlowp_max_bound)
+                                  && !(output_stage.gemmlowp_min_bound == 0 && output_stage.gemmlowp_max_bound == 255));
+
+    const bool is_fixed_point = output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN;
+
+    // key acts as a bitset, setting the first bit on reinterpret_as_3d,
+    // the second on is_bounded_relu, and the third on is_fixed_point.
+    uint8_t key = (reinterpret_as_3d ? 1UL : 0UL) | ((is_bounded_relu ? 1UL : 0UL) << 1) | ((is_fixed_point ? 1UL : 0UL) << 2);
+    return map_function.find(key)->second;
+}
+} // namespace
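For context on get_configured_function() above: the three boolean template parameters are packed into a 3-bit key and looked up in a map of function pointers. A minimal, self-contained sketch of the same dispatch pattern (hypothetical names, not library code):

#include <cstdint>
#include <map>

// Stand-in for the templated run_offset_contribution_output_stage
template <bool is_3d, bool bounded_relu, bool fixed_point>
void run_kernel()
{
}

using KernelFn = void (*)();

KernelFn select_kernel(bool is_3d, bool bounded_relu, bool fixed_point)
{
    // Same encoding as above: bit 0 = is_3d, bit 1 = bounded_relu, bit 2 = fixed_point
    static const std::map<uint8_t, KernelFn> table =
    {
        { 0, &run_kernel<false, false, false> }, { 1, &run_kernel<true, false, false> },
        { 2, &run_kernel<false, true, false> },  { 3, &run_kernel<true, true, false> },
        { 4, &run_kernel<false, false, true> },  { 5, &run_kernel<true, false, true> },
        { 6, &run_kernel<false, true, true> },   { 7, &run_kernel<true, true, true> }
    };
    const uint8_t key = (is_3d ? 1U : 0U) | ((bounded_relu ? 1U : 0U) << 1) | ((fixed_point ? 1U : 0U) << 2);
    return table.at(key);
}

For example, select_kernel(false, true, true) yields key 6 and returns &run_kernel<false, true, true>.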
+
+NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageKernel()
+    : _function(nullptr), _vector_sum_col(nullptr), _vector_sum_row(nullptr), _bias(nullptr), _mm_result(nullptr), _output(nullptr), _a_offset(0), _b_offset(0), _k_offset(0), _slide_vector_sum_col(true),
+      _output_stage(GEMMLowpOutputStageInfo())
+{
+}
+
+void NEGEMMLowpOffsetContributionOutputStageKernel::configure(const ITensor *mm_result, const ITensor *vector_sum_col,
+                                                              const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, int32_t k,
+                                                              int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage)
+{
+    // Perform validate step
+    ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output);
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(),
+                                                  vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, // NOLINT
+                                                  vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, // NOLINT
+                                                  bias != nullptr ? bias->info() : nullptr,                     // NOLINT
+                                                  output->info(), a_offset, b_offset, output_stage));           // NOLINT
+
+    _vector_sum_col = vector_sum_col;
+    _vector_sum_row = vector_sum_row;
+    _bias           = bias;
+    _mm_result      = mm_result;
+    _output         = output;
+    _a_offset       = a_offset;
+    _b_offset       = b_offset;
+    _k_offset       = a_offset * b_offset * k;
+    _output_stage   = output_stage;
+
+    // If a_offset == 0, vector_sum_col can be a nullptr
+    if(a_offset != 0)
+    {
+        // Check if vector_sum_col_shape should be slid or not
+        // Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape has more than 1
+        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+        _slide_vector_sum_col = vector_sum_col->info()->tensor_shape().num_dimensions() > 1;
+    }
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(mm_result->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+
+    _function = get_configured_function(mm_result, vector_sum_row, output_stage);
+}
+
+Status NEGEMMLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col,
+                                                               const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output,
+                                                               int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, output, a_offset, b_offset, output_stage));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(mm_result->clone().get(), output->clone().get()).first);
+    return Status{};
+}
+
+void NEGEMMLowpOffsetContributionOutputStageKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    _function(window, _mm_result, _vector_sum_col, _vector_sum_row, _bias, _output, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage);
+}
+
+} // namespace arm_compute
\ No newline at end of file
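For context on the correction terms this kernel adds (vector_sum_col, vector_sum_row and _k_offset = a_offset * b_offset * k): they fall out of expanding the quantized product. A scalar sketch of the identity, assuming the offsets' signs are folded in as the kernel expects (illustrative only):

#include <cstdint>

//   sum_k (a[i][k] + a_offset) * (b[k][j] + b_offset)
//     = mm_result[i][j]               (raw int32 accumulator)
//     + a_offset * vector_sum_col[j]  (column sums of b)
//     + b_offset * vector_sum_row[i]  (row sums of a)
//     + a_offset * b_offset * k       (the _k_offset term)
int32_t offset_contribution(int32_t mm_result, int32_t sum_col_j, int32_t sum_row_i,
                            int32_t a_offset, int32_t b_offset, int32_t k)
{
    return mm_result + a_offset * sum_col_j + b_offset * sum_row_i + a_offset * b_offset * k;
}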
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
index f0ac695..4906e6a 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
@@ -86,37 +86,6 @@
 namespace arm_compute
 {
 class Coordinates;
-
-/* Function used by the left-over for loop to perform the quantization */
-template <bool is_bounded_relu>
-inline uint8_t finalize_quantization(int32x4_t in_s32, int result_fixedpoint_multiplier, int32_t result_shift, int32x4_t result_offset_after_shift_s32, uint8_t min_u8, uint8_t max_u8)
-{
-    const static int32x4_t zero_s32      = vdupq_n_s32(0);
-    const static int32x4_t sat_value_s32 = vdupq_n_s32(255);
-
-    // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
-    in_s32 = vqrdmulhq_n_s32(in_s32, result_fixedpoint_multiplier);
-
-    // Round to the nearest division by a power-of-two using result_shift_s32
-    in_s32 = rounding_divide_by_pow2(in_s32, result_shift);
-
-    // Add the offset terms
-    in_s32 = vaddq_s32(in_s32, result_offset_after_shift_s32);
-
-    // Saturate negative values
-    in_s32 = vmaxq_s32(in_s32, zero_s32);
-    in_s32 = vminq_s32(in_s32, sat_value_s32);
-
-    auto out_u8 = static_cast<uint8_t>(vgetq_lane_s32(in_s32, 0));
-
-    if(is_bounded_relu)
-    {
-        out_u8 = std::max(out_u8, min_u8);
-        out_u8 = std::min(out_u8, max_u8);
-    }
-
-    return out_u8;
-}
 } // namespace arm_compute
 
 template <bool is_bounded_relu>
@@ -145,7 +114,7 @@
         win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
 
         Iterator bias(_bias, win_biases);
-        execute_window_loop(win_collapsed, [&](const Coordinates & id)
+        execute_window_loop(win_collapsed, [&](const Coordinates &)
         {
             // Compute 16 elements per iteration
             int x = window_start_x;
@@ -188,17 +157,15 @@
 
                 // Add bias
                 in_value += bias_value;
-
                 // Finalize and store the result
-                *(out.ptr() + x) = finalize_quantization<is_bounded_relu>(vdupq_n_s32(in_value), _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, static_cast<uint8_t>(_min),
-                                                                          static_cast<uint8_t>(_max));
+                *(out.ptr() + x) = finalize_quantization<is_bounded_relu>(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast<uint8_t>(_min), static_cast<uint8_t>(_max));
             }
         },
         in, out, bias);
     }
     else
     {
-        execute_window_loop(win_collapsed, [&](const Coordinates & id)
+        execute_window_loop(win_collapsed, [&](const Coordinates &)
         {
             // Compute 16 elements per iteration
             int x = window_start_x;
@@ -220,10 +187,10 @@
             // Compute left-over elements
             for(; x < window_end_x; ++x)
             {
-                const int32x4_t in_s32 = vld1q_dup_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x);
+                const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
 
                 // Finalize and store the result
-                *(out.ptr() + x) = finalize_quantization<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, static_cast<uint8_t>(_min), static_cast<uint8_t>(_max));
+                *(out.ptr() + x) = finalize_quantization<is_bounded_relu>(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast<uint8_t>(_min), static_cast<uint8_t>(_max));
             }
         },
         in, out);
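For context on the finalize_quantization() calls above (the local NEON copy was removed and the left-over loop now passes a plain int32_t, so a shared scalar variant is evidently used): the fixed-point quantize-down applies a rounding doubling multiply-high, a rounding right shift, an offset, and a clamp. A scalar sketch, simplifying the exact NEON saturation and tie-breaking semantics:

#include <algorithm>
#include <cstdint>

uint8_t quantize_down_fixedpoint(int32_t acc, int32_t multiplier, int32_t shift,
                                 int32_t offset_after_shift, uint8_t min_u8, uint8_t max_u8)
{
    // Rounding doubling multiply-high: scalar analogue of vqrdmulhq_n_s32
    const int64_t prod = 2LL * acc * multiplier;
    int32_t       res  = static_cast<int32_t>((prod + (1LL << 31)) >> 32);

    // Round to nearest on the division by 2^shift (rounding_divide_by_pow2)
    if(shift > 0)
    {
        res = (res + (1 << (shift - 1))) >> shift;
    }

    // Add the output offset and saturate to the uint8_t range
    res = std::min(std::max(res + offset_after_shift, 0), 255);

    // Bounded-ReLU clamp; in the kernel this is the is_bounded_relu template path
    res = std::min<int32_t>(std::max<int32_t>(res, min_u8), max_u8);
    return static_cast<uint8_t>(res);
}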
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
index 573373f..a221bd7 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -183,7 +183,7 @@
         win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
 
         Iterator bias(_bias, win_biases);
-        execute_window_loop(win, [&](const Coordinates & id)
+        execute_window_loop(win, [&](const Coordinates &)
         {
             // Compute 16 elements per iteration
             int x = window_start_x;
@@ -245,7 +245,7 @@
     }
     else
     {
-        execute_window_loop(win, [&](const Coordinates & id)
+        execute_window_loop(win, [&](const Coordinates &)
         {
             // Compute 16 elements per iteration
             int x = window_start_x;
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
index 42353ed..5ac2323 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -122,7 +122,7 @@
     {
         case DataType::F32:
         {
-            execute_window_loop(window, [&](const Coordinates & id)
+            execute_window_loop(window, [&](const Coordinates &)
             {
                 const float32x4x4_t accum  = vld4q_f32(reinterpret_cast<const float *>(in0_out.ptr()));
                 const float32x4x4_t biases = vld4q_f32(reinterpret_cast<const float *>(in1.ptr()));
@@ -144,7 +144,7 @@
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F16:
         {
-            execute_window_loop(window, [&](const Coordinates & id)
+            execute_window_loop(window, [&](const Coordinates &)
             {
                 const float16x8x2_t accum  = vld2q_f16(reinterpret_cast<const float16_t *>(in0_out.ptr()));
                 const float16x8x2_t biases = vld2q_f16(reinterpret_cast<const float16_t *>(in1.ptr()));
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
index 757dbbc..86bea84 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -60,7 +60,7 @@
     Iterator in(input, window);
     Iterator out(output, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const auto in_ptr  = reinterpret_cast<const float *>(in.ptr());
         const auto out_ptr = reinterpret_cast<float *>(out.ptr());
@@ -87,7 +87,7 @@
     Iterator in(input, window);
     Iterator out(output, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const auto in_ptr  = reinterpret_cast<const float16_t *>(in.ptr());
         const auto out_ptr = reinterpret_cast<float16_t *>(out.ptr());
diff --git a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
index f182fb2..a82fae7 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -388,7 +388,7 @@
     // The implementation assumes that the matrix A and Matrix B have been reshaped respectively with NEGEMMInterleave4x4 and NEGEMMTranspose1xW
     // The reshaping of the matrices helps to have a cache friendly implementation and helps to avoid the data re-arrangements needed for computing 16x4 elements per iteration
     // All the values needed for computing a single 4x4 block will be read from consecutive memory positions
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         auto mtx_a0 = reinterpret_cast<const float *>(ina.ptr());
         auto mtx_b0 = reinterpret_cast<const float *>(inb.ptr());
@@ -687,7 +687,7 @@
 
     const float16x8_t alpha_f16 = vdupq_n_f16(alpha);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const auto   *mtx_a0  = reinterpret_cast<const float16_t *>(ina.ptr());
         const auto   *mtx_b0  = reinterpret_cast<const float16_t *>(inb.ptr());
diff --git a/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp b/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp
index 048c229..f412980 100644
--- a/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -80,7 +80,7 @@
     static const int16x8_t two  = vdupq_n_s16(2);
     static const int16x8_t four = vdupq_n_s16(4);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
         uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
diff --git a/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp b/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp
index b62e281..0e4549e 100644
--- a/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp
+++ b/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -88,7 +88,7 @@
     static const int16x8_t six  = vdupq_n_s16(6);
     static const int16x8_t four = vdupq_n_s16(4);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         uint8x16_t data = vld1q_u8(input.ptr());
 
@@ -112,7 +112,7 @@
 
 BorderSize NEGaussian5x5VertKernel::border_size() const
 {
-    return BorderSize(2, 0);
+    return BorderSize{ 2, 0 };
 }
 
 void NEGaussian5x5VertKernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
@@ -159,7 +159,7 @@
     const uint16x8_t six  = vdupq_n_u16(6);
     const uint16x8_t four = vdupq_n_u16(4);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const size_t input_offset_high_s16 = input.offset();
         const size_t input_offset_low_s16  = input.offset() + 16;
diff --git a/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp b/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp
index 7a123e2..13cee19 100644
--- a/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp
+++ b/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -47,7 +47,7 @@
 
 BorderSize NEGaussianPyramidHorKernel::border_size() const
 {
-    return BorderSize(0, 2);
+    return BorderSize{ 0, 2 };
 }
 
 void NEGaussianPyramidHorKernel::configure(const ITensor *input, ITensor *output)
@@ -126,7 +126,7 @@
 
     Iterator out(_output, win_out);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const uint8x16x2_t data_2q   = vld2q_u8(in.ptr());
         const uint8x16_t &data_even = data_2q.val[0];
@@ -155,7 +155,7 @@
 
 BorderSize NEGaussianPyramidVertKernel::border_size() const
 {
-    return BorderSize(2, 0);
+    return BorderSize{ 2, 0 };
 }
 
 void NEGaussianPyramidVertKernel::configure(const ITensor *input, ITensor *output)
@@ -236,7 +236,7 @@
     const uint8_t *input_low_ptr  = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, 3));
     const uint8_t *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, 4));
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         // Low data
         const uint16x8_t data_low_t2 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top2_ptr + in.offset())));
diff --git a/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp b/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
index c204395..c58b1c0 100644
--- a/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
+++ b/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -695,7 +695,7 @@
     Iterator phase(_input_phase, win_phase);
     Iterator out(_output, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const auto mag_row_ptr   = reinterpret_cast<const int16_t *>(mag.ptr());
         const auto phase_row_ptr = reinterpret_cast<const uint8_t *>(phase.ptr());
diff --git a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
index 61221c1..34e68e7 100644
--- a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
+++ b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -719,7 +719,7 @@
 
     const size_t input_stride = _input1->info()->strides_in_bytes()[1] / element_size_from_data_type(_input1->info()->data_type());
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         (*_func)(input1.ptr(), input2.ptr(), output.ptr(), input_stride, _norm_factor, _sensitivity, _strength_thresh);
     },
diff --git a/src/core/NEON/kernels/NEHeightConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEHeightConcatenateLayerKernel.cpp
new file mode 100644
index 0000000..b8e204c
--- /dev/null
+++ b/src/core/NEON/kernels/NEHeightConcatenateLayerKernel.cpp
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEHeightConcatenateLayerKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace
+{
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+    const unsigned int num_elems_processed_per_iteration = 16 / output->element_size();
+
+    // The window needs to be based on the input, as we copy the full width of the input
+    Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    bool                   window_changed = update_window_and_padding(win, input_access, output_access);
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+
+Status validate_arguments(const ITensorInfo *input, unsigned int height_offset, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1,
+                                                         DataType::U8, DataType::S8, DataType::QASYMM8,
+                                                         DataType::U16, DataType::S16, DataType::F16,
+                                                         DataType::U32, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimX) != output->dimension(Window::DimX));
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimY) + height_offset > output->dimension(Window::DimY));
+    for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i));
+    }
+
+    return Status{};
+}
+} // namespace
+
+NEHeightConcatenateLayerKernel::NEHeightConcatenateLayerKernel()
+    : _input(nullptr), _output(nullptr), _height_offset(0)
+{
+}
+
+void NEHeightConcatenateLayerKernel::configure(const ITensor *input, unsigned int height_offset, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), height_offset, output->info()));
+
+    _input         = input;
+    _output        = output;
+    _height_offset = height_offset;
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+
+    INEKernel::configure(std::get<1>(win_config));
+}
+
+Status NEHeightConcatenateLayerKernel::validate(const ITensorInfo *input, unsigned int height_offset, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, height_offset, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+    return Status{};
+}
+
+void NEHeightConcatenateLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+    // Offset output pointer to the correct position
+    uint8_t *output_ptr = _output->buffer() + _output->info()->offset_first_element_in_bytes() + _height_offset * _output->info()->strides_in_bytes()[Window::DimY];
+
+    // Create iterators
+    Iterator                input(_input, window);
+    Iterator                output(_output, window);
+    const DataType          dt           = _input->info()->data_type();
+    const QuantizationInfo &input_qinfo  = _input->info()->quantization_info();
+    const QuantizationInfo &output_qinfo = _output->info()->quantization_info();
+    if(dt == DataType::QASYMM8 && input_qinfo != output_qinfo)
+    {
+        execute_window_loop(window, [&](const Coordinates &)
+        {
+            vst1q_u8(output_ptr + output.offset(), vquantize(vdequantize(vld1q_u8(input.ptr()), input_qinfo), output_qinfo));
+        },
+        input, output);
+    }
+    else
+    {
+        execute_window_loop(window, [&](const Coordinates &)
+        {
+            const auto in_ptr  = input.ptr();
+            const auto out_ptr = output_ptr + output.offset();
+
+            wrapper::vstore(out_ptr, wrapper::vloadq(in_ptr));
+        },
+        input, output);
+    }
+}
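The run() method above reduces height concatenation to one fixed output displacement: every write lands height_offset rows below the matching read, and the QASYMM8 branch additionally requantizes when the two tensors' quantization infos differ. A scalar model of the plain copy path (hypothetical helper, not the kernel):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Copy a height x width_bytes block into the output starting at row height_offset,
// mirroring the output_ptr = buffer + height_offset * strides_in_bytes[DimY] step.
void concat_height(const uint8_t *in, uint8_t *out, size_t width_bytes, size_t height,
                   size_t in_stride_y, size_t out_stride_y, size_t height_offset)
{
    out += height_offset * out_stride_y;
    for(size_t y = 0; y < height; ++y)
    {
        std::memcpy(out + y * out_stride_y, in + y * in_stride_y, width_bytes);
    }
}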
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
index 2e3d9de..34af0cf 100644
--- a/src/core/NEON/kernels/NEIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp
@@ -208,32 +208,48 @@
     const int end_x     = start_x + kernel_width * dilation_x;
     const int end_y     = start_y + kernel_height * dilation_y;
     const int pad_quant = kernel_width * input_c;
-
-    for(int y = start_y; y < end_y; y += dilation_y)
+    if((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1))
     {
-        if(y < 0 || y >= input_h)
+        for(int y = start_y; y < end_y; y += dilation_y)
         {
-            memset(out_ptr, pad_value, pad_quant * sizeof(T));
-            out_ptr += pad_quant;
+            // Optimized for no dilation and no boundary pixels
+            memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)), input_c * kernel_width * sizeof(T));
+            out_ptr += input_c * kernel_width;
         }
-        else
+    }
+    else
+    {
+        for(int y = start_y; y < end_y; y += dilation_y)
         {
-            for(int x = start_x; x < end_x; x += dilation_x)
+            if(y < 0 || y >= input_h)
             {
-                if(x < 0 || x >= input_w)
+                memset(out_ptr, pad_value, pad_quant * sizeof(T));
+                out_ptr += pad_quant;
+            }
+            else if(dilation_x > 1 || start_x < 0 || end_x >= input_w)
+            {
+                for(int x = start_x; x < end_x; x += dilation_x)
                 {
-                    memset(out_ptr, pad_value, input_c * sizeof(T));
-                    out_ptr += input_c;
+                    if(x < 0 || x >= input_w)
+                    {
+                        memset(out_ptr, pad_value, input_c * sizeof(T));
+                        out_ptr += input_c;
+                    }
+                    else
+                    {
+                        memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)), input_c * sizeof(T));
+                        out_ptr += input_c;
+                    }
                 }
-                else
-                {
-                    memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)), input_c * sizeof(T));
-                    out_ptr += input_c;
-                }
+            }
+            else
+            {
+                // Optimized for no dilation and no boundary pixels
+                memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)), input_c * kernel_width * sizeof(T));
+                out_ptr += input_c * kernel_width;
             }
         }
     }
-
     // Append 1 if the convolution layer has biases
     if(has_bias)
     {
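The restructured loop above avoids per-element bounds checks whenever a kernel row lies fully inside the image and dilation_x == 1: the kernel_width * input_c elements are then contiguous, so a single memcpy replaces kernel_width smaller copies. A simplified scalar sketch of the two paths (hypothetical function, element type T):

#include <cstring>

template <typename T>
T *copy_patch_row(T *out, const T *row, int start_x, int input_w,
                  int kernel_width, int input_c, int dilation_x, int pad_value)
{
    const int end_x = start_x + kernel_width * dilation_x;
    if(dilation_x == 1 && start_x >= 0 && end_x <= input_w)
    {
        // Fast path: one contiguous copy of the whole kernel row
        std::memcpy(out, row + start_x * input_c, input_c * kernel_width * sizeof(T));
        return out + input_c * kernel_width;
    }
    for(int x = start_x; x < end_x; x += dilation_x)
    {
        if(x < 0 || x >= input_w)
        {
            std::memset(out, pad_value, input_c * sizeof(T)); // pad outside the image
        }
        else
        {
            std::memcpy(out, row + x * input_c, input_c * sizeof(T));
        }
        out += input_c;
    }
    return out;
}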
diff --git a/src/core/NEON/kernels/NEIntegralImageKernel.cpp b/src/core/NEON/kernels/NEIntegralImageKernel.cpp
index 16a3cf7..b6db5f0 100644
--- a/src/core/NEON/kernels/NEIntegralImageKernel.cpp
+++ b/src/core/NEON/kernels/NEIntegralImageKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -63,7 +63,7 @@
 
 BorderSize NEIntegralImageKernel::border_size() const
 {
-    return BorderSize(1, 0, 0, 1);
+    return BorderSize{ 1, 0, 0, 1 };
 }
 
 bool NEIntegralImageKernel::is_parallelisable() const
@@ -83,7 +83,7 @@
     const auto output_top_left = reinterpret_cast<const uint32_t *>(_output->ptr_to_element(Coordinates(-1, -1)));
     const auto output_top_mid  = reinterpret_cast<const uint32_t *>(_output->ptr_to_element(Coordinates(0, -1)));
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const uint8x16_t input_pixels = vld1q_u8(input.ptr());
 
diff --git a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
index cda041d..efdcc44 100644
--- a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -61,7 +61,7 @@
         const auto sum_value           = *reinterpret_cast<const T *>(sum_it.ptr());
         const auto vec_normalize_value = wrapper::vdup_n(static_cast<T>(1.f / std::sqrt(std::max(sum_value, static_cast<T>(epsilon)))), ExactTagType{});
 
-        execute_window_loop(in_slice, [&](const Coordinates & id)
+        execute_window_loop(in_slice, [&](const Coordinates &)
         {
             const auto in_ptr  = reinterpret_cast<const T *>(input_it.ptr());
             const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
@@ -93,7 +93,7 @@
 
         auto eps = wrapper::vdup_n(static_cast<T>(epsilon), ExactTagType{});
 
-        execute_window_loop(in_slice, [&](const Coordinates & id)
+        execute_window_loop(in_slice, [&](const Coordinates &)
         {
             const auto in_ptr  = reinterpret_cast<const T *>(input_it.ptr());
             const auto sum_ptr = reinterpret_cast<const T *>(sum_it.ptr());
@@ -127,7 +127,7 @@
 
         auto eps = wrapper::vdup_n(static_cast<T>(epsilon), ExactTagType{});
 
-        execute_window_loop(in_slice, [&](const Coordinates & id)
+        execute_window_loop(in_slice, [&](const Coordinates &)
         {
             const auto in_ptr  = reinterpret_cast<const T *>(input_it.ptr());
             const auto sum_ptr = reinterpret_cast<const T *>(sum_it.ptr());
diff --git a/src/core/NEON/kernels/NELKTrackerKernel.cpp b/src/core/NEON/kernels/NELKTrackerKernel.cpp
index 83593e7..ddf869e 100644
--- a/src/core/NEON/kernels/NELKTrackerKernel.cpp
+++ b/src/core/NEON/kernels/NELKTrackerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -405,9 +405,9 @@
 
     init_keypoints(list_start, list_end);
 
-    const int buffer_size = _window_dimension * _window_dimension;
-    int32_t   bilinear_ix[buffer_size];
-    int32_t   bilinear_iy[buffer_size];
+    const int            buffer_size = _window_dimension * _window_dimension;
+    std::vector<int32_t> bilinear_ix(buffer_size);
+    std::vector<int32_t> bilinear_iy(buffer_size);
 
     const int half_window = _window_dimension / 2;
 
@@ -444,7 +444,7 @@
         int iA12 = 0;
         int iA22 = 0;
 
-        std::tie(iA11, iA12, iA22) = compute_spatial_gradient_matrix(old_keypoint, bilinear_ix, bilinear_iy);
+        std::tie(iA11, iA12, iA22) = compute_spatial_gradient_matrix(old_keypoint, bilinear_ix.data(), bilinear_iy.data());
 
         const float A11 = iA11 * FLT_SCALE;
         const float A12 = iA12 * FLT_SCALE;
@@ -490,7 +490,7 @@
             int ib1 = 0;
             int ib2 = 0;
 
-            std::tie(ib1, ib2) = compute_image_mismatch_vector(old_keypoint, new_keypoint, bilinear_ix, bilinear_iy);
+            std::tie(ib1, ib2) = compute_image_mismatch_vector(old_keypoint, new_keypoint, bilinear_ix.data(), bilinear_iy.data());
 
             double b1 = ib1 * FLT_SCALE;
             double b2 = ib2 * FLT_SCALE;
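The motivation for the change above: int32_t buf[buffer_size] with a runtime buffer_size is a variable-length array, a compiler extension rather than standard C++. std::vector is the portable replacement, and .data() supplies the raw pointer the existing helpers take. A minimal illustration of the pattern:

#include <cstdint>
#include <vector>

void example(int window_dimension)
{
    // int32_t buf[window_dimension * window_dimension]; // VLA: non-standard in C++
    std::vector<int32_t> buf(window_dimension * window_dimension);
    int32_t *raw = buf.data(); // usable wherever an int32_t* was expected
    (void)raw;
}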
diff --git a/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp b/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp
index 4a318f0..8c09898 100644
--- a/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp
+++ b/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -323,7 +323,7 @@
     Iterator gy(_gy, window);
     Iterator magnitude(_magnitude, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const int16x8x2_t input1 =
         {
@@ -369,7 +369,7 @@
     Iterator gy(_gy, window);
     Iterator phase(_phase, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const int16x8x2_t input1 =
         {
@@ -415,7 +415,7 @@
     Iterator magnitude(_magnitude, window);
     Iterator phase(_phase, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const int16x8x2_t input1 =
         {
diff --git a/src/core/NEON/kernels/NEMeanStdDevKernel.cpp b/src/core/NEON/kernels/NEMeanStdDevKernel.cpp
index 7895b00..0af6305 100644
--- a/src/core/NEON/kernels/NEMeanStdDevKernel.cpp
+++ b/src/core/NEON/kernels/NEMeanStdDevKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -51,7 +51,7 @@
     uint64x1_t sum_squared = vdup_n_u64(0);
 
     // Calculate sum
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const uint8x16_t in_data = vld1q_u8(iterator.ptr());
 
diff --git a/src/core/NEON/kernels/NEMedian3x3Kernel.cpp b/src/core/NEON/kernels/NEMedian3x3Kernel.cpp
index 5bcdc7b..9dc1bc9 100644
--- a/src/core/NEON/kernels/NEMedian3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEMedian3x3Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -87,7 +87,7 @@
     Iterator input(_input, window);
     Iterator output(_output, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
         const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
diff --git a/src/core/NEON/kernels/NEMemsetKernel.cpp b/src/core/NEON/kernels/NEMemsetKernel.cpp
index 2b57b15..a0fab99 100644
--- a/src/core/NEON/kernels/NEMemsetKernel.cpp
+++ b/src/core/NEON/kernels/NEMemsetKernel.cpp
@@ -67,7 +67,7 @@
     collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
 
     Iterator tensor_it(_tensor, collapsed);
-    execute_window_loop(collapsed, [&](const Coordinates & id)
+    execute_window_loop(collapsed, [&](const Coordinates &)
     {
         uint8_t *base_addr = start_valid_region + tensor_it.offset();
         // Set memory
diff --git a/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp b/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp
index 5d1b4b3..fe3af0b 100644
--- a/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -144,7 +144,7 @@
         float carry_min_scalar = std::numeric_limits<float>::max();
         float carry_max_scalar = std::numeric_limits<float>::lowest();
 
-        execute_window_loop(window_input, [&](const Coordinates & id)
+        execute_window_loop(window_input, [&](const Coordinates &)
         {
             int        x      = x_start;
             const auto in_ptr = reinterpret_cast<const float *>(input.ptr() + id_batch[1] * _input->info()->strides_in_bytes()[3]);
@@ -203,7 +203,7 @@
 
     Iterator output(_output, window_output);
 
-    execute_window_loop(window_output, [&](const Coordinates & id)
+    execute_window_loop(window_output, [&](const Coordinates &)
     {
         vst1_f32(reinterpret_cast<float *>(output.ptr()), reset_values);
     },
diff --git a/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp b/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
index befece2..08b27e3 100644
--- a/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
+++ b/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -152,7 +152,7 @@
 
     Iterator input(_input, win);
 
-    execute_window_loop(win, [&](const Coordinates & id)
+    execute_window_loop(win, [&](const Coordinates &)
     {
         int x = x_start;
 
@@ -209,7 +209,7 @@
 
     Iterator input(_input, win);
 
-    execute_window_loop(win, [&](const Coordinates & id)
+    execute_window_loop(win, [&](const Coordinates &)
     {
         int        x      = x_start;
         const auto in_ptr = reinterpret_cast<const int16_t *>(input.ptr());
@@ -268,7 +268,7 @@
 
     Iterator input(_input, win);
 
-    execute_window_loop(win, [&](const Coordinates & id)
+    execute_window_loop(win, [&](const Coordinates &)
     {
         int        x      = x_start;
         const auto in_ptr = reinterpret_cast<const float *>(input.ptr());
@@ -323,11 +323,11 @@
 template <class T, std::size_t... N>
 struct NEMinMaxLocationKernel::create_func_table<T, utility::index_sequence<N...>>
 {
-    static const NEMinMaxLocationKernel::MinMaxLocFunction func_table[sizeof...(N)];
+    static const std::array<NEMinMaxLocationKernel::MinMaxLocFunction, sizeof...(N)> func_table;
 };
 
 template <class T, std::size_t... N>
-const NEMinMaxLocationKernel::MinMaxLocFunction NEMinMaxLocationKernel::create_func_table<T, utility::index_sequence<N...>>::func_table[sizeof...(N)] =
+const std::array<NEMinMaxLocationKernel::MinMaxLocFunction, sizeof...(N)> NEMinMaxLocationKernel::create_func_table<T, utility::index_sequence<N...>>::func_table
 {
     &NEMinMaxLocationKernel::minmax_loc<T, bool(N & 8), bool(N & 4), bool(N & 2), bool(N & 1)>...
 };
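For context on the func_table change above: the table of member-function pointers is generated at compile time by expanding a parameter pack over an index sequence, with each bit of the index decoded into a bool template argument. A self-contained sketch of the technique using std::index_sequence, free functions and two flags (hypothetical names):

#include <array>
#include <cstddef>
#include <utility>

template <bool b1, bool b0>
int kernel()
{
    return (int(b1) << 1) | int(b0); // stand-in for minmax_loc<...>
}

using Fn = int (*)();

template <std::size_t... N>
std::array<Fn, sizeof...(N)> make_table(std::index_sequence<N...>)
{
    // Pack expansion: one entry per N, bits of N mapped to bool arguments
    return { { &kernel<bool(N & 2), bool(N & 1)>... } };
}

static const std::array<Fn, 4> table = make_table(std::make_index_sequence<4>{});
// table[flags]() calls the specialization encoded by the 2-bit flags value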
diff --git a/src/core/NEON/kernels/NENonLinearFilterKernel.cpp b/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
index 52dbe26..00536f0 100644
--- a/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
+++ b/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -117,7 +117,7 @@
     sort(p4, p2);
 }
 
-inline void sort21(uint8x8_t p[21])
+inline void sort21(std::array<uint8x8_t, 21> &p)
 {
     sort(p[0], p[1]);
     sort(p[2], p[3]);
@@ -222,7 +222,7 @@
     sort(p[10], p[16]);
 }
 
-inline void sort25(uint8x8_t p[25])
+inline void sort25(std::array<uint8x8_t, 25> &p)
 {
     sort(p[1], p[2]);
     sort(p[0], p[1]);
@@ -429,7 +429,7 @@
     const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, 0)));
     const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, 1)));
 
-    execute_window_loop(win, [&](const Coordinates & id)
+    execute_window_loop(win, [&](const Coordinates &)
     {
         const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
         const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
@@ -463,7 +463,7 @@
     const auto input_bot_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
     const auto input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 2)));
 
-    execute_window_loop(win, [&](const Coordinates & id)
+    execute_window_loop(win, [&](const Coordinates &)
     {
         const uint8x16_t top2_data = vld1q_u8(input_top2_ptr + input.offset());
         const uint8x16_t top_data  = vld1q_u8(input_top_ptr + input.offset());
@@ -471,7 +471,7 @@
         const uint8x16_t bot_data  = vld1q_u8(input_bot_ptr + input.offset());
         const uint8x16_t bot2_data = vld1q_u8(input_bot2_ptr + input.offset());
 
-        const uint8x8_t d[] =
+        const std::array<uint8x8_t, 10> d =
         {
             vget_low_u8(top2_data),
             vget_high_u8(top2_data),
@@ -485,7 +485,7 @@
             vget_high_u8(bot2_data)
         };
 
-        uint8x8_t p[25];
+        std::array<uint8x8_t, 25> p{ 0 };
         for(unsigned int i = 0; i < 5; ++i)
         {
             const unsigned int idx_d = i * 2;
@@ -524,7 +524,7 @@
         input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i));
     }
 
-    execute_window_loop(win, [&](const Coordinates & id)
+    execute_window_loop(win, [&](const Coordinates &)
     {
         // Get min of rows
         uint8x16_t rows_min = vld1q_u8(input_ptrs[0] + input.offset());
@@ -563,7 +563,7 @@
         input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i));
     }
 
-    execute_window_loop(win, [&](const Coordinates & id)
+    execute_window_loop(win, [&](const Coordinates &)
     {
         uint8x16_t rows_max = vld1q_u8(input_ptrs[0] + input.offset());
 
@@ -593,7 +593,7 @@
     const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, 0)));
     const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, 1)));
 
-    execute_window_loop(win, [&](const Coordinates & id)
+    execute_window_loop(win, [&](const Coordinates &)
     {
         const uint8x8_t  top_data = vld1_u8(input_top_ptr + input.offset());
         const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
@@ -624,7 +624,7 @@
     const auto input_bot_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, 1)));
     const auto input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, 2)));
 
-    execute_window_loop(win, [&](const Coordinates & id)
+    execute_window_loop(win, [&](const Coordinates &)
     {
         const uint8x8_t  top2_data = vld1_u8(input_top2_ptr + input.offset());
         const uint8x8_t  top_data  = vld1_u8(input_top_ptr + input.offset());
@@ -671,7 +671,7 @@
         input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, i));
     }
 
-    execute_window_loop(win, [&](const Coordinates & id)
+    execute_window_loop(win, [&](const Coordinates &)
     {
         uint8x8_t rows_min = vld1_u8(input_ptrs[0] + input.offset());
 
@@ -717,7 +717,7 @@
         input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, i));
     }
 
-    execute_window_loop(win, [&](const Coordinates & id)
+    execute_window_loop(win, [&](const Coordinates &)
     {
         uint8x8_t rows_max = vld1_u8(input_ptrs[0] + input.offset());
 
@@ -754,7 +754,7 @@
     const auto              input_bot_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
     const auto              input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 2)));
 
-    execute_window_loop(win, [&](const Coordinates & id)
+    execute_window_loop(win, [&](const Coordinates &)
     {
         const uint8x16_t top2_data = vextq_u8(vld1q_u8(input_top2_ptr + input.offset()), zero, 1);
         const uint8x16_t top_data  = vld1q_u8(input_top_ptr + input.offset());
@@ -762,7 +762,7 @@
         const uint8x16_t bot_data  = vld1q_u8(input_bot_ptr + input.offset());
         const uint8x16_t bot2_data = vextq_u8(vld1q_u8(input_bot2_ptr + input.offset()), zero, 1);
 
-        uint8x8_t d[] =
+        std::array<uint8x8_t, 10> d =
         {
             vget_low_u8(top2_data),
             vget_high_u8(top2_data),
@@ -776,7 +776,7 @@
             vget_high_u8(bot2_data)
         };
 
-        uint8x8_t p[21];
+        std::array<uint8x8_t, 21> p{ 0 };
         p[0]  = d[0];
         p[1]  = vext_u8(d[0], d[1], 1);
         p[2]  = vext_u8(d[0], d[1], 2);
@@ -816,7 +816,7 @@
     const auto              input_bot_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
     const auto              input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 2)));
 
-    execute_window_loop(win, [&](const Coordinates & id)
+    execute_window_loop(win, [&](const Coordinates &)
     {
         const uint8x16_t top2_data = vextq_u8(vld1q_u8(input_top2_ptr + input.offset()), zero, 1);
         const uint8x16_t top_data  = vld1q_u8(input_top_ptr + input.offset());
@@ -849,7 +849,7 @@
     const auto              input_bot_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
     const auto              input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 2)));
 
-    execute_window_loop(win, [&](const Coordinates & id)
+    execute_window_loop(win, [&](const Coordinates &)
     {
         const uint8x16_t top2_data = vextq_u8(vld1q_u8(input_top2_ptr + input.offset()), zero, 1);
         const uint8x16_t top_data  = vld1q_u8(input_top_ptr + input.offset());
@@ -889,7 +889,7 @@
 
     std::array<uint8_t, mask_size> vals{ {} };
 
-    execute_window_loop(win, [&](const Coordinates & id)
+    execute_window_loop(win, [&](const Coordinates &)
     {
         // Clear array
         std::fill(std::begin(vals), std::end(vals), 0);
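The sort21()/sort25() helpers touched above are sorting networks: fixed sequences of compare-exchange steps on uint8x8_t values, so eight pixel lanes are median-filtered simultaneously and the middle element of the network's output is the median. Roughly, the building block and a 3-element network look like this (reconstructed for illustration; requires an ARM NEON target):

#include <arm_neon.h>

// Compare-exchange: after the call, a holds the lane-wise minima, b the maxima
inline void compare_exchange(uint8x8_t &a, uint8x8_t &b)
{
    const uint8x8_t lo = vmin_u8(a, b);
    const uint8x8_t hi = vmax_u8(a, b);
    a = lo;
    b = hi;
}

// 3-element network: afterwards p1 holds the lane-wise median of (p0, p1, p2)
inline void sort3(uint8x8_t &p0, uint8x8_t &p1, uint8x8_t &p2)
{
    compare_exchange(p0, p1);
    compare_exchange(p1, p2);
    compare_exchange(p0, p1);
}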
diff --git a/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp b/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp
index 8f97e6a..674a7c8 100644
--- a/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -506,7 +506,7 @@
 
     const size_t input_stride = _input->info()->strides_in_bytes()[1] / element_size_from_data_type(_input->info()->data_type());
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         _func(input.ptr(), output.ptr(), input_stride);
     },
diff --git a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
index a4f5143..fa16484 100644
--- a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
+++ b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,8 +28,11 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
 #include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 
 #include <arm_neon.h>
@@ -42,12 +45,9 @@
 #include <arm_fp16.h> // needed for float16_t
 #endif                /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
-using namespace arm_compute;
-
 namespace arm_compute
 {
 class Coordinates;
-} // namespace arm_compute
 
 namespace
 {
@@ -63,15 +63,29 @@
     ARM_COMPUTE_UNUSED(rounding_policy);
 
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input1);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
                                     "Output can only be U8 if both inputs are U8");
 
-    const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->data_type() == DataType::QASYMM8 && input2->data_type() != DataType::QASYMM8,
+                                    "Input2 must be QASYMM8 if input1 is QASYMM8");
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->data_type() == DataType::QASYMM8 && input2->data_type() == DataType::QASYMM8 && overflow_policy == ConvertPolicy::WRAP,
+                                    "ConvertPolicy cannot be WRAP if datatype is QASYMM8");
+
+    if(output->total_size() > 0)
+    {
+        if(output->data_type() == DataType::QASYMM8)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output);
+        }
+
+        const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+    }
 
     if(std::abs(scale - scale255_constant) < 0.00001f)
     {
@@ -159,6 +173,34 @@
     return vreinterpretq_u16_s16(vcombine_s16(vmovn_s32(tmp_s2), vmovn_s32(tmp_s1)));
 }
 
+void mul_saturate_QASYMM8_QASYMM8_QASYMM8_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale,
+                                            const QuantizationInfo &input1_qua_info, const QuantizationInfo &input2_qua_info, const QuantizationInfo &output_qua_info)
+{
+    const auto input1 = static_cast<const qasymm8_t *__restrict>(input1_ptr);
+    const auto input2 = static_cast<const qasymm8_t *__restrict>(input2_ptr);
+    const auto output = static_cast<qasymm8_t *__restrict>(output_ptr);
+
+    const qasymm8x16_t input1_q = vld1q_u8(input1);
+    const qasymm8x16_t input2_q = vld1q_u8(input2);
+
+    // Dequantize inputs
+    const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
+    const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);
+
+    const QuantizationInfo tmp_qua_info = QuantizationInfo(output_qua_info.scale / scale, output_qua_info.offset);
+
+    const float32x4x4_t out_f32x4x4 =
+    {
+        vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
+        vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
+        vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
+        vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3])
+    };
+
+    const uint8x16_t result = vquantize(out_f32x4x4, tmp_qua_info);
+    vst1q_u8(output, result);
+}
+
 template <bool is_scale255, bool is_sat>
 void mul_U8_U8_U8_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n)
 {
@@ -291,7 +333,6 @@
     vst2q_s16(output, result);
 }
 
-template <bool is_scale255, bool is_sat>
 void mul_F32_F32_F32_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale)
 {
     const auto input1 = static_cast<const float *__restrict>(input1_ptr);
@@ -313,7 +354,35 @@
     vst4q_f32(output, result);
 }
 
-template <bool is_scale255, bool is_sat>
+void c_mul_F32_F32_F32_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr)
+{
+    const auto input1 = static_cast<const float *__restrict>(input1_ptr);
+    const auto input2 = static_cast<const float *__restrict>(input2_ptr);
+    const auto output = static_cast<float *__restrict>(output_ptr);
+
+    const float32x4_t a = wrapper::vloadq(input1);
+    float32x4_t       b = wrapper::vloadq(input2);
+
+    using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type;
+
+    const float32x4_t mask  = { -1.0f, 1.0f, -1.0f, 1.0f };
+    const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
+    const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
+    const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
+    const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});
+
+    const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
+    const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);
+
+    float32x4_t res = wrapper::vmul(tmp0, b);
+
+    b = wrapper::vrev64(b);
+    b = wrapper::vmul(b, mask);
+
+    res = wrapper::vmla(res, tmp1, b);
+    wrapper::vstore(output, res);
+}
+
 void mul_F16_F16_F16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale)
 {
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
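c_mul_F32_F32_F32_n above multiplies two complex numbers per float32x4_t, stored as interleaved (re, im) pairs: duplicating the real and imaginary parts of a across lanes, reversing b within each pair and negating with the { -1, 1 } mask yields (ar*br - ai*bi, ar*bi + ai*br) from one multiply plus one multiply-accumulate. The scalar equivalent:

// One complex product, as the NEON path computes it:
// res = a_r * (b_r, b_i) + a_i * (-b_i, b_r)
void c_mul(const float a[2], const float b[2], float out[2])
{
    out[0] = a[0] * b[0] - a[1] * b[1]; // real part
    out[1] = a[0] * b[1] + a[1] * b[0]; // imaginary part
}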
@@ -419,7 +488,7 @@
 } // namespace
 
 NEPixelWiseMultiplicationKernel::NEPixelWiseMultiplicationKernel()
-    : _func_float(nullptr), _func_int(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _scale{ 0 }, _scale_exponent{ 0 }
+    : _func_float(nullptr), _func_int(nullptr), _func_qasymm8(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _scale{ 0 }, _scale_exponent{ 0 }
 {
 }
 
@@ -439,6 +508,7 @@
     _output         = output;
     _scale          = scale;
     _scale_exponent = 0;
+    _func_qasymm8   = nullptr;
     _func_int       = nullptr;
     _func_float     = nullptr;
 
@@ -464,7 +534,11 @@
     const DataType dt_output = output->info()->data_type();
     const bool     is_sat    = (overflow_policy == ConvertPolicy::SATURATE);
 
-    if(DataType::U8 == dt_input1 && DataType::U8 == dt_input2 && DataType::U8 == dt_output)
+    if(dt_input1 == DataType::QASYMM8 && dt_input2 == DataType::QASYMM8)
+    {
+        _func_qasymm8 = &mul_saturate_QASYMM8_QASYMM8_QASYMM8_n;
+    }
+    else if(DataType::U8 == dt_input1 && DataType::U8 == dt_input2 && DataType::U8 == dt_output)
     {
         if(is_scale_255)
         {
@@ -521,12 +595,12 @@
     }
     else if(DataType::F16 == dt_input1 && DataType::F16 == dt_input2 && DataType::F16 == dt_output)
     {
-        _func_float = &mul_F16_F16_F16_n<false, false>;
+        _func_float = &mul_F16_F16_F16_n;
         _func_int   = nullptr;
     }
     else if(DataType::F32 == dt_input1 && DataType::F32 == dt_input2 && DataType::F32 == dt_output)
     {
-        _func_float = &mul_F32_F32_F32_n<false, false>;
+        _func_float = &mul_F32_F32_F32_n;
         _func_int   = nullptr;
     }
     else
@@ -581,9 +655,20 @@
     Iterator input2(_input2, slice_input2);
     Iterator output(_output, slice);
 
-    if(_func_int != nullptr)
+    if(_func_qasymm8 != nullptr)
     {
-        execute_window_loop(collapsed, [&](const Coordinates & id)
+        execute_window_loop(collapsed, [&](const Coordinates &)
+        {
+            (*_func_qasymm8)(input1.ptr(), input2.ptr(), output.ptr(), _scale,
+                             _input1->info()->quantization_info(), _input2->info()->quantization_info(), _output->info()->quantization_info());
+            collapsed.slide_window_slice_3D(slice_input1);
+            collapsed.slide_window_slice_3D(slice_input2);
+        },
+        input1, input2, output);
+    }
+    else if(_func_int != nullptr)
+    {
+        execute_window_loop(collapsed, [&](const Coordinates &)
         {
             (*_func_int)(input1.ptr(), input2.ptr(), output.ptr(), _scale_exponent);
             collapsed.slide_window_slice_3D(slice_input1);
@@ -594,7 +679,7 @@
     else
     {
         ARM_COMPUTE_ERROR_ON(_func_float == nullptr);
-        execute_window_loop(collapsed, [&](const Coordinates & id)
+        execute_window_loop(collapsed, [&](const Coordinates &)
         {
             (*_func_float)(input1.ptr(), input2.ptr(), output.ptr(), _scale);
             collapsed.slide_window_slice_3D(slice_input1);
@@ -608,5 +693,113 @@
 {
     const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
     const unsigned int border        = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
-    return BorderSize(0, border, 0, 0);
+    return BorderSize{ 0, border, 0, 0 };
 }
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration_complex = 2;
+
+Status validate_arguments_complex(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 2, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 2, DataType::F32);
+
+    const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+
+    // Validate in case of configured output
+    if(output->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 2, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window_complex(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
+{
+    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
+    const TensorShape &out_shape    = broadcast_pair.first;
+    const ValidRegion &valid_region = broadcast_pair.second;
+
+    // Auto initialize output if not initialized
+    const TensorInfo out_info(out_shape, input1->num_channels(), input1->data_type());
+    auto_init_if_empty(*output, out_info);
+
+    Window win        = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration_complex));
+    Window win_input1 = win.broadcast_if_dimension_le_one(*input1);
+    Window win_input2 = win.broadcast_if_dimension_le_one(*input2);
+
+    AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration_complex);
+    AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration_complex);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration_complex);
+
+    bool window_changed = update_window_and_padding(win_input1, input1_access)
+                          || update_window_and_padding(win_input2, input2_access)
+                          || update_window_and_padding(win, output_access);
+
+    output_access.set_valid_region(win, valid_region);
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+NEComplexPixelWiseMultiplicationKernel::NEComplexPixelWiseMultiplicationKernel()
+    : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void NEComplexPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(input1->info(), input2->info(), output->info()));
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window_complex(input1->info(), input2->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
+    _input1 = input1;
+    _input2 = input2;
+    _output = output;
+
+    // Set the kernel window
+    INEKernel::configure(win_config.second);
+}
+
+Status NEComplexPixelWiseMultiplicationKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(input1, input2, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_complex(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
+
+    return Status{};
+}
+
+void NEComplexPixelWiseMultiplicationKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+    Iterator input1(_input1, window.broadcast_if_dimension_le_one(_input1->info()->tensor_shape()));
+    Iterator input2(_input2, window.broadcast_if_dimension_le_one(_input2->info()->tensor_shape()));
+    Iterator output(_output, window);
+
+    execute_window_loop(window, [&](const Coordinates &)
+    {
+        c_mul_F32_F32_F32_n(input1.ptr(), input2.ptr(), output.ptr());
+    },
+    input1, input2, output);
+}
+
+BorderSize NEComplexPixelWiseMultiplicationKernel::border_size() const
+{
+    const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+    const unsigned int border        = std::min<unsigned int>(num_elems_processed_per_iteration_complex - 1U, replicateSize);
+    return { 0, border, 0, 0 };
+}
+} // namespace arm_compute
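A minimal usage sketch for the new complex kernel, assuming two interleaved-complex F32 tensors of equal shape; the shape and function name are placeholders, not taken from the patch:

    #include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
    #include "arm_compute/runtime/NEON/NEScheduler.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void run_complex_multiply_example()
    {
        // num_channels == 2 marks the tensors as interleaved complex F32.
        const TensorInfo info(TensorShape(8U, 4U), 2, DataType::F32);
        Tensor in1, in2, out;
        in1.allocator()->init(info);
        in2.allocator()->init(info);
        out.allocator()->init(info);
        in1.allocator()->allocate();
        in2.allocator()->allocate();
        out.allocator()->allocate();

        NEComplexPixelWiseMultiplicationKernel kernel;
        kernel.configure(&in1, &in2, &out); // validates and sets the window
        NEScheduler::get().schedule(&kernel, Window::DimY);
    }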
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index d00a4af..ac2ffa1 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -138,7 +138,6 @@
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH)) != pooled_w)
                                     || (output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT)) != pooled_h));
     }
@@ -353,7 +352,8 @@
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(pool_size.x(), pool_size.y()));
 
     // Check output dimensions
-    unsigned int pooled_w, pooled_h;
+    unsigned int pooled_w;
+    unsigned int pooled_h;
     std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(idx_width),
                                                      input->info()->dimension(idx_height),
                                                      pool_size.x(),
@@ -640,6 +640,15 @@
             }
         }
 
+        const QuantizationInfo &input_qinfo  = _input->info()->quantization_info();
+        const QuantizationInfo &output_qinfo = _output->info()->quantization_info();
+        if(input_qinfo != output_qinfo)
+        {
+            const auto requantized_output = vquantize(vdequantize(vcombine_u8(lower_res, upper_res), input_qinfo), output_qinfo);
+            lower_res                     = vget_low_u8(requantized_output);
+            upper_res                     = vget_high_u8(requantized_output);
+        }
+
         // Store result
         if(pool_stride_x == 1)
         {
@@ -805,6 +814,9 @@
     const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
     const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
 
+    const QuantizationInfo &input_qinfo  = _input->info()->quantization_info();
+    const QuantizationInfo &output_qinfo = _output->info()->quantization_info();
+
     const uint8_t *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
     const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
     const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
@@ -814,6 +826,8 @@
         const auto top_data    = vld1q_u8(reinterpret_cast<const uint8_t *>(input_top_ptr + input.offset()));
         const auto middle_data = vld1q_u8(reinterpret_cast<const uint8_t *>(input_middle_ptr + input.offset()));
         const auto bottom_data = vld1q_u8(reinterpret_cast<const uint8_t *>(input_bottom_ptr + input.offset()));
+        uint8x8_t  fres        = {};
+        uint8x16_t fqres       = {};
 
         if(pooling_type == PoolingType::AVG)
         {
@@ -869,7 +883,7 @@
                 scale_vector_s16x8(exclude_padding, res, id, 0, 1,
                                    pool_size, upper_bound_w, upper_bound_h,
                                    pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
-                vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), vmovn_u16(res));
+                fres = vmovn_u16(res);
             }
             else
             {
@@ -881,8 +895,7 @@
                 scale_vector_s16x8(exclude_padding, final_sum.val[1], id, 8, 1,
                                    pool_size, upper_bound_w, upper_bound_h,
                                    pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
-                const uint8x16_t res = vcombine_u8(vmovn_u16(final_sum.val[0]), vmovn_u16(final_sum.val[1]));
-                vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), res);
+                fqres = vcombine_u8(vmovn_u16(final_sum.val[0]), vmovn_u16(final_sum.val[1]));
             }
         }
         else
@@ -896,14 +909,31 @@
             {
                 const uint8x8x2_t      table      = { { vget_low_u8(final_max), vget_high_u8(final_max) } };
                 static const uint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
-                const uint8x8_t        res        = vtbl2_u8(table, lookup_val);
-                vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), res);
+                fres                              = vtbl2_u8(table, lookup_val);
             }
             else
             {
-                vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), final_max);
+                fqres = final_max;
             }
         }
+
+        // Store result
+        if(pool_stride_x == 1)
+        {
+            if(input_qinfo != output_qinfo)
+            {
+                fqres = vquantize(vdequantize(fqres, input_qinfo), output_qinfo);
+            }
+            vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), fqres);
+        }
+        else
+        {
+            if(input_qinfo != output_qinfo)
+            {
+                fres = vquantize(vdequantize(fres, input_qinfo), output_qinfo);
+            }
+            vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), fres);
+        }
     },
     input, output);
 }
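When the input and output quantization of a pooling layer differ, the hunks above route results through float once per vector. Per element the round trip is q_out = clamp(round(s_in * (q_in - z_in) / s_out) + z_out, 0, 255); a scalar sketch with a hypothetical helper, rounding simplified to lround:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    inline uint8_t requantize(uint8_t q, float s_in, int z_in, float s_out, int z_out)
    {
        const float real = s_in * (static_cast<int>(q) - z_in); // dequantize
        const int   out  = static_cast<int>(std::lround(real / s_out)) + z_out;
        return static_cast<uint8_t>(std::max(0, std::min(255, out)));
    }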
@@ -1641,6 +1671,11 @@
         }
 
         // Store result
+        const QuantizationInfo &input_qinfo  = _input->info()->quantization_info();
+        const QuantizationInfo &output_qinfo = _output->info()->quantization_info();
+        res                                  = (input_qinfo != output_qinfo) ? sqcvt_qasymm8_f32(scvt_f32_qasymm8(res, input_qinfo.scale, input_qinfo.offset), output_qinfo.scale,
+                                                                                                 output_qinfo.offset) :
+                                               res;
         *(reinterpret_cast<uint8_t *>(output.ptr())) = res;
     },
     input, output);
@@ -1663,7 +1698,9 @@
     const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
     const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
 
-    const float32x4_t half_scale_v = vdupq_n_f32(0.5f);
+    const float32x4_t       half_scale_v = vdupq_n_f32(0.5f);
+    const QuantizationInfo &input_qinfo  = _input->info()->quantization_info();
+    const QuantizationInfo &output_qinfo = _output->info()->quantization_info();
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
@@ -1713,6 +1750,12 @@
 
             uint8x8_t res1 = vmovn_u16(vcombine_u16(vmovn_u32(vres1), vmovn_u32(vres2)));
             uint8x8_t res2 = vmovn_u16(vcombine_u16(vmovn_u32(vres3), vmovn_u32(vres4)));
+            if(input_qinfo != output_qinfo)
+            {
+                const auto requantized_output = vquantize(vdequantize(vcombine_u8(res1, res2), input_qinfo), output_qinfo);
+                res1                          = vget_low_u8(requantized_output);
+                res2                          = vget_high_u8(requantized_output);
+            }
 
             // Store result
             vst1_u8(output.ptr(), res1);
@@ -1733,7 +1776,7 @@
             }
 
             // Store result
-            vst1q_u8(output.ptr(), vres);
+            vst1q_u8(output.ptr(), (input_qinfo != output_qinfo) ? vquantize(vdequantize(vres, input_qinfo), output_qinfo) : vres);
         }
     },
     input, output);
diff --git a/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp b/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
index b49400a..4deeb1c 100644
--- a/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,152 +23,140 @@
  */
 #include "arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 
+#include "arm_compute/core/CPP/Validate.h"
+
 #include <arm_neon.h>
 
 using namespace arm_compute;
 
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, min_max);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() < 3);
-
-    if(output->tensor_shape().total_size() > 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-    }
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
 
     return Status{};
 }
 
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *min_max)
+inline float32x4x4_t load_value(const float *input_ptr)
 {
-    // Output tensor auto initialization if not yet initialized
-    auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::U8);
-
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-
-    // Configure window
-    Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-    AccessWindowStatic     min_max_access(min_max, 0, 0, 2, min_max->dimension(1));
-
-    // Update window and padding
-    bool window_changed = update_window_and_padding(win, input_access, output_access, min_max_access);
-
-    output_access.set_valid_region(win, input->valid_region());
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_tuple(err, win);
+    return { wrapper::vloadq(input_ptr),
+             wrapper::vloadq(input_ptr + 4),
+             wrapper::vloadq(input_ptr + 8),
+             wrapper::vloadq(input_ptr + 12) };
 }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+inline float32x4x4_t load_value(const float16_t *input_ptr)
+{
+    return { vcvt_f32_f16(wrapper::vload(input_ptr)),
+             vcvt_f32_f16(wrapper::vload(input_ptr + 4)),
+             vcvt_f32_f16(wrapper::vload(input_ptr + 8)),
+             vcvt_f32_f16(wrapper::vload(input_ptr + 12)) };
+}
+
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 } // namespace
 
 NEQuantizationLayerKernel::NEQuantizationLayerKernel()
-    : _input(nullptr), _output(nullptr), _min_max(nullptr)
+    : _input(nullptr), _output(nullptr)
 {
 }
 
-void NEQuantizationLayerKernel::configure(const ITensor *input, ITensor *output, const ITensor *min_max)
+void NEQuantizationLayerKernel::configure(const ITensor *input, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, min_max);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), min_max->info()));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
 
-    _input   = input;
-    _output  = output;
-    _min_max = min_max;
+    _input  = input;
+    _output = output;
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), output->info(), min_max->info());
+    Window win_config = calculate_max_window(*input->info(), Steps());
 
-    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+    Coordinates coord;
+    coord.set_num_dimensions(output->info()->num_dimensions());
+    output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
 
-    INEKernel::configure(std::get<1>(win_config));
+    INEKernel::configure(win_config);
 }
 
-Status NEQuantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+Status NEQuantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, min_max));
-    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), min_max->clone().get())));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
 
     return Status{};
 }
 
+template <typename T>
+void NEQuantizationLayerKernel::quantize(const Window &window, const QuantizationInfo &qinfo)
+{
+    constexpr auto window_step    = 16;
+    const auto     window_start_x = static_cast<int>(window.x().start());
+    const auto     window_end_x   = static_cast<int>(window.x().end());
+
+#ifdef __aarch64__
+    constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
+#else  //__aarch64__
+    constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO;
+#endif //__aarch64__
+
+    // Collapse window and reset first dimension to handle tail calculations manually
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator input(_input, win_collapsed);
+    Iterator output(_output, win_collapsed);
+    execute_window_loop(win_collapsed, [&](const Coordinates &)
+    {
+        auto input_ptr  = reinterpret_cast<const T *>(input.ptr());
+        auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+        int x = window_start_x;
+        for(; x <= (window_end_x - window_step); x += window_step)
+        {
+            wrapper::vstore(&output_ptr[x], vquantize(load_value(&input_ptr[x]), qinfo));
+        }
+        // Compute left-over elements
+        for(; x < window_end_x; ++x)
+        {
+            output_ptr[x] = qinfo.quantize(input_ptr[x], rounding_policy);
+        }
+    },
+    input, output);
+}
+
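For the scalar tail above, the per-element operation is the usual affine quantization q = clamp(round(x / scale) + offset, 0, 255). A sketch under that assumption, where lround stands in for the kernel's rounding policy (round-to-nearest-even on AArch64, truncation elsewhere):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    inline uint8_t quantize_scalar(float x, float scale, int offset)
    {
        const int q = static_cast<int>(std::lround(x / scale)) + offset;
        return static_cast<uint8_t>(std::max(0, std::min(255, q)));
    }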
 void NEQuantizationLayerKernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    Window window_input_output(window);
-    window_input_output.set(3, Window::Dimension(0, 1, 1));
+    const QuantizationInfo &qinfo = _output->info()->quantization_info();
 
-    Window window_min_max;
-    window_min_max.use_tensor_dimensions(_min_max->info()->tensor_shape());
-    window_min_max.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator input(_input, window_input_output);
-    Iterator output(_output, window_input_output);
-    Iterator min_max(_min_max, window_min_max);
-
-    execute_window_loop(window_min_max, [&](const Coordinates & id_batch)
+    switch(_input->info()->data_type())
     {
-        // Get the min and max
-        float min = *(reinterpret_cast<const float *>(min_max.ptr()) + 0);
-        float max = *(reinterpret_cast<const float *>(min_max.ptr()) + 1);
-
-        // Saturate the result if min = max
-        if(min == max)
-        {
-            min = 0.0f;
-            max = 1.0f;
-        }
-
-        const float32x4_t vmin             = vdupq_n_f32(min);
-        const float32x4_t inv_range        = vdupq_n_f32(1.0f / (max - min));
-        const float32x4_t quantization_max = vdupq_n_f32(255.0f);
-        const float32x4_t quantization_mul = vdupq_n_f32(256.0f);
-
-        // Uniformly map values to range 8bit integers, i.e. [min, max] -> [0, 255]
-        execute_window_loop(window_input_output, [&](const Coordinates & id)
-        {
-            // Get the input values
-            const auto    input_ptr = reinterpret_cast<const float *>(input.ptr() + id_batch[1] * _input->info()->strides_in_bytes()[3]);
-            float32x4x2_t val       = vld2q_f32(input_ptr);
-
-            // Map float values to range [0.0, 1.0]
-            val.val[0] = vsubq_f32(val.val[0], vmin);
-            val.val[1] = vsubq_f32(val.val[1], vmin);
-            val.val[0] = vmulq_f32(val.val[0], inv_range);
-            val.val[1] = vmulq_f32(val.val[1], inv_range);
-
-            // Quantize
-            val.val[0] = vmulq_f32(val.val[0], quantization_mul);
-            val.val[1] = vmulq_f32(val.val[1], quantization_mul);
-            val.val[0] = vminq_f32(val.val[0], quantization_max);
-            val.val[1] = vminq_f32(val.val[1], quantization_max);
-
-            const uint32x4_t   val_u32_low  = vcvtq_u32_f32(val.val[0]);
-            const uint32x4_t   val_u32_high = vcvtq_u32_f32(val.val[1]);
-            const uint16x4x2_t val_u16      = vzip_u16(vmovn_u32(val_u32_low), vmovn_u32(val_u32_high));
-
-            const uint8x8_t quantized = vmovn_u16(vcombine_u16(val_u16.val[0], val_u16.val[1]));
-
-            // Store the quantized values
-            auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr() + id_batch[1] * _output->info()->strides_in_bytes()[3]);
-            vst1_u8(output_ptr, quantized);
-        },
-        input, output);
-    },
-    min_max);
+        case DataType::F32:
+            NEQuantizationLayerKernel::quantize<float>(window, qinfo);
+            break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+            NEQuantizationLayerKernel::quantize<float16_t>(window, qinfo);
+            break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        default:
+            ARM_COMPUTE_ERROR("Unsupported data type.");
+    }
 }
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
index 84cb223..aa20d1f 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -602,7 +602,7 @@
     {
         ARM_COMPUTE_UNUSED(out_slice);
 
-        execute_window_loop(in_slice, [&](const Coordinates & id)
+        execute_window_loop(in_slice, [&](const Coordinates &)
         {
             neon_vector vec_res_value = { 0 };
             if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN)
@@ -688,13 +688,70 @@
     }
 };
 
+template <typename T, int S, int axis, ReductionOperation op>
+struct RedOpYZW_complex
+{
+    /** NEON vector tag type. */
+    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+    using neon_vector  = typename wrapper::traits::neon_vector<T, S>::type;
+
+    inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int, const ReductionOperation)
+    {
+        ARM_COMPUTE_UNUSED(out_slice);
+        ARM_COMPUTE_ERROR_ON(axis != 2);
+
+        const size_t stride_z = in_info.strides_in_bytes()[axis];
+
+        execute_window_loop(in_slice, [&](const Coordinates &)
+        {
+            neon_vector vec_res_value_0 = { 0 };
+            neon_vector vec_res_value_1 = { 0 };
+
+            vec_res_value_0 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+            vec_res_value_1 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+
+            for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+            {
+                T *in_ptr_0;
+                T *in_ptr_1;
+                switch(axis)
+                {
+                    case 2:
+                        in_ptr_0 = reinterpret_cast<T *>(input.ptr() + stride_z * dim);
+                        in_ptr_1 = reinterpret_cast<T *>(input.ptr() + 16 + stride_z * dim);
+                        break;
+                    default:
+                        ARM_COMPUTE_ERROR("Not supported");
+                }
+                const auto vec_elements_0 = wrapper::vloadq(in_ptr_0);
+                const auto vec_elements_1 = wrapper::vloadq(in_ptr_1);
+
+                switch(op)
+                {
+                    case ReductionOperation::SUM:
+                        vec_res_value_0 = wrapper::vadd(vec_elements_0, vec_res_value_0);
+                        vec_res_value_1 = wrapper::vadd(vec_elements_1, vec_res_value_1);
+                        break;
+                    default:
+                        ARM_COMPUTE_ERROR("Not supported");
+                }
+            }
+
+            wrapper::vstore(reinterpret_cast<T *>(output.ptr()), vec_res_value_0);
+            wrapper::vstore(reinterpret_cast<T *>(output.ptr() + 16), vec_res_value_1);
+
+        },
+        input, output);
+    }
+};
+
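RedOpYZW_complex keeps the real and imaginary parts interleaved while it sums along Z: the two 16-byte loads per step cover eight consecutive floats (four complex values), and every lane accumulates independently, so no deinterleave is needed. A scalar rendering of the same reduction (illustrative only):

    // Sum `depth` slices of eight interleaved floats (four complex values).
    void reduce_z_complex(const float *in, float *out, int depth, int stride_z_elems)
    {
        for(int lane = 0; lane < 8; ++lane)
        {
            float acc = 0.f;
            for(int z = 0; z < depth; ++z)
            {
                acc += in[z * stride_z_elems + lane]; // real and imaginary lanes add independently
            }
            out[lane] = acc;
        }
    }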
 struct RedOpYZW_qasymm8
 {
     inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int axis, const ReductionOperation op)
     {
         ARM_COMPUTE_UNUSED(out_slice);
 
-        execute_window_loop(in_slice, [&](const Coordinates & id)
+        execute_window_loop(in_slice, [&](const Coordinates &)
         {
             uint32x4x4_t vec_res_idx{ { 0 } };
             auto         vec_res_value1 = vdupq_n_u32(0);
@@ -848,6 +905,31 @@
 
 void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsigned int axis, const ReductionOperation op)
 {
+    const bool is_complex = (input->info()->num_channels() == 2);
+
+    if(is_complex)
+    {
+        switch(axis)
+        {
+            case 2:
+                switch(input->info()->data_type())
+                {
+                    case DataType::F32:
+                        switch(op)
+                        {
+                            case ReductionOperation::SUM:
+                                return Reducer<RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>>::reduceZ(window, input, output, RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>(), op);
+                            default:
+                                ARM_COMPUTE_ERROR("Not supported");
+                        }
+                    default:
+                        ARM_COMPUTE_ERROR("Not supported");
+                }
+            default:
+                ARM_COMPUTE_ERROR("Not supported");
+        }
+    }
+
     switch(axis)
     {
         case 0:
@@ -917,7 +999,17 @@
 
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+
+    if(input->num_channels() == 1)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON(op != ReductionOperation::SUM);
+        ARM_COMPUTE_RETURN_ERROR_ON(axis != 2);
+    }
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
@@ -929,12 +1021,12 @@
         {
             ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
             ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+            ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != output->num_channels());
         }
         else
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32);
         }
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
 
         const TensorShape output_shape         = arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis);
         const TensorInfo  tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape);
@@ -952,7 +1044,7 @@
     // Output auto initialization if not yet initialized
     const bool is_arg_min_max   = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX);
     DataType   output_data_type = is_arg_min_max ? DataType::U32 : input->data_type();
-    auto_init_if_empty(*output, output_shape, 1, output_data_type, input->quantization_info());
+    auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
 
     unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->data_type());
 
diff --git a/src/core/NEON/kernels/NERemapKernel.cpp b/src/core/NEON/kernels/NERemapKernel.cpp
index edb3ffe..3c871de 100644
--- a/src/core/NEON/kernels/NERemapKernel.cpp
+++ b/src/core/NEON/kernels/NERemapKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -113,8 +113,8 @@
     AccessWindowStatic input_access(input->info(), -border_size().left, -border_size().top, access_right, input->info()->dimension(1) + border_size().bottom);
 
     AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal mapx_access(map_x->info(),    0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal mapy_access(map_y->info(),    0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal mapx_access(map_x->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal mapy_access(map_y->info(), 0, num_elems_processed_per_iteration);
 
     update_window_and_padding(win, input_access, mapx_access, mapy_access, output_access);
 
@@ -140,7 +140,7 @@
     const float32x4_t height    = vdupq_n_f32(static_cast<float>(_input->info()->dimension(1)));
     const int32x4_t   in_stride = vdupq_n_s32(static_cast<int32_t>(_input->info()->strides_in_bytes()[1]));
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const auto     mapx_ptr = reinterpret_cast<const float *>(mapx.ptr());
         const auto     mapy_ptr = reinterpret_cast<const float *>(mapy.ptr());
@@ -190,7 +190,7 @@
     const size_t height    = _input->info()->dimension(1);
     const size_t in_stride = _input->info()->strides_in_bytes()[1];
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const auto     mapx_ptr = reinterpret_cast<float *>(mapx.ptr());
         const auto     mapy_ptr = reinterpret_cast<float *>(mapy.ptr());
diff --git a/src/core/NEON/kernels/NEReorgLayerKernel.cpp b/src/core/NEON/kernels/NEReorgLayerKernel.cpp
index 8baea2b..ece5aa4 100644
--- a/src/core/NEON/kernels/NEReorgLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEReorgLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -67,9 +67,47 @@
 }
 } // namespace
 
-template <typename T>
-void NEReorgLayerKernel::run_reorg(const Window &window)
+NEReorgLayerKernel::NEReorgLayerKernel()
+    : _input(nullptr), _output(nullptr), _stride(1)
 {
+}
+
+void NEReorgLayerKernel::configure(const ITensor *input, ITensor *output, int32_t stride)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    // Output auto initialization if not yet initialized
+    const TensorShape output_shape = misc::shape_calculator::compute_reorg_output_shape(*input->info(), stride);
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), stride));
+
+    _input  = input;
+    _output = output;
+    _stride = stride;
+
+    // The NEReorgLayerKernel doesn't need padding so update_window_and_padding() can be skipped
+    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    // Configure kernel window
+    Window win = calculate_max_window(*output->info(), Steps());
+
+    ICPPKernel::configure(win);
+}
+
+Status NEReorgLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t stride)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, stride));
+    return Status{};
+}
+
+void NEReorgLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
+
     const DataLayout data_layout = _input->info()->data_layout();
     const size_t     idx_w       = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
     const size_t     idx_h       = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
@@ -101,72 +139,8 @@
         map_coords.set(idx_c, c % out_c);
 
         // Perform mapping
-        *(reinterpret_cast<T *>(out.ptr())) = *(reinterpret_cast<const T *>(in_ptr + _input->info()->offset_element_in_bytes(map_coords)));
+        std::memcpy(out.ptr(), in_ptr + _input->info()->offset_element_in_bytes(map_coords), _input->info()->element_size());
     },
     out);
 }
-
-NEReorgLayerKernel::NEReorgLayerKernel()
-    : _func(nullptr), _input(nullptr), _output(nullptr), _stride(1)
-{
-}
-
-void NEReorgLayerKernel::configure(const ITensor *input, ITensor *output, int32_t stride)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    // Output auto inizialitation if not yet initialized
-    const TensorShape output_shape = misc::shape_calculator::compute_reorg_output_shape(*input->info(), stride);
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
-
-    // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), stride));
-
-    _func   = nullptr;
-    _input  = input;
-    _output = output;
-    _stride = stride;
-
-    switch(input->info()->element_size())
-    {
-        case 1:
-            _func = &NEReorgLayerKernel::run_reorg<uint8_t>;
-            break;
-        case 2:
-            _func = &NEReorgLayerKernel::run_reorg<uint16_t>;
-            break;
-        case 4:
-            _func = &NEReorgLayerKernel::run_reorg<uint32_t>;
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Element size not supported");
-            break;
-    }
-
-    // The NEReorgLayerKernel doesn't need padding so update_window_and_padding() can be skipped
-    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-    // Configure kernel window
-    Window win = calculate_max_window(*output->info(), Steps());
-
-    ICPPKernel::configure(win);
-}
-
-Status NEReorgLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t stride)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, stride));
-    return Status{};
-}
-
-void NEReorgLayerKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
-
-    if(_func != nullptr)
-    {
-        (this->*_func)(window);
-    }
-}
 } // namespace arm_compute
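The rewrite above rests on one observation: reorg only relocates elements, it never interprets them, so a std::memcpy of element_size() bytes replaces the three run_reorg<T> instantiations and the function-pointer dispatch. The idea in isolation (hypothetical helper):

    #include <cstddef>
    #include <cstring>

    // Copy one element of unknown type; only its size matters.
    inline void copy_element(void *dst, const void *src, std::size_t element_size)
    {
        std::memcpy(dst, src, element_size); // correct for 1-, 2- and 4-byte types alike
    }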
diff --git a/src/core/NEON/kernels/NEReverseKernel.cpp b/src/core/NEON/kernels/NEReverseKernel.cpp
index 62e4882..36398cf 100644
--- a/src/core/NEON/kernels/NEReverseKernel.cpp
+++ b/src/core/NEON/kernels/NEReverseKernel.cpp
@@ -189,31 +189,21 @@
     switch(_input->info()->data_type())
     {
         case DataType::F32:
-            run_reverse<float>(window, _input, _axis, _output);
+        case DataType::U32:
+        case DataType::S32:
+            run_reverse<uint32_t>(window, _input, _axis, _output);
             break;
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F16:
-            run_reverse<float16_t>(window, _input, _axis, _output);
-            break;
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        case DataType::U32:
-            run_reverse<uint32_t>(window, _input, _axis, _output);
-            break;
-        case DataType::S32:
-            run_reverse<int32_t>(window, _input, _axis, _output);
-            break;
         case DataType::S16:
-            run_reverse<int16_t>(window, _input, _axis, _output);
-            break;
         case DataType::U16:
             run_reverse<uint16_t>(window, _input, _axis, _output);
             break;
         case DataType::QASYMM8:
         case DataType::U8:
-            run_reverse<uint8_t>(window, _input, _axis, _output);
-            break;
         case DataType::S8:
-            run_reverse<int8_t>(window, _input, _axis, _output);
+            run_reverse<uint8_t>(window, _input, _axis, _output);
             break;
         default:
             ARM_COMPUTE_ERROR("Data type not supported");
diff --git a/src/core/NEON/kernels/NEScaleKernel.cpp b/src/core/NEON/kernels/NEScaleKernel.cpp
index 3d300ef..3354039 100644
--- a/src/core/NEON/kernels/NEScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEScaleKernel.cpp
@@ -45,7 +45,7 @@
 {
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *dx, const ITensorInfo *dy,
                           const ITensorInfo *offsets, ITensorInfo *output, InterpolationPolicy policy,
-                          BorderMode border_mode, SamplingPolicy sampling_policy)
+                          BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, bool use_padding)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32, DataType::QASYMM8);
@@ -53,7 +53,8 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON(output == input);
     ARM_COMPUTE_RETURN_ERROR_ON(sampling_policy != SamplingPolicy::CENTER && sampling_policy != SamplingPolicy::TOP_LEFT);
-    ARM_COMPUTE_UNUSED(border_mode);
+    ARM_COMPUTE_RETURN_ERROR_ON(!use_padding && border_mode != BorderMode::CONSTANT);
+    ARM_COMPUTE_UNUSED(constant_border_value);
 
     const DataLayout data_layout = input->data_layout();
     ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH)) == 0);
@@ -121,40 +122,44 @@
 
 std::pair<Status, Window> validate_and_configure_window_nhwc(ITensorInfo *input, ITensorInfo *output,
                                                              InterpolationPolicy policy, bool border_undefined,
-                                                             SamplingPolicy sampling_policy, BorderSize border_size)
+                                                             SamplingPolicy sampling_policy, BorderSize border_size, bool use_padding)
 {
     bool   window_changed{ false };
     Window win{};
 
-    const unsigned int num_elems_processed_per_iteration = (policy == InterpolationPolicy::NEAREST_NEIGHBOR) ? 16 / input->element_size() : 1;
+    const unsigned int num_elems_processed_per_iteration = (use_padding && policy == InterpolationPolicy::NEAREST_NEIGHBOR) ? 16 / input->element_size() : 1;
 
     // Configure kernel window
     win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
 
-    AccessWindowStatic input_access(input, 0, -border_size.top,
-                                    ceil_to_multiple(input->tensor_shape()[0], num_elems_processed_per_iteration),
-                                    input->tensor_shape()[1]);
-    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
-    window_changed = update_window_and_padding(win, input_access, output_access);
-    output->set_valid_region(calculate_valid_region_scale(*input, output->tensor_shape(),
-                                                          policy, sampling_policy, border_undefined));
+    if(use_padding)
+    {
+        AccessWindowStatic input_access(input, 0, -border_size.top, ceil_to_multiple(input->tensor_shape()[0], num_elems_processed_per_iteration),
+                                        input->tensor_shape()[1]);
+        AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+        window_changed = update_window_and_padding(win, input_access, output_access);
+        output->set_valid_region(calculate_valid_region_scale(*input, output->tensor_shape(), policy, sampling_policy, border_undefined));
+    }
 
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, win);
 }
 
 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *dx, ITensorInfo *dy, ITensorInfo *offsets, ITensorInfo *output,
-                                                        InterpolationPolicy policy, bool border_undefined, SamplingPolicy sampling_policy, BorderSize border_size)
+                                                        InterpolationPolicy policy, bool border_undefined, SamplingPolicy sampling_policy, BorderSize border_size, bool use_padding)
 {
     std::pair<Status, Window> win_config;
     switch(input->data_layout())
     {
         case DataLayout::NCHW:
+            if(!use_padding)
+            {
+                return std::make_pair(ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Padding required for NCHW"), Window{});
+            }
             win_config = validate_and_configure_window_nchw(input, dx, dy, offsets, output, policy, border_undefined, sampling_policy, border_size);
             break;
         case DataLayout::NHWC:
-            win_config = validate_and_configure_window_nhwc(input, output, policy, border_undefined, sampling_policy, border_size);
+            win_config = validate_and_configure_window_nhwc(input, output, policy, border_undefined, sampling_policy, border_size, use_padding);
             break;
         default:
             win_config = std::make_pair(ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported data layout!"), Window{});
@@ -167,6 +172,12 @@
 inline void scale_nearest_nhwc_core(const ITensor *input, const ITensor *offsets, ITensor *output,
                                     float hr, Window window, const Window &win_in, size_t stride_w, size_t stride_h, size_t stride_c)
 {
+    const int  window_step_x  = 16 / sizeof(T);
+    const auto window_start_x = static_cast<int32_t>(window.x().start());
+    const auto window_end_x   = static_cast<int32_t>(window.x().end());
+
+    window.set(Window::DimX, Window::Dimension(0, 1, 1));
+
     Iterator in(input, win_in);
     Iterator out(output, window);
 
@@ -174,18 +185,28 @@
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
-        const auto offset     = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
-        const int  in_yi      = (id.z() + 0.5f) * hr;
-        const int  offset_row = in_yi * stride_h + id.x() * stride_c;
-        wrapper::vstore(reinterpret_cast<T *>(out.ptr()),
-                        wrapper::vloadq(reinterpret_cast<const T *>(in.ptr() + offset * offsets_stride + offset_row)));
+        const int32_t offset     = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
+        const int     in_yi      = (id.z() + 0.5f) * hr;
+        const int     offset_row = in_yi * stride_h;
+        int32_t       x          = window_start_x;
+        for(; x < window_end_x - window_step_x; x += window_step_x)
+        {
+            wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x,
+                            wrapper::vloadq(reinterpret_cast<const T *>(in.ptr() + offset * offsets_stride + offset_row + x * stride_c)));
+        }
+        for(; x < window_end_x; ++x)
+        {
+            *(reinterpret_cast<T *>(out.ptr()) + x) =
+                *(reinterpret_cast<const T *>(in.ptr() + offset * offsets_stride + offset_row + x * stride_c));
+        }
     },
     in, out);
 }
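The hunk above moves scale_nearest_nhwc_core to the main-loop-plus-leftover pattern also used in NEQuantizationLayerKernel: DimX is collapsed to a single step and the lambda walks x itself, so the tensor needs no right padding. The pattern in isolation, with a plain copy standing in for the vloadq/vstore body (illustrative only):

    void copy_with_tail(const float *in, float *out, int n)
    {
        constexpr int step = 16 / sizeof(float); // one 128-bit vector of floats
        int x = 0;
        for(; x <= n - step; x += step)
        {
            for(int i = 0; i < step; ++i) // stands in for the vector body
            {
                out[x + i] = in[x + i];
            }
        }
        for(; x < n; ++x) // scalar leftovers
        {
            out[x] = in[x];
        }
    }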
 
-template <typename T>
+template <typename T, typename ConstType>
 inline void scale_bilinear_nhwc_core(const ITensor *input, const ITensor *offsets, const ITensor *dx, const ITensor *dy, ITensor *output,
-                                     float hr, float sampling_offset, Window window, const Window &win_in, size_t stride_w, size_t stride_h, size_t stride_c, BorderMode border_mode)
+                                     float hr, float sampling_offset, Window window, const Window &win_in, size_t stride_w, size_t stride_h,
+                                     size_t stride_c, BorderMode border_mode, PixelValue constant_border_value, bool use_padding)
 {
     Iterator in(input, win_in);
     Iterator out(output, window);
@@ -196,7 +217,15 @@
     const int input_width  = input->info()->dimension(1);
     const int input_height = input->info()->dimension(2);
 
-    const T *border_area = reinterpret_cast<T *>(input->buffer() + input->info()->offset_first_element_in_bytes() - stride_w);
+    T border_value;
+    if(use_padding)
+    {
+        border_value = *reinterpret_cast<T *>(input->buffer() + input->info()->offset_first_element_in_bytes() - stride_w);
+    }
+    else
+    {
+        border_value = static_cast<T>(constant_border_value.get<ConstType>());
+    }
 
     auto is_valid = [](int x, int low_x, int high_x, int y, int low_y, int high_y)
     {
@@ -220,14 +249,17 @@
 
         if(is_valid(offset, -border_size, input_width - 1 + border_size, in_yi, -border_size, input_height - 1 + border_size))
         {
-            T a00 = 0, a01 = 0, a10 = 0, a11 = 0;
+            T a00 = 0;
+            T a01 = 0;
+            T a10 = 0;
+            T a11 = 0;
 
             if(border_mode == BorderMode::CONSTANT)
             {
-                a00 = is_valid(offset, 0, input_width - 1, in_yi, 0, input_height - 1) ? *in_ptr : *border_area;
-                a01 = is_valid(offset + 1, 0, input_width - 1, in_yi, 0, input_height - 1) ? *(in_ptr + stride_w_elems) : *border_area;
-                a10 = is_valid(offset, 0, input_width - 1, in_yi + 1, 0, input_height - 1) ? *(in_ptr + stride_h_elems) : *border_area;
-                a11 = is_valid(offset + 1, 0, input_width - 1, in_yi + 1, 0, input_height - 1) ? *(in_ptr + stride_h_elems + stride_w_elems) : *border_area;
+                a00 = is_valid(offset, 0, input_width - 1, in_yi, 0, input_height - 1) ? *in_ptr : border_value;
+                a01 = is_valid(offset + 1, 0, input_width - 1, in_yi, 0, input_height - 1) ? *(in_ptr + stride_w_elems) : border_value;
+                a10 = is_valid(offset, 0, input_width - 1, in_yi + 1, 0, input_height - 1) ? *(in_ptr + stride_h_elems) : border_value;
+                a11 = is_valid(offset + 1, 0, input_width - 1, in_yi + 1, 0, input_height - 1) ? *(in_ptr + stride_h_elems + stride_w_elems) : border_value;
             }
             else if(border_mode == BorderMode::REPLICATE)
             {
@@ -279,7 +311,7 @@
         {
             if(border_mode == BorderMode::CONSTANT)
             {
-                *reinterpret_cast<T *>(out.ptr()) = *border_area;
+                *reinterpret_cast<T *>(out.ptr()) = border_value;
             }
             else if(border_mode == BorderMode::REPLICATE)
             {
@@ -294,7 +326,8 @@
 } // namespace
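For reference, the bilinear core above blends the four neighbours a00..a11 fetched with the border handling just shown, a01 being the sample at offset + 1 and a10 the sample at in_yi + 1; with fractional offsets dx, dy in [0, 1) the output is the standard weighted sum. Scalar form (illustrative only):

    inline float bilinear(float a00, float a01, float a10, float a11, float dx, float dy)
    {
        const float w00 = (1.f - dx) * (1.f - dy);
        const float w01 = dx * (1.f - dy);
        const float w10 = (1.f - dx) * dy;
        const float w11 = dx * dy;
        return a00 * w00 + a01 * w01 + a10 * w10 + a11 * w11;
    }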
 
 NEScaleKernel::NEScaleKernel()
-    : _func(nullptr), _offsets(nullptr), _dx(nullptr), _dy(nullptr), _input(nullptr), _output(nullptr), _policy(), _border_size(1), _border_mode(), _sampling_offset(0)
+    : _func(nullptr), _offsets(nullptr), _dx(nullptr), _dy(nullptr), _input(nullptr), _output(nullptr), _policy(), _border_size(1), _border_mode(), _constant_border_value(PixelValue()),
+      _sampling_offset(0), _use_padding(true)
 {
 }
 
@@ -304,31 +337,33 @@
 }
 
 void NEScaleKernel::configure(const ITensor *input, const ITensor *dx, const ITensor *dy, const ITensor *offsets,
-                              ITensor *output, InterpolationPolicy policy, BorderMode border_mode, SamplingPolicy sampling_policy)
+                              ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy,
+                              bool use_padding)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
     // Perform validation step
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
                                                   dx != nullptr ? dx->info() : nullptr,
                                                   dy != nullptr ? dy->info() : nullptr,
                                                   offsets != nullptr ? offsets->info() : nullptr,
                                                   output->info(),
-                                                  policy, border_mode, sampling_policy));
+                                                  policy, border_mode, constant_border_value, sampling_policy, use_padding));
 
     // Get data layout and width/height indices
     const DataLayout data_layout = input->info()->data_layout();
     const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
     const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
 
-    _input       = input;
-    _output      = output;
-    _offsets     = offsets;
-    _dx          = dx;
-    _dy          = dy;
-    _policy      = policy;
-    _border_size = BorderSize(1);
-    _border_mode = border_mode;
+    _input                 = input;
+    _output                = output;
+    _offsets               = offsets;
+    _dx                    = dx;
+    _dy                    = dy;
+    _policy                = policy;
+    _border_size           = BorderSize(1);
+    _border_mode           = border_mode;
+    _constant_border_value = constant_border_value;
+    _use_padding           = use_padding;
 
     if(sampling_policy == SamplingPolicy::CENTER)
     {
@@ -342,7 +377,7 @@
     // Add constant border only on top in case of NHWC layout
     if(data_layout == DataLayout::NHWC)
     {
-        _border_size = (border_mode == BorderMode::CONSTANT && policy == InterpolationPolicy::BILINEAR) ? BorderSize(1, 0, 0, 0) : BorderSize(0);
+        _border_size = (border_mode == BorderMode::CONSTANT && policy == InterpolationPolicy::BILINEAR && use_padding) ? BorderSize(1, 0, 0, 0) : BorderSize(0);
     }
 
     // Area interpolation behaves as Nearest Neighbour in case of up-sampling
@@ -379,7 +414,8 @@
                                                                          dy != nullptr ? dy->info() : nullptr,
                                                                          offsets != nullptr ? offsets->info() : nullptr,
                                                                          output->info(),
-                                                                         policy, border_mode == BorderMode::UNDEFINED, sampling_policy, border_size());
+                                                                         policy, border_mode == BorderMode::UNDEFINED, sampling_policy, border_size(), use_padding);
+
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     INEKernel::configure(win_config.second);
 }
@@ -904,8 +940,8 @@
             }
             else
             {
-                scale_bilinear_nhwc_core<uint8_t>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
-                                                  window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode);
+                scale_bilinear_nhwc_core<uint8_t, uint8_t>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
+                                                           window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode, _constant_border_value, _use_padding);
             }
             break;
         }
@@ -917,8 +953,8 @@
             }
             else
             {
-                scale_bilinear_nhwc_core<int16_t>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
-                                                  window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode);
+                scale_bilinear_nhwc_core<int16_t, int16_t>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
+                                                           window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode, _constant_border_value, _use_padding);
             }
             break;
         }
@@ -932,8 +968,8 @@
             }
             else
             {
-                scale_bilinear_nhwc_core<float16_t>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
-                                                    window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode);
+                scale_bilinear_nhwc_core<float16_t, half>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
+                                                          window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode, _constant_border_value, _use_padding);
             }
             break;
         }
@@ -946,8 +982,8 @@
             }
             else
             {
-                scale_bilinear_nhwc_core<float>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
-                                                window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode);
+                scale_bilinear_nhwc_core<float, float>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
+                                                       window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode, _constant_border_value, _use_padding);
             }
             break;
         }
@@ -959,7 +995,7 @@
 
 Status NEScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *dx, const ITensorInfo *dy,
                                const ITensorInfo *offsets, ITensorInfo *output, InterpolationPolicy policy,
-                               BorderMode border_mode, SamplingPolicy sampling_policy)
+                               BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, bool use_padding)
 {
     BorderSize border_size(1);
     if(input->data_layout() == DataLayout::NHWC)
@@ -967,13 +1003,13 @@
         border_size = (border_mode == BorderMode::CONSTANT && policy == InterpolationPolicy::BILINEAR) ? BorderSize(1, 0, 0, 0) : BorderSize(0);
     }
 
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, dx, dy, offsets, output, policy, border_mode, sampling_policy));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, dx, dy, offsets, output, policy, border_mode, constant_border_value, sampling_policy, use_padding));
     ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
                                                               dx != nullptr ? dx->clone().get() : nullptr,
                                                               dy != nullptr ? dy->clone().get() : nullptr,
                                                               offsets != nullptr ? offsets->clone().get() : nullptr,
                                                               output->clone().get(),
-                                                              policy, border_mode == BorderMode::UNDEFINED, sampling_policy, border_size)
+                                                              policy, border_mode == BorderMode::UNDEFINED, sampling_policy, border_size, use_padding)
                                 .first);
 
     return Status{};
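
The scale kernel now receives the border colour and a `use_padding` flag explicitly, so the NHWC bilinear path can substitute the constant border value for out-of-range taps instead of relying on padded input rows. For reference, a minimal sketch of bilinear sampling with a constant-border fallback, assuming a single-channel float image; all names here are illustrative, not the kernel's internal API:

```cpp
#include <cmath>

// Bilinear sample at fractional position (x, y); taps that fall outside the
// image read the constant border value rather than padded memory.
float sample_bilinear(const float *in, int width, int height,
                      float x, float y, float border_value)
{
    const int   x0 = static_cast<int>(std::floor(x));
    const int   y0 = static_cast<int>(std::floor(y));
    const float dx = x - x0; // horizontal interpolation weight
    const float dy = y - y0; // vertical interpolation weight

    auto at = [&](int xi, int yi) -> float
    {
        if(xi < 0 || xi >= width || yi < 0 || yi >= height)
        {
            return border_value; // constant-border fallback
        }
        return in[yi * width + xi];
    };

    return at(x0, y0) * (1.f - dx) * (1.f - dy)
           + at(x0 + 1, y0) * dx * (1.f - dy)
           + at(x0, y0 + 1) * (1.f - dx) * dy
           + at(x0 + 1, y0 + 1) * dx * dy;
}
```
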
diff --git a/src/core/NEON/kernels/NEScharr3x3Kernel.cpp b/src/core/NEON/kernels/NEScharr3x3Kernel.cpp
index f23c31b..3add699 100644
--- a/src/core/NEON/kernels/NEScharr3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEScharr3x3Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -161,7 +161,7 @@
 
     if(_run_scharr_x && _run_scharr_y)
     {
-        execute_window_loop(window, [&](const Coordinates & id)
+        execute_window_loop(window, [&](const Coordinates &)
         {
 
             const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
@@ -197,7 +197,7 @@
     }
     else if(_run_scharr_x)
     {
-        execute_window_loop(window, [&](const Coordinates & id)
+        execute_window_loop(window, [&](const Coordinates &)
         {
 
             const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
@@ -232,7 +232,7 @@
     }
     else if(_run_scharr_y)
     {
-        execute_window_loop(window, [&](const Coordinates & id)
+        execute_window_loop(window, [&](const Coordinates &)
         {
 
             const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
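
The recurring `[&](const Coordinates & id)` to `[&](const Coordinates &)` change in this and the following Sobel, Select, TableLookup, Threshold and Upsample kernels drops the name of a parameter the lambda body never reads; the unnamed parameter keeps the signature `execute_window_loop` expects while avoiding unused-parameter warnings. A minimal illustration, with plain `int` standing in for `Coordinates`:

```cpp
#include <functional>

// Both lambdas satisfy the same callable signature; leaving the parameter
// unnamed documents that it is intentionally unused and stays quiet under
// -Wall -Wextra.
void run(const std::function<void(int)> &fn) { fn(42); }

int main()
{
    run([](int)    { /* coordinates not needed here */ });
    run([](int id) { (void)id; /* named only when actually used */ });
}
```
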
diff --git a/src/core/NEON/kernels/NESelectKernel.cpp b/src/core/NEON/kernels/NESelectKernel.cpp
index f2697bc..c03e5f0 100644
--- a/src/core/NEON/kernels/NESelectKernel.cpp
+++ b/src/core/NEON/kernels/NESelectKernel.cpp
@@ -54,7 +54,7 @@
     Iterator input2(in2, win);
     Iterator output(out, win);
 
-    execute_window_loop(win, [&](const Coordinates & id)
+    execute_window_loop(win, [&](const Coordinates &)
     {
         auto       output_ptr    = reinterpret_cast<ScalarType *>(output.ptr());
         const auto condition_ptr = reinterpret_cast<const uint8_t *>(condition.ptr());
diff --git a/src/core/NEON/kernels/NESobel3x3Kernel.cpp b/src/core/NEON/kernels/NESobel3x3Kernel.cpp
index 5a80630..7a27203 100644
--- a/src/core/NEON/kernels/NESobel3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NESobel3x3Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,7 +42,7 @@
 
 BorderSize NESobel3x3Kernel::border_size() const
 {
-    return BorderSize(1);
+    return BorderSize{ 1 };
 }
 
 void NESobel3x3Kernel::configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined)
@@ -117,7 +117,7 @@
 
     if(_run_sobel_y && _run_sobel_x)
     {
-        execute_window_loop(window, [&](const Coordinates & id)
+        execute_window_loop(window, [&](const Coordinates &)
         {
             const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
             const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
@@ -181,7 +181,7 @@
     }
     else if(_run_sobel_x)
     {
-        execute_window_loop(window, [&](const Coordinates & id)
+        execute_window_loop(window, [&](const Coordinates &)
         {
             const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
             const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
@@ -229,7 +229,7 @@
     }
     else if(_run_sobel_y)
     {
-        execute_window_loop(window, [&](const Coordinates & id)
+        execute_window_loop(window, [&](const Coordinates &)
         {
             const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
             const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
diff --git a/src/core/NEON/kernels/NESobel5x5Kernel.cpp b/src/core/NEON/kernels/NESobel5x5Kernel.cpp
index 30e7817..a92cfc2 100644
--- a/src/core/NEON/kernels/NESobel5x5Kernel.cpp
+++ b/src/core/NEON/kernels/NESobel5x5Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -120,7 +120,7 @@
         static const int16x8_t two      = vdupq_n_s16(2);
         static const int16x8_t minustwo = vdupq_n_s16(-2);
 
-        execute_window_loop(window, [&](const Coordinates & id)
+        execute_window_loop(window, [&](const Coordinates &)
         {
             const uint8x16_t data = vld1q_u8(input.ptr());
 
@@ -154,7 +154,7 @@
         static const int16x8_t two      = vdupq_n_s16(2);
         static const int16x8_t minustwo = vdupq_n_s16(-2);
 
-        execute_window_loop(window, [&](const Coordinates & id)
+        execute_window_loop(window, [&](const Coordinates &)
         {
             const uint8x16_t data = vld1q_u8(input.ptr());
 
@@ -180,7 +180,7 @@
         static const int16x8_t six  = vdupq_n_s16(6);
         static const int16x8_t four = vdupq_n_s16(4);
 
-        execute_window_loop(window, [&](const Coordinates & id)
+        execute_window_loop(window, [&](const Coordinates &)
         {
             const uint8x16_t data = vld1q_u8(input.ptr());
 
@@ -211,7 +211,7 @@
 
 BorderSize NESobel5x5VertKernel::border_size() const
 {
-    return BorderSize(2, 0);
+    return BorderSize{ 2, 0 };
 }
 
 void NESobel5x5VertKernel::configure(ITensor *input_x, ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined)
@@ -312,7 +312,7 @@
 
     if(_run_sobel_x)
     {
-        execute_window_loop(window, [&](const Coordinates & id)
+        execute_window_loop(window, [&](const Coordinates &)
         {
             // Convert offset from uint8_t* to uint16_t*
             const size_t input_offset_high_s16 = input_x.offset() / 2;
@@ -361,7 +361,7 @@
 
     if(_run_sobel_y)
     {
-        execute_window_loop(window, [&](const Coordinates & id)
+        execute_window_loop(window, [&](const Coordinates &)
         {
             // Convert offset from uint8_t* to uint16_t*
             const size_t input_offset_high_s16 = input_y.offset() / 2;
diff --git a/src/core/NEON/kernels/NESobel7x7Kernel.cpp b/src/core/NEON/kernels/NESobel7x7Kernel.cpp
index 40a3e31..f2b42cc 100644
--- a/src/core/NEON/kernels/NESobel7x7Kernel.cpp
+++ b/src/core/NEON/kernels/NESobel7x7Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -215,7 +215,7 @@
 
     if(_run_sobel_y && _run_sobel_x)
     {
-        execute_window_loop(window, [&](const Coordinates & id)
+        execute_window_loop(window, [&](const Coordinates &)
         {
             const uint8x16_t data = vld1q_u8(input.ptr() - 3);
 
@@ -244,7 +244,7 @@
     }
     else if(_run_sobel_x)
     {
-        execute_window_loop(window, [&](const Coordinates & id)
+        execute_window_loop(window, [&](const Coordinates &)
         {
             const uint8x16_t data = vld1q_u8(input.ptr() - 3);
 
@@ -269,7 +269,7 @@
     }
     else if(_run_sobel_y)
     {
-        execute_window_loop(window, [&](const Coordinates & id)
+        execute_window_loop(window, [&](const Coordinates &)
         {
             const uint8x16_t data = vld1q_u8(input.ptr() - 3);
 
@@ -301,7 +301,7 @@
 
 BorderSize NESobel7x7VertKernel::border_size() const
 {
-    return BorderSize(3, 0);
+    return BorderSize{ 3, 0 };
 }
 
 void NESobel7x7VertKernel::configure(const ITensor *input_x, const ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined)
@@ -382,7 +382,7 @@
 
     if(_run_sobel_x)
     {
-        execute_window_loop(window, [&](const Coordinates & id)
+        execute_window_loop(window, [&](const Coordinates &)
         {
             auto in_ptr = reinterpret_cast<int32_t *>(input_x.ptr()) - 3 * in_x_stride;
 
@@ -453,7 +453,7 @@
 
     if(_run_sobel_y)
     {
-        execute_window_loop(window, [&](const Coordinates & id)
+        execute_window_loop(window, [&](const Coordinates &)
         {
             auto in_ptr = reinterpret_cast<int32_t *>(input_y.ptr()) - 3 * in_y_stride;
 
diff --git a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp
new file mode 100644
index 0000000..2e46b14
--- /dev/null
+++ b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NESpaceToBatchLayerKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include <arm_neon.h>
+#include <cstdint>
+
+using namespace arm_compute::misc::shape_calculator;
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_info, const ITensorInfo *paddings, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_info, paddings, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_info, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+    ARM_COMPUTE_RETURN_ERROR_ON(block_info->num_dimensions() > 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(paddings->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(paddings->tensor_shape()[1] != block_info->tensor_shape()[0]);
+
+    // Validate output if initialized
+    if(output->total_size() != 0)
+    {
+        const DataLayout data_layout = input->data_layout();
+        const int        idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+        ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] != output->tensor_shape()[idx_channel]);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+                                 const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(block_shape_x < 1 || block_shape_y < 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+
+    // Validate output if initialized
+    if(output->total_size() != 0)
+    {
+        const DataLayout data_layout = input->data_layout();
+        const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+        const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+        const int        idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+        const int        idx_batch   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+        ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] < padding_left.x() + padding_right.x());
+        ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_width] + padding_left.x() + padding_right.x()) % block_shape_x != 0);
+        ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] + padding_left.y() + padding_right.y()) % block_shape_y != 0);
+        ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] != output->tensor_shape()[idx_channel]);
+        ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_batch] % (block_shape_x * block_shape_y) != 0);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+} // namespace
+
+NESpaceToBatchLayerKernel::NESpaceToBatchLayerKernel()
+    : _input(nullptr), _block_shape(nullptr), _paddings(nullptr), _output(nullptr), _padding_left(), _block_shape_x(), _block_shape_y()
+{
+}
+
+void NESpaceToBatchLayerKernel::configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info()));
+
+    _input       = input;
+    _block_shape = block_shape;
+    _paddings    = paddings;
+    _output      = output;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*output->info(), Steps());
+    ICPPKernel::configure(win);
+}
+
+void NESpaceToBatchLayerKernel::configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+                                          ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape(input->info(), block_shape_x, block_shape_y, padding_left, padding_right);
+    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left, padding_right, output->info()));
+
+    _input         = input;
+    _output        = output;
+    _block_shape_x = block_shape_x;
+    _block_shape_y = block_shape_y;
+    _padding_left  = padding_left;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*output->info(), Steps());
+    INEKernel::configure(win);
+}
+
+Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, paddings, output));
+    return Status{};
+}
+Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+                                           const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+    return Status{};
+}
+
+void NESpaceToBatchLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
+
+    if(_block_shape != nullptr)
+    {
+        // Retrieve the block shapes dynamically
+        _block_shape_x = *(reinterpret_cast<const int *>(_block_shape->ptr_to_element(0)));
+        _block_shape_y = *(reinterpret_cast<const int *>(_block_shape->ptr_to_element(1)));
+    }
+
+    if(_paddings != nullptr)
+    {
+        const size_t pad_left_x = *reinterpret_cast<const size_t *>(_paddings->ptr_to_element({ 0, 0 }));
+        const size_t pad_left_y = *reinterpret_cast<const size_t *>(_paddings->ptr_to_element({ 1, 0 }));
+        _padding_left           = Size2D(pad_left_x, pad_left_y);
+    }
+    const DataLayout data_layout  = _input->info()->data_layout();
+    const int        height_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const int        width_idx    = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int        element_size = _input->info()->element_size();
+
+    const size_t height     = _input->info()->dimension(height_idx);
+    const size_t width      = _input->info()->dimension(width_idx);
+    const size_t batch_size = _input->info()->dimension(3);
+
+    Window slice_out = window.first_slice_window_3D();
+    Window slice_in  = window.first_slice_window_4D();
+
+    slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+    slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+    slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+    slice_in.set(3, Window::Dimension(0, 0, 0));
+
+    int batch_id = 0;
+
+    // Main loop for NCHW and NHWC
+    if(_output->info()->data_layout() == DataLayout::NCHW)
+    {
+        do
+        {
+            Iterator out(_output, slice_out);
+            execute_window_loop(slice_out, [&](const Coordinates & id)
+            {
+                const size_t out_x = id.x();
+                const size_t out_y = id.y();
+                const size_t z     = id.z();
+                const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x;
+                const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x;
+                if(pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height && pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width)
+                {
+                    const int   w    = batch_id % batch_size;
+                    const int   in_x = pos_x - _padding_left.x();
+                    const int   in_y = pos_y - _padding_left.y();
+                    Coordinates input_coords{ in_x, in_y, z, w };
+                    memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
+                }
+            },
+            out);
+            ++batch_id;
+        }
+        while(window.slide_window_slice_3D(slice_out));
+    }
+    else
+    {
+        do
+        {
+            Iterator out(_output, slice_out);
+            execute_window_loop(slice_out, [&](const Coordinates & id)
+            {
+                const size_t out_x = id.y();
+                const size_t out_y = id.z();
+                const size_t z     = id.x();
+                const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x;
+                const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x;
+                if(pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height && pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width)
+                {
+                    const int   w    = batch_id % batch_size;
+                    const int   in_x = pos_x - _padding_left.x();
+                    const int   in_y = pos_y - _padding_left.y();
+                    Coordinates input_coords{ z, in_x, in_y, w };
+                    memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
+                }
+            },
+            out);
+            ++batch_id;
+        }
+        while(window.slide_window_slice_3D(slice_out));
+    }
+}
+} // namespace arm_compute
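
The run() method above inverts the space-to-batch mapping: each output element locates its source by decomposing the output batch index into an input batch and a block cell, then subtracting the left/top padding; positions that fall inside the padding are simply skipped (left zero). A standalone sketch of that index arithmetic for the NCHW case, with illustrative names rather than the kernel's API:

```cpp
#include <cstddef>

// Source location for one output element, or invalid if it lies in padding.
struct InputCoord
{
    bool        valid;      // false when the position falls in the zero-padding
    std::size_t in_x, in_y; // input spatial position
    std::size_t in_batch;   // source batch
};

InputCoord space_to_batch_source(std::size_t out_x, std::size_t out_y, std::size_t out_batch,
                                 std::size_t in_batch_size, std::size_t block_x, std::size_t block_y,
                                 std::size_t pad_left_x, std::size_t pad_left_y,
                                 std::size_t in_width, std::size_t in_height)
{
    // Which cell of the block this output batch corresponds to.
    const std::size_t block_id = out_batch / in_batch_size;
    const std::size_t pos_x    = out_x * block_x + block_id % block_x;
    const std::size_t pos_y    = out_y * block_y + block_id / block_x;

    InputCoord c{};
    c.valid = pos_x >= pad_left_x && pos_x < pad_left_x + in_width
              && pos_y >= pad_left_y && pos_y < pad_left_y + in_height;
    if(c.valid)
    {
        c.in_x     = pos_x - pad_left_x;
        c.in_y     = pos_y - pad_left_y;
        c.in_batch = out_batch % in_batch_size;
    }
    return c;
}
```
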
diff --git a/src/core/NEON/kernels/NEStackLayerKernel.cpp b/src/core/NEON/kernels/NEStackLayerKernel.cpp
index 0c33f36..3447d59 100644
--- a/src/core/NEON/kernels/NEStackLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEStackLayerKernel.cpp
@@ -87,7 +87,7 @@
 } // namespace
 
 NEStackLayerKernel::NEStackLayerKernel()
-    : _input(nullptr), _output(nullptr), _axis(), _idx_input(), _func(nullptr)
+    : _input(nullptr), _output(nullptr), _axis(), _idx_input()
 {
 }
 
@@ -101,22 +101,6 @@
     _axis      = axis;
     _idx_input = idx_input;
 
-    switch(input->info()->element_size())
-    {
-        case 1:
-            _func = &NEStackLayerKernel::run_stack<uint8_t>;
-            break;
-        case 2:
-            _func = &NEStackLayerKernel::run_stack<uint16_t>;
-            break;
-        case 4:
-            _func = &NEStackLayerKernel::run_stack<uint32_t>;
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Element size not supported");
-            break;
-    }
-
     // Configure kernel window
     auto win_config = validate_and_configure_window(input->info(), axis, num_tensors, output->info());
 
@@ -137,15 +121,6 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    if(_func != nullptr)
-    {
-        (this->*_func)(window);
-    }
-}
-
-template <typename T>
-void NEStackLayerKernel::run_stack(const Window &window)
-{
     Window window_out;
     window_out.use_tensor_dimensions(_output->info()->tensor_shape());
 
@@ -160,9 +135,9 @@
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
-        Coordinates id_out                           = shift_from_axis_and_replace_coordinate(id, _axis, _idx_input);
-        const int   idx                              = id_out[0] * stride_x + id_out[1] * stride_y + id_out[2] * stride_z + id_out[3] * stride_w + id_out[4] * stride_k;
-        *(reinterpret_cast<T *>(output.ptr() + idx)) = *(reinterpret_cast<const T *>(input.ptr()));
+        Coordinates id_out = shift_from_axis_and_replace_coordinate(id, _axis, _idx_input);
+        const int   idx    = id_out[0] * stride_x + id_out[1] * stride_y + id_out[2] * stride_z + id_out[3] * stride_w + id_out[4] * stride_k;
+        std::memcpy(output.ptr() + idx, input.ptr(), _input->info()->element_size());
     },
     input);
 }
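
Replacing the templated `run_stack<T>` dispatch with a single `std::memcpy` works because stacking only relocates bytes: a copy of `element_size()` bytes is equivalent to a typed assignment for any trivially copyable element, so the element-size switch and the `_func` pointer become unnecessary. A minimal sketch of the equivalence:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

// Per-type assignment: needs one instantiation per element size.
template <typename T>
void copy_typed(uint8_t *dst, const uint8_t *src)
{
    *reinterpret_cast<T *>(dst) = *reinterpret_cast<const T *>(src);
}

// Byte copy: one function covers 1-, 2- and 4-byte elements alike.
void copy_bytes(uint8_t *dst, const uint8_t *src, std::size_t element_size)
{
    std::memcpy(dst, src, element_size);
}
```
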
diff --git a/src/core/NEON/kernels/NETableLookupKernel.cpp b/src/core/NEON/kernels/NETableLookupKernel.cpp
index 958f4a9..536c220 100644
--- a/src/core/NEON/kernels/NETableLookupKernel.cpp
+++ b/src/core/NEON/kernels/NETableLookupKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -61,7 +61,7 @@
     Iterator input  = Iterator(_input, window);
     Iterator output = Iterator(_output, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         auto input_ptr  = reinterpret_cast<const T *>(input.ptr());
         auto output_ptr = reinterpret_cast<T *>(output.ptr());
@@ -92,7 +92,7 @@
     Iterator input  = Iterator(_input, window);
     Iterator output = Iterator(_output, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const uint8_t *input_ptr  = input.ptr();
         uint8_t       *output_ptr = output.ptr();
diff --git a/src/core/NEON/kernels/NEThresholdKernel.cpp b/src/core/NEON/kernels/NEThresholdKernel.cpp
index 5ef0693..ae9c62b 100644
--- a/src/core/NEON/kernels/NEThresholdKernel.cpp
+++ b/src/core/NEON/kernels/NEThresholdKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -86,7 +86,7 @@
     Iterator input(_input, window);
     Iterator output(_output, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const uint8x16_t data = vld1q_u8(input.ptr());
         const uint8x16_t mask = vcgtq_u8(data, threshold);
@@ -106,7 +106,7 @@
     Iterator input(_input, window);
     Iterator output(_output, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates &)
     {
         const uint8x16_t data = vld1q_u8(input.ptr());
 
diff --git a/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp b/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp
index aae85c6..d3d88b3 100644
--- a/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp
@@ -130,7 +130,7 @@
     Iterator  output(_output, window_out);
     const int offset_y_out = _output->info()->strides_in_bytes().y() / sizeof(float);
 
-    execute_window_loop(window_out, [&](const Coordinates & id)
+    execute_window_loop(window_out, [&](const Coordinates &)
     {
         const float32x4_t data      = vld1q_f32(reinterpret_cast<const float *>(input.ptr()));
         const float32x4_t data_out1 = { vgetq_lane_f32(data, 0), vgetq_lane_f32(data, 0), vgetq_lane_f32(data, 1), vgetq_lane_f32(data, 1) };
@@ -157,7 +157,7 @@
     const int offset_y_out = _output->info()->strides_in_bytes().y() / sizeof(float);
     const int offset_z_out = _output->info()->strides_in_bytes().z() / sizeof(float);
 
-    execute_window_loop(window_out, [&](const Coordinates & id)
+    execute_window_loop(window_out, [&](const Coordinates &)
     {
         const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr()));
         auto              out  = reinterpret_cast<float *>(output.ptr());
@@ -182,7 +182,7 @@
     Iterator  output(_output, window_out);
     const int offset_y_out = _output->info()->strides_in_bytes().y() / sizeof(uint8_t);
 
-    execute_window_loop(window_out, [&](const Coordinates & id)
+    execute_window_loop(window_out, [&](const Coordinates &)
     {
         const uint8x16_t data      = vld1q_u8(reinterpret_cast<const uint8_t *>(input.ptr()));
         const uint8x16_t data_out1 = { vgetq_lane_u8(data, 0), vgetq_lane_u8(data, 0), vgetq_lane_u8(data, 1), vgetq_lane_u8(data, 1),
@@ -218,7 +218,7 @@
 
     const int offset_y_out = _output->info()->strides_in_bytes().y() / sizeof(uint8_t);
     const int offset_z_out = _output->info()->strides_in_bytes().z() / sizeof(uint8_t);
-    execute_window_loop(window_out, [&](const Coordinates & id)
+    execute_window_loop(window_out, [&](const Coordinates &)
     {
         const uint8x16_t data = vld1q_u8(reinterpret_cast<const uint8_t *>(input.ptr()));
         auto             out  = reinterpret_cast<uint8_t *>(output.ptr());
@@ -245,7 +245,7 @@
     Iterator  output(_output, window_out);
     const int offset_y_out = _output->info()->strides_in_bytes().y() / sizeof(float16_t);
 
-    execute_window_loop(window_out, [&](const Coordinates & id)
+    execute_window_loop(window_out, [&](const Coordinates &)
     {
         const float16x8_t data      = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr()));
         const float16x8_t data_out1 = { vgetq_lane_f16(data, 0), vgetq_lane_f16(data, 0), vgetq_lane_f16(data, 1), vgetq_lane_f16(data, 1),
@@ -278,7 +278,7 @@
     const int offset_y_out = _output->info()->strides_in_bytes().y() / sizeof(float16_t);
     const int offset_z_out = _output->info()->strides_in_bytes().z() / sizeof(float16_t);
 
-    execute_window_loop(window_out, [&](const Coordinates & id)
+    execute_window_loop(window_out, [&](const Coordinates &)
     {
         const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr()));
         auto              out  = reinterpret_cast<float16_t *>(output.ptr());
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
index 4a0cf27..624833a 100644
--- a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
@@ -34,59 +34,6 @@
 
 namespace
 {
-template <typename T>
-void weights_reshape(const ITensor *input, const ITensor *bias, ITensor *output, const Window &window)
-{
-    const unsigned int kernel_size_x   = input->info()->dimension(0);
-    const unsigned int kernel_size_y   = input->info()->dimension(1);
-    const unsigned int kernel_depth    = input->info()->dimension(2);
-    const unsigned int input_stride_x  = input->info()->strides_in_bytes().x();
-    const unsigned int input_stride_y  = input->info()->strides_in_bytes().y();
-    const unsigned int input_stride_z  = input->info()->strides_in_bytes().z();
-    const unsigned int output_stride_y = output->info()->strides_in_bytes().y();
-
-    // Create iterators
-    Iterator in(input, window);
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        // Get column index
-        const int kernel_idx = id[3];
-        const int kernel_idz = id[4];
-
-        // Setup pointers
-        const uint8_t *tmp_input_ptr        = in.ptr();
-        uint8_t       *tmp_output_ptr       = output->ptr_to_element(Coordinates(kernel_idx, 0, kernel_idz));
-        const uint8_t *curr_input_row_ptr   = tmp_input_ptr;
-        const uint8_t *curr_input_depth_ptr = tmp_input_ptr;
-
-        // Linearize volume
-        for(unsigned int d = 0; d < kernel_depth; ++d)
-        {
-            for(unsigned int j = 0; j < kernel_size_y; ++j)
-            {
-                for(unsigned int i = 0; i < kernel_size_x; ++i)
-                {
-                    *(reinterpret_cast<T *>(tmp_output_ptr)) = *(reinterpret_cast<const T *>(tmp_input_ptr));
-                    tmp_input_ptr += input_stride_x;
-                    tmp_output_ptr += output_stride_y;
-                }
-                curr_input_row_ptr += input_stride_y;
-                tmp_input_ptr = curr_input_row_ptr;
-            }
-            curr_input_depth_ptr += input_stride_z;
-            curr_input_row_ptr = curr_input_depth_ptr;
-            tmp_input_ptr      = curr_input_depth_ptr;
-        }
-
-        // Add bias
-        if(bias != nullptr)
-        {
-            *(reinterpret_cast<T *>(tmp_output_ptr)) = *(reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(kernel_idx, kernel_idz))));
-        }
-    },
-    in);
-}
-
 TensorShape get_output_shape(const ITensorInfo *input, bool has_bias)
 {
     TensorShape output_shape{ input->tensor_shape() };
@@ -141,7 +88,7 @@
 } // namespace
 
 NEWeightsReshapeKernel::NEWeightsReshapeKernel()
-    : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr)
+    : _input(nullptr), _bias(nullptr), _output(nullptr)
 {
 }
 
@@ -161,30 +108,6 @@
     _bias   = bias;
     _output = output;
 
-    switch(_input->info()->element_size())
-    {
-        case 4:
-        {
-            _func = &weights_reshape<uint32_t>;
-            break;
-        }
-        case 2:
-        {
-            _func = &weights_reshape<uint16_t>;
-            break;
-        }
-        case 1:
-        {
-            _func = &weights_reshape<uint8_t>;
-            break;
-        }
-        default:
-        {
-            ARM_COMPUTE_ERROR_ON("Element size not supported");
-            break;
-        }
-    }
-
     // Configure kernel
     auto win_config = validate_and_configure_window(input->info(), output->info());
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
@@ -205,5 +128,52 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    (*_func)(_input, _bias, _output, window);
+    const unsigned int kernel_size_x   = _input->info()->dimension(0);
+    const unsigned int kernel_size_y   = _input->info()->dimension(1);
+    const unsigned int kernel_depth    = _input->info()->dimension(2);
+    const unsigned int input_stride_x  = _input->info()->strides_in_bytes().x();
+    const unsigned int input_stride_y  = _input->info()->strides_in_bytes().y();
+    const unsigned int input_stride_z  = _input->info()->strides_in_bytes().z();
+    const unsigned int output_stride_y = _output->info()->strides_in_bytes().y();
+
+    // Create iterators
+    Iterator in(_input, window);
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        // Get column index
+        const int kernel_idx = id[3];
+        const int kernel_idz = id[4];
+
+        // Setup pointers
+        const uint8_t *tmp_input_ptr        = in.ptr();
+        uint8_t       *tmp_output_ptr       = _output->ptr_to_element(Coordinates(kernel_idx, 0, kernel_idz));
+        const uint8_t *curr_input_row_ptr   = tmp_input_ptr;
+        const uint8_t *curr_input_depth_ptr = tmp_input_ptr;
+
+        // Linearize volume
+        for(unsigned int d = 0; d < kernel_depth; ++d)
+        {
+            for(unsigned int j = 0; j < kernel_size_y; ++j)
+            {
+                for(unsigned int i = 0; i < kernel_size_x; ++i)
+                {
+                    std::memcpy(tmp_output_ptr, tmp_input_ptr, _input->info()->element_size());
+                    tmp_input_ptr += input_stride_x;
+                    tmp_output_ptr += output_stride_y;
+                }
+                curr_input_row_ptr += input_stride_y;
+                tmp_input_ptr = curr_input_row_ptr;
+            }
+            curr_input_depth_ptr += input_stride_z;
+            curr_input_row_ptr = curr_input_depth_ptr;
+            tmp_input_ptr      = curr_input_depth_ptr;
+        }
+
+        // Add bias
+        if(_bias != nullptr)
+        {
+            std::memcpy(tmp_output_ptr, _bias->ptr_to_element(Coordinates(kernel_idx, kernel_idz)), _input->info()->element_size());
+        }
+    },
+    in);
 }
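
The reshape moved into run() linearizes each kernel's width x height x depth volume into one column of a 2-D matrix (column index = output channel), optionally appending the bias as a final row, which is the layout the subsequent GEMM expects. A simplified sketch with plain floats and a `[num_kernels][depth][h][w]` input layout; names are illustrative, not the kernel's API:

```cpp
#include <cstddef>
#include <vector>

// Reshape weights so that kernel k occupies column k; the bias, when present,
// becomes the last row of the matrix.
std::vector<float> reshape_weights(const std::vector<float> &weights, // w*h*depth*num_kernels values
                                   const std::vector<float> *bias,    // num_kernels values or nullptr
                                   std::size_t kernel_w, std::size_t kernel_h,
                                   std::size_t depth, std::size_t num_kernels)
{
    const std::size_t rows = kernel_w * kernel_h * depth + (bias != nullptr ? 1 : 0);
    std::vector<float> out(rows * num_kernels);

    for(std::size_t k = 0; k < num_kernels; ++k)
    {
        std::size_t row = 0;
        // Linearize the volume in x, then y, then depth order.
        for(std::size_t d = 0; d < depth; ++d)
        {
            for(std::size_t y = 0; y < kernel_h; ++y)
            {
                for(std::size_t x = 0; x < kernel_w; ++x)
                {
                    const std::size_t in_idx = ((k * depth + d) * kernel_h + y) * kernel_w + x;
                    out[row++ * num_kernels + k] = weights[in_idx];
                }
            }
        }
        if(bias != nullptr)
        {
            out[row * num_kernels + k] = (*bias)[k]; // trailing bias row
        }
    }
    return out;
}
```
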
diff --git a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp
index 3e76a08..263ded0 100644
--- a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -238,8 +238,7 @@
 
 template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
 NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::NEWinogradLayerTransformWeightsKernel()
-    : _weights_hwio(nullptr), _output(nullptr), _matrix_stride(0), _num_output_channels(0), _num_input_channels(0)
-
+    : _transform(nullptr), _weights_hwio(nullptr), _output(nullptr), _matrix_stride(0), _num_output_channels(0), _num_input_channels(0)
 {
 }
 
@@ -263,11 +262,10 @@
     _matrix_stride       = matrix_stride;
     _num_output_channels = num_output_channels;
     _num_input_channels  = num_input_channels;
+    _transform           = arm_compute::support::cpp14::make_unique<WeightsTransform>(num_output_channels, num_input_channels);
 
-    const int        matrix_row_stride = roundup(num_output_channels, WinogradConv::N_BLOCK);
-    WeightsTransform transform(nullptr, nullptr, matrix_stride, matrix_row_stride, num_output_channels, num_input_channels);
-    Window           win;
-    auto             win_last = transform.get_window();
+    Window win;
+    auto   win_last = _transform->get_window();
     win.set(Window::DimX, Window::Dimension(0, win_last, 1));
     INEKernel::configure(win);
 }
@@ -278,12 +276,14 @@
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    const size_t fst = window.x().start();
+    const size_t lst = window.x().end();
+    _transform->set_weight_tensor(_weights_hwio->buffer());
+    const int matrix_row_stride = roundup(_num_output_channels, WinogradConv::N_BLOCK);
+    _transform->set_output_matrices(_output->buffer(), _matrix_stride, matrix_row_stride);
+    _transform->set_working_space(_output->buffer());
 
-    const int        matrix_row_stride = roundup(_num_output_channels, WinogradConv::N_BLOCK);
-    WeightsTransform transform(reinterpret_cast<T *>(_weights_hwio->buffer()), reinterpret_cast<T *>(_output->buffer()), _matrix_stride, matrix_row_stride, _num_output_channels, _num_input_channels);
-    const size_t     fst = window.x().start();
-    const size_t     lst = window.x().end();
-    transform.run(fst, lst);
+    _transform->run(fst, lst);
 }
 
 template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
@@ -331,6 +331,12 @@
 }
 
 template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+unsigned int NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_working_space_size(unsigned int num_threads) const
+{
+    return _transform->get_working_space_size(num_threads) / sizeof(T);
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
 int NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_matrix_stride(
     const KernelShape &kernel_shape, const Tensor4DShape &input_shape, const PaddingType padding_type) const
 {
@@ -339,7 +345,8 @@
 
 template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
 NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::NEWinogradLayerTransformInputKernel()
-    : _input_nhwc(), _num_batches(0), _num_rows(0), _num_cols(0), _num_channels(0), _padding(), _output(nullptr), _matrix_stride(0)
+    : _transform(nullptr), _input_nhwc(nullptr), _num_batches(0), _num_rows(0), _num_cols(0), _num_channels(0), _padding(), _output(nullptr), _matrix_stride(0), _padding_top(), _padding_left(),
+      _padding_right(), _padding_bottom(), _workspace(nullptr)
 {
 }
 
@@ -352,7 +359,8 @@
     const int         num_channels,  /* Number of channels in input tensor. */
     const PaddingType padding,       /* Padding type. */
     ITensor          *output,        /* Base of output matrices. */
-    const int         matrix_stride) /* Stride between output matrices. */
+    const int         matrix_stride, /* Stride between output matrices. */
+    ITensor          *workspace)
 {
     _input_nhwc    = input_nhwc;
     _num_batches   = num_batches;
@@ -362,9 +370,28 @@
     _padding       = padding;
     _output        = output;
     _matrix_stride = matrix_stride;
-    InputTransform transform(nullptr, num_batches, num_rows, num_cols, num_channels, padding, nullptr, matrix_stride, num_channels);
-    Window         win;
-    auto           win_last = transform.get_window();
+    _workspace     = workspace;
+
+    _padding_top    = (padding == PADDING_SAME) ? (KernelRows - 1) / 2 : 0;
+    _padding_left   = (padding == PADDING_SAME) ? (KernelCols - 1) / 2 : 0;
+    _padding_bottom = (padding == PADDING_SAME) ? iceildiv(KernelRows - 1, 2) : 0;
+    _padding_right  = (padding == PADDING_SAME) ? iceildiv(KernelCols - 1, 2) : 0;
+
+    _transform = arm_compute::support::cpp14::make_unique<InputTransform>(
+                     KernelRows,
+                     KernelCols,
+                     num_batches,
+                     num_rows,
+                     num_cols,
+                     num_channels,
+                     _padding_top,    /**< Padding to apply to the top of the image. */
+                     _padding_left,   /**< Padding to apply to the left of the image. */
+                     _padding_bottom, /**< Padding to apply to the bottom of the image. */
+                     _padding_right   /**< Padding to apply to the right of the image. */
+                 );
+
+    Window win;
+    auto   win_last = _transform->get_window();
     win.set(Window::DimX, Window::Dimension(0, win_last, 1));
     INEKernel::configure(win);
 }
@@ -374,22 +401,25 @@
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(_workspace);
 
-    const int      element_size_in_bytes = _input_nhwc->info()->element_size();
-    const int      input_col_stride      = _input_nhwc->info()->strides_in_bytes().y() / element_size_in_bytes;
-    const int      input_row_stride      = _input_nhwc->info()->strides_in_bytes().z() / element_size_in_bytes;
-    const int      input_batch_stride    = _input_nhwc->info()->strides_in_bytes()[3] / element_size_in_bytes;
-    const auto     input_nhwc_ptr        = reinterpret_cast<const T *>(_input_nhwc->buffer() + _input_nhwc->info()->offset_first_element_in_bytes());
-    auto           output_ptr            = reinterpret_cast<T *>(_output->buffer() + _output->info()->offset_first_element_in_bytes());
-    InputTransform input_transform(input_nhwc_ptr,
-                                   _num_batches, _num_rows, _num_cols, _num_channels, _padding,
-                                   output_ptr,
-                                   _matrix_stride, _num_channels, input_batch_stride, input_row_stride, input_col_stride);
+    const int  element_size_in_bytes = _input_nhwc->info()->element_size();
+    const int  input_col_stride      = _input_nhwc->info()->strides_in_bytes().y() / element_size_in_bytes;
+    const int  input_row_stride      = _input_nhwc->info()->strides_in_bytes().z() / element_size_in_bytes;
+    const int  input_batch_stride    = _input_nhwc->info()->strides_in_bytes()[3] / element_size_in_bytes;
+    const auto input_nhwc_ptr        = reinterpret_cast<const T *>(_input_nhwc->buffer() + _input_nhwc->info()->offset_first_element_in_bytes());
+    auto       output_ptr            = reinterpret_cast<T *>(_output->buffer() + _output->info()->offset_first_element_in_bytes());
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output_ptr);
+
+    _transform->set_input_tensor(input_nhwc_ptr, input_batch_stride, input_row_stride, input_col_stride);
+    _transform->set_output_matrices(output_ptr, _matrix_stride, _num_channels);
+
+    _transform->set_working_space(_workspace->buffer());
 
     // The code below cannot be moved to configure because the tensor buffers haven't been allocated at that point
     const size_t fst = window.x().start();
     const size_t lst = window.x().end();
-    input_transform.run(fst, lst);
+    _transform->run(fst, lst, info.thread_id);
 }
 
 template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
@@ -435,11 +465,18 @@
 
 template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
 NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::NEWinogradLayerTransformOutputKernel()
-    : _biases(nullptr), _output_workspace(nullptr), _matrix_stride(0), _matrix_row_stride(0), _output_nhwc(nullptr), _num_batches(0), _num_rows(0), _num_cols(0), _num_channels(0)
+    : _transform(nullptr), _biases(nullptr), _transformed_output(nullptr), _workspace(nullptr), _matrix_stride(0), _matrix_row_stride(0), _output_nhwc(nullptr), _num_batches(0), _num_rows(0),
+      _num_cols(0), _num_channels(0)
 {
 }
 
 template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+unsigned int NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_working_space_size(unsigned int num_threads) const
+{
+    return _transform->get_working_space_size(num_threads) / sizeof(T);
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
 int NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_matrix_stride(
     const KernelShape &kernel_shape, const Tensor4DShape &input_shape, const PaddingType padding_type) const
 {
@@ -455,28 +492,29 @@
 template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
 void NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::configure(
     const ITensor *biases,
-    const ITensor *output_workingspace,
+    const ITensor *transformed_output,
     const int      matrix_stride,
     ITensor       *output_nhwc,
     const int      num_batches,
     const int      num_rows,
     const int      num_cols,
-    const int      num_channels)
+    const int      num_channels,
+    ITensor       *workspace)
 {
-    _biases            = biases;
-    _output_workspace  = output_workingspace;
-    _matrix_stride     = matrix_stride;
-    _matrix_row_stride = roundup(num_channels, WinogradConv::N_BLOCK);
-    _output_nhwc       = output_nhwc;
-    _num_batches       = num_batches;
-    _num_rows          = num_rows;
-    _num_cols          = num_cols;
-    _num_channels      = num_channels;
+    _biases             = biases;
+    _workspace          = workspace;
+    _transformed_output = transformed_output;
+    _matrix_stride      = matrix_stride;
+    _matrix_row_stride  = roundup(num_channels, WinogradConv::N_BLOCK);
+    _output_nhwc        = output_nhwc;
+    _num_batches        = num_batches;
+    _num_rows           = num_rows;
+    _num_cols           = num_cols;
+    _num_channels       = num_channels;
     // We don't have the biases buffer at this stage as it hasn't been allocated; the OutputTransform created here is only used to compute the window
-    OutputTransform output_transform(nullptr, _matrix_stride, _matrix_row_stride, nullptr, nullptr, _num_batches, _num_rows, _num_cols, _num_channels);
-
+    _transform = arm_compute::support::cpp14::make_unique<OutputTransform>(num_batches, num_rows, num_cols, num_channels);
     Window win;
-    auto   win_last = output_transform.get_window();
+    auto   win_last = _transform->get_window();
     win.set(Window::DimX, Window::Dimension(0, win_last, 1));
     _output_nhwc->info()->set_valid_region(ValidRegion(Coordinates(), _output_nhwc->info()->tensor_shape()));
 
@@ -488,22 +526,22 @@
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(_output_workspace);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(_workspace);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(_transformed_output);
     ARM_COMPUTE_ERROR_ON_NULLPTR(_output_nhwc);
 
-    const int out_batch_stride = 0;
+    const int out_batch_stride = _output_nhwc->info()->strides_in_bytes()[3] / sizeof(T);
     const int out_row_stride   = _output_nhwc->info()->strides_in_bytes()[2] / sizeof(T);
     const int out_col_stride   = _output_nhwc->info()->strides_in_bytes()[1] / sizeof(T);
 
-    OutputTransform output_transform(reinterpret_cast<T *>(_output_workspace->buffer()), _matrix_stride, _matrix_row_stride,
-                                     (_biases ? reinterpret_cast<T *>(_biases->buffer() + _biases->info()->offset_first_element_in_bytes()) : nullptr),
-                                     reinterpret_cast<T *>(_output_nhwc->buffer() + _output_nhwc->info()->offset_first_element_in_bytes()),
-                                     _num_batches, _num_rows, _num_cols, _num_channels, out_batch_stride, out_row_stride, out_col_stride);
-
+    _transform->set_input_matrices(_transformed_output->buffer(), _matrix_stride, _matrix_row_stride);
+    _transform->set_bias((_biases ? reinterpret_cast<T *>(_biases->buffer() + _biases->info()->offset_first_element_in_bytes()) : nullptr));
+    _transform->set_output_tensor(_output_nhwc->buffer() + _output_nhwc->info()->offset_first_element_in_bytes(), out_batch_stride, out_row_stride, out_col_stride);
+    _transform->set_working_space(_workspace->buffer());
     // The code below cannot be moved to configure because the biases haven't been allocated at that point
     const size_t fst = window.x().start();
     const size_t lst = window.x().end();
-    output_transform.run(fst, lst);
+    _transform->run(fst, lst, info.thread_id);
 }
 
 template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
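
The Winograd refactor above replaces per-run stack-local transform objects with a `_transform` member built once in configure(); run() then only binds the tensor buffers, which do not exist yet at configure time, and executes its window slice. A minimal sketch of this configure-once / bind-buffers-per-run pattern, using a hypothetical `Transform` type rather than the library's classes:

```cpp
#include <memory>

class Kernel
{
public:
    void configure(int rows, int cols)
    {
        // Shape-dependent state is built once, before any buffers exist.
        _transform = std::make_unique<Transform>(rows, cols);
    }

    void run(void *input, void *output, void *workspace, int start, int end, int thread_id)
    {
        // Buffers are only known at run time (allocated after configure),
        // so they are bound immediately before execution.
        _transform->set_buffers(input, output, workspace);
        _transform->run(start, end, thread_id);
    }

private:
    struct Transform // stand-in for the Winograd transform classes
    {
        Transform(int, int) {}
        void set_buffers(void *, void *, void *) {}
        void run(int, int, int) {}
    };
    std::unique_ptr<Transform> _transform;
};
```
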
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
index b561659..0927123 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
@@ -28,49 +28,76 @@
 #include "arm_gemm.hpp"
 
 #include "gemm_common.hpp"
+#include "gemm_hybrid.hpp"
 #include "gemm_implementation.hpp"
 #include "gemm_interleaved.hpp"
+#include "gemm_native.hpp"
 
+#include "kernels/a32_sgemm_8x6.hpp"
 #include "kernels/a64_hgemm_24x8.hpp"
 #include "kernels/a64_sgemm_12x8.hpp"
-#include "kernels/a32_sgemm_8x6.hpp"
+#include "kernels/sve_hybrid_fp16_mla_4VLx4.hpp"
 #include "kernels/sve_interleaved_fp16_mla_3VLx8.hpp"
+#include "kernels/sve_native_fp16_mla_4VLx4.hpp"
 
 namespace arm_gemm {
 
 static const GemmImplementation<__fp16, __fp16> gemm_fp16_methods[] = {
 #if defined(__ARM_FEATURE_SVE)
 {
+    GemmMethod::GEMM_HYBRID,
+    "hybrid_fp16_mla_4VLx4",
+    [](const GemmArgs<__fp16> &args) { return (args._Ksize >= 8) && (args._alpha == 1.0f) && !args._trA && args._pretransposed_hint; },
+    [](const GemmArgs<__fp16> &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
+    [](const GemmArgs<__fp16> &args) { return new GemmHybrid<hybrid_fp16_mla_4VLx4, __fp16, __fp16>(args); }
+},
+{
+    GemmMethod::GEMM_NATIVE,
+    "native_fp16_mla_4VLx4",
+    [](const GemmArgs<__fp16> &args) { return (args._Ksize >= 8 && args._alpha==1.0f && !args._trA && !args._trB); },
+    [](const GemmArgs<__fp16> &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
+    [](const GemmArgs<__fp16> &args) { return new GemmNative<native_fp16_mla_4VLx4, __fp16, __fp16>(args); }
+},
+{
     GemmMethod::GEMM_INTERLEAVED,
     "interleaved_fp16_mla_3VLx8",
     [](const GemmArgs<__fp16> &args) { return (args._Ksize > 4); },
-    [](const GemmArgs<__fp16> &args) { return true; },
+    nullptr,
     [](const GemmArgs<__fp16> &args) { return new GemmInterleaved<interleaved_fp16_mla_3VLx8, __fp16, __fp16>(args); }
 },
 #endif
+
 #if defined(__aarch64__) && (defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) || defined(FP16_KERNELS))
 {
     GemmMethod::GEMM_INTERLEAVED,
     "hgemm_24x8",
-    [](const GemmArgs<__fp16> &args) {
 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        return args._ci->has_fp16();
+    [](const GemmArgs<__fp16> &args) { return args._ci->has_fp16(); },
 #else
-        return true;
+    nullptr,
 #endif
-    },
-    [](const GemmArgs<__fp16> &args) { return true; },
+    nullptr,
     [](const GemmArgs<__fp16> &args) { return new GemmInterleaved<hgemm_24x8, __fp16, __fp16>(args); }
 },
 #endif
-#if defined(__arm__)
+#ifdef __aarch64__
+{
+    GemmMethod::GEMM_INTERLEAVED,
+    "sgemm_12x8",
+    nullptr,
+    nullptr,
+    [](const GemmArgs<__fp16> &args) { return new GemmInterleaved<sgemm_12x8, __fp16, __fp16>(args); }
+},
+#elif defined(__arm__)
 {
     GemmMethod::GEMM_INTERLEAVED,
     "sgemm_8x6",
-    [](const GemmArgs<__fp16> &args) { return true; },
-    [](const GemmArgs<__fp16> &args) { return true; },
+    nullptr,
+    nullptr,
     [](const GemmArgs<__fp16> &args) { return new GemmInterleaved<sgemm_8x6, __fp16, __fp16>(args); }
 },
+#else // not AArch64 or AArch32
+# error Unknown Architecture
 #endif
 {
     GemmMethod::DEFAULT,
@@ -90,8 +117,8 @@
 template UniqueGemmCommon<__fp16, __fp16> gemm<__fp16, __fp16>(const GemmArgs<__fp16> &args);
 template KernelDescription get_gemm_method<__fp16, __fp16>(const GemmArgs<__fp16> &args);
 template bool method_is_compatible<__fp16, __fp16>(GemmMethod method, const GemmArgs<__fp16> &args);
-template std::vector<std::string> get_compatible_kernels<__fp16, __fp16> (const GemmArgs<__fp16> &args);
+template std::vector<KernelDescription> get_compatible_kernels<__fp16, __fp16> (const GemmArgs<__fp16> &args);
 
 } // namespace arm_gemm
 
-#endif // __ARM_FP16_ARGS
\ No newline at end of file
+#endif // __ARM_FP16_ARGS
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
index 8bc33cc..6869279 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
@@ -32,6 +32,7 @@
 #include "gemv_pretransposed.hpp"
 
 #include "kernels/a32_sgemm_8x6.hpp"
+#include "kernels/a64_hybrid_fp32_mla_16x4.hpp"
 #include "kernels/a64_sgemm_12x8.hpp"
 #include "kernels/a64_sgemm_native_16x4.hpp"
 #include "kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp"
@@ -112,6 +113,13 @@
     [](const GemmArgs<float> &args) { return new GemmHybrid<sgemm_nativeA_pretransposeB_16x4, float, float>(args); }
 },
 {
+    GemmMethod::GEMM_HYBRID,
+    "hybrid_fp32_mla_16x4",
+    [](const GemmArgs<float> &args) { return (args._Ksize >= 4) && (args._alpha == 1.0f) && !args._trA && args._pretransposed_hint; },
+    [](const GemmArgs<float> &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
+    [](const GemmArgs<float> &args) { return new GemmHybrid<hybrid_fp32_mla_16x4, float, float>(args); }
+},
+{
     GemmMethod::GEMM_NATIVE,
     "sgemm_native_16x4",
     [](const GemmArgs<float> &args) { return (args._Ksize>4 && (args._Nsize % 16)==0 && args._alpha==1.0f && !args._trA && !args._trB); },
@@ -165,6 +173,6 @@
 template UniqueGemmCommon<float, float> gemm<float, float>(const GemmArgs<float> &args);
 template KernelDescription get_gemm_method<float, float>(const GemmArgs<float> &args);
 template bool method_is_compatible<float, float>(GemmMethod method, const GemmArgs<float> &args);
-template std::vector<std::string> get_compatible_kernels<float, float> (const GemmArgs<float> &args);
+template std::vector<KernelDescription> get_compatible_kernels<float, float> (const GemmArgs<float> &args);
 
-} // namespace arm_gemm
\ No newline at end of file
+} // namespace arm_gemm
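
The new hybrid_fp32_mla_16x4 entry is gated on Ksize >= 4, alpha == 1.0f, an untransposed A, and the pretransposed hint; its recommendation heuristic favours small problems or multi-GEMM workloads that leave each thread with few output rows. Restated as a stand-alone predicate for readability (a sketch, assuming the fields mean what their names suggest):

    // Mirror of the is-recommended lambda for hybrid_fp32_mla_16x4.
    bool hybrid_fp32_recommended(unsigned Msize, unsigned Nsize, unsigned Ksize,
                                 unsigned nmulti, unsigned maxthreads) {
        const bool small_problem   = (Ksize <= 256) && (Nsize <= 256);
        const bool few_rows_thread = (nmulti > 1) && ((Msize / maxthreads) < 8);
        return small_problem || few_rows_thread;
    }

For example, Msize=32 with maxthreads=8 and nmulti=4 qualifies even at Ksize=Nsize=512, since 32/8 = 4 < 8.
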
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
index c2bd0bb..82e0625 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
@@ -208,7 +208,6 @@
         return roundup(_Nsize, strategy::out_width()) * roundup(_Ksize, strategy::k_unroll()) * _nmulti * sizeof(Toi);
     }
 
-    using GemmCommon<To, Tr>::pretranspose_B_array;
     void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
         Toi *buffer = reinterpret_cast<Toi *>(in_buffer);
         _B_transposed = buffer;
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
index bf80784..d952140 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
@@ -112,8 +112,12 @@
 }
 
 template<typename Top, typename Tret>
-std::vector<std::string> get_compatible_kernels(const GemmArgs<Tret> &args) {
-    std::vector<std::string> res;
+std::vector<KernelDescription> get_compatible_kernels(const GemmArgs<Tret> &args) {
+    std::vector<KernelDescription> res;
+
+    /* Find out what the default implementation is so we can set the flag accordingly later. */
+    const GemmImplementation<Top, Tret> *default_impl;
+    find_implementation(args, default_impl);
 
     auto gemms = gemm_implementation_list<Top, Tret>();
 
@@ -123,7 +127,7 @@
             continue;
         }
 
-        res.push_back(i->name);
+        res.push_back(KernelDescription(i->method, i->name, i==default_impl));
     }
 
     return res;
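
With get_compatible_kernels returning KernelDescription objects rather than bare names, a caller can see which of the compatible kernels the heuristics would pick. A hypothetical usage sketch (the name/is_default fields are inferred from the constructor call above):

    #include <cstdio>

    // Sketch: list the kernels compatible with 'args', flagging the default.
    template<typename Top, typename Tret>
    void dump_compatible(const GemmArgs<Tret> &args) {
        for (const auto &k : get_compatible_kernels<Top, Tret>(args)) {
            std::printf("%-32s%s\n", k.name.c_str(),
                        k.is_default ? " (default)" : "");
        }
    }
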
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
index b4503dd..0db0654 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
@@ -58,7 +58,7 @@
 template UniqueGemmCommon<int16_t, int32_t> gemm<int16_t, int32_t>(const GemmArgs<int32_t> &args);
 template KernelDescription get_gemm_method<int16_t, int32_t>(const GemmArgs<int32_t> &args);
 template bool method_is_compatible<int16_t, int32_t>(GemmMethod method, const GemmArgs<int32_t> &args);
-template std::vector<std::string> get_compatible_kernels<int16_t, int32_t> (const GemmArgs<int32_t> &args);
+template std::vector<KernelDescription> get_compatible_kernels<int16_t, int32_t> (const GemmArgs<int32_t> &args);
 
 } // namespace arm_gemm
 
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
index 5811c2a..9e49df1 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,6 +34,7 @@
 #include "kernels/a64_gemm_s8_12x8.hpp"
 #include "kernels/a64_gemm_s8_4x4.hpp"
 #include "kernels/a64_hybrid_s8s32_dot_16x4.hpp"
+#include "kernels/sve_hybrid_s8s32_dot_4VLx4.hpp"
 #include "kernels/sve_interleaved_s8s32_dot_3VLx8.hpp"
 #include "kernels/sve_native_s8s32_dot_4VLx4.hpp"
 
@@ -42,6 +43,13 @@
 static const GemmImplementation<int8_t, int32_t> gemm_s8_methods[] = {
 #ifdef __ARM_FEATURE_SVE
 {
+    GemmMethod::GEMM_HYBRID,
+    "hybrid_s8s32_dot_4VLx4",
+    [](const GemmArgs<int32_t> &args) { return args._Ksize>=16 && args._alpha==1 && !args._trA && !args._trB && args._pretransposed_hint; },
+    [](const GemmArgs<int32_t> &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
+    [](const GemmArgs<int32_t> &args) { return new GemmHybrid<hybrid_s8s32_dot_4VLx4, int8_t, int32_t>(args); }
+},
+{
     GemmMethod::GEMM_NATIVE,
     "native_s8s32_dot_4VLx4",
     [](const GemmArgs<int32_t> &args) { return (args._Ksize>=16 && args._alpha==1 && !args._trA && !args._trB); },
@@ -59,7 +67,7 @@
 {
     GemmMethod::GEMM_HYBRID,
     "hybrid_s8s32_dot_16x4",
-    [](const GemmArgs<int32_t> &args) { return args._ci->has_dotprod() && args._Ksize>=16 && (args._Ksize % 16 == 0) && (args._Nsize % 16 == 0) && !args._trA && !args._trB && args._pretransposed_hint; },
+    [](const GemmArgs<int32_t> &args) { return args._ci->has_dotprod() && args._Ksize>=16 && !args._trA && !args._trB && args._pretransposed_hint; },
     [](const GemmArgs<int32_t> &args) { return args._Nsize<=256 && args._Ksize>128; },
     [](const GemmArgs<int32_t> &args) { return new GemmHybrid<hybrid_s8s32_dot_16x4, int8_t, int32_t>(args); }
 },
@@ -95,7 +103,7 @@
 template UniqueGemmCommon<int8_t, int32_t> gemm<int8_t, int32_t>(const GemmArgs<int32_t> &args);
 template KernelDescription get_gemm_method<int8_t, int32_t>(const GemmArgs<int32_t> &args);
 template bool method_is_compatible<int8_t, int32_t>(GemmMethod method, const GemmArgs<int32_t> &args);
-template std::vector<std::string> get_compatible_kernels<int8_t, int32_t> (const GemmArgs<int32_t> &args);
+template std::vector<KernelDescription> get_compatible_kernels<int8_t, int32_t> (const GemmArgs<int32_t> &args);
 
 } // namespace arm_gemm
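
Two changes land in this table: an SVE hybrid dot-product entry is added ahead of the native one, and the hybrid_s8s32_dot_16x4 gate drops the requirement that Ksize and Nsize be multiples of 16, so a shape like K=20, N=24 (which passes Ksize >= 16 but failed the old Ksize % 16 == 0 test) can now reach this kernel. The uint8 table below gets the identical pair of changes. The relaxed gate, restated as a plain predicate (a sketch, argument names assumed):

    // Relaxed support gate for hybrid_s8s32_dot_16x4.
    bool hybrid_s8_16x4_supported(bool has_dotprod, unsigned Ksize,
                                  bool trA, bool trB, bool pretransposed_hint) {
        return has_dotprod && (Ksize >= 16) && !trA && !trB && pretransposed_hint;
    }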
 
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
index b83ccd3..a773166 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -480,7 +480,6 @@
         return total;
     }
 
-    using GemmCommon<To, Tr>::pretranspose_B_array;
     void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
         blockwalker current(*this);
         Toi *buffer = reinterpret_cast<Toi *>(in_buffer);
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
index 6bcbca9..9e3e4e4 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
@@ -58,7 +58,7 @@
 template UniqueGemmCommon<uint16_t, uint32_t> gemm<uint16_t, uint32_t>(const GemmArgs<uint32_t> &args);
 template KernelDescription get_gemm_method<uint16_t, uint32_t>(const GemmArgs<uint32_t> &args);
 template bool method_is_compatible<uint16_t, uint32_t>(GemmMethod method, const GemmArgs<uint32_t> &args);
-template std::vector<std::string> get_compatible_kernels<uint16_t, uint32_t> (const GemmArgs<uint32_t> &args);
+template std::vector<KernelDescription> get_compatible_kernels<uint16_t, uint32_t> (const GemmArgs<uint32_t> &args);
 
 } // namespace arm_gemm
 
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
index b95ca80..9321bfc 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,6 +34,7 @@
 #include "kernels/a64_gemm_u8_12x8.hpp"
 #include "kernels/a64_gemm_u8_4x4.hpp"
 #include "kernels/a64_hybrid_u8u32_dot_16x4.hpp"
+#include "kernels/sve_hybrid_u8u32_dot_4VLx4.hpp"
 #include "kernels/sve_interleaved_u8u32_dot_3VLx8.hpp"
 #include "kernels/sve_native_u8u32_dot_4VLx4.hpp"
 
@@ -42,6 +43,13 @@
 static const GemmImplementation<uint8_t, uint32_t> gemm_u8_methods[] = {
 #ifdef __ARM_FEATURE_SVE
 {
+    GemmMethod::GEMM_HYBRID,
+    "hybrid_u8u32_dot_4VLx4",
+    [](const GemmArgs<uint32_t> &args) { return args._Ksize>=16 && args._alpha==1 && !args._trA && !args._trB && args._pretransposed_hint; },
+    [](const GemmArgs<uint32_t> &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
+    [](const GemmArgs<uint32_t> &args) { return new GemmHybrid<hybrid_u8u32_dot_4VLx4, uint8_t, uint32_t>(args); }
+},
+{
     GemmMethod::GEMM_NATIVE,
     "native_u8u32_dot_4VLx4",
     [](const GemmArgs<uint32_t> &args) { return (args._Ksize>=16 && args._alpha==1 && !args._trA && !args._trB); },
@@ -59,7 +67,7 @@
 {
     GemmMethod::GEMM_HYBRID,
     "hybrid_u8u32_dot_16x4",
-    [](const GemmArgs<uint32_t> &args) { return args._ci->has_dotprod() && args._Ksize>=16 && (args._Ksize % 16 == 0) && (args._Nsize % 16 == 0) && !args._trA && !args._trB && args._pretransposed_hint; },
+    [](const GemmArgs<uint32_t> &args) { return args._ci->has_dotprod() && args._Ksize>=16 && !args._trA && !args._trB && args._pretransposed_hint; },
     [](const GemmArgs<uint32_t> &args) { return args._Nsize<=256 && args._Ksize>128; },
     [](const GemmArgs<uint32_t> &args) { return new GemmHybrid<hybrid_u8u32_dot_16x4, uint8_t, uint32_t>(args); }
 },
@@ -95,7 +103,7 @@
 template UniqueGemmCommon<uint8_t, uint32_t> gemm<uint8_t, uint32_t>(const GemmArgs<uint32_t> &args);
 template KernelDescription get_gemm_method<uint8_t, uint32_t>(const GemmArgs<uint32_t> &args);
 template bool method_is_compatible<uint8_t, uint32_t>(GemmMethod method, const GemmArgs<uint32_t> &args);
-template std::vector<std::string> get_compatible_kernels<uint8_t, uint32_t> (const GemmArgs<uint32_t> &args);
+template std::vector<KernelDescription> get_compatible_kernels<uint8_t, uint32_t> (const GemmArgs<uint32_t> &args);
 
 } // namespace arm_gemm
 
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
index 32d668f..b7f9de8 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
@@ -44,10 +44,9 @@
         _subgemm = gemm<To,Tr>(newargs);
     }
 
-    using GemmCommon<To, Tr>::set_arrays;
     void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride,
                     const To *B, const int ldb, const int B_multi_stride,
-                          Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride) override {
+                    Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride) override {
         /* A and C's batch stride becomes their new row stride.  New batch stride is 0 as nbatches for subgemm is always 1. */
         _subgemm->set_arrays(A, A_batch_stride, 0, A_multi_stride,
                              B, ldb, B_multi_stride,
@@ -86,7 +85,6 @@
         return _subgemm->get_B_pretransposed_array_size();
     }
 
-    using GemmCommon<To, Tr>::pretranspose_B_array;
     void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) override {
         _subgemm->pretranspose_B_array(buffer, B, ldb, B_multi_stride);
     }
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
index f7beb0a..21f8278 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
@@ -148,7 +148,6 @@
         return _buffer_per_multi * _nmultis * sizeof(To);
     }
 
-    using GemmCommon<To, Tr>::pretranspose_B_array;
     void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) override {
         Toi *A_buffer = reinterpret_cast<Toi *>(buffer);
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp
new file mode 100644
index 0000000..5605939
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+
+#include "../std_transforms_fixed.hpp"
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void a64_hybrid_fp32_mla_16x4(const float *, int, const float *, float *, int, float, int, int, int);
+void a64_hybrid_fp32_mla_16x4_a55(const float *, int, const float *, float *, int, float, int, int, int);
+
+class hybrid_fp32_mla_16x4
+{
+public:
+    typedef float operand_type;
+    typedef float result_type;
+
+    typedef void (*kern_type)(const float *, int, const float *, float *, int, float, int, int, int);
+
+    /* Kernel blocking parameters */
+    static unsigned int out_height()
+    {
+        return 4;
+    }
+
+    static unsigned int out_width()
+    {
+        return 16;
+    }
+
+    static unsigned int k_unroll()
+    {
+        return 1;
+    }
+
+    StdTransformsFixed<operand_type, result_type, 4, 16, 1> transforms = {};
+
+    // Default to the generic kernel
+    kern_type kernel=a64_hybrid_fp32_mla_16x4;
+
+    hybrid_fp32_mla_16x4(const CPUInfo *ci)
+    {
+        if (ci->get_cpu_model() == CPUModel::A55r1) {
+            kernel = a64_hybrid_fp32_mla_16x4_a55;
+        }
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
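
The strategy above fixes a 16-wide by 4-high output tile with no K unrolling, and swaps in a Cortex-A55-tuned body when CPUInfo reports an A55r1. The tile arithmetic this implies, sketched for illustration only:

    // Illustrative only: how many 16x4 tiles cover an M x N output.
    static unsigned tile_count(unsigned M, unsigned N) {
        const unsigned tiles_m = (M + 4 - 1) / 4;    // out_height() == 4
        const unsigned tiles_n = (N + 16 - 1) / 16;  // out_width()  == 16
        return tiles_m * tiles_n;                    // e.g. M=6, N=20 -> 4 tiles
    }

Ragged right-hand tiles (width < 16) go through the stack-side result buffer visible as 'use_result_buffer' in the kernel body below.
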
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp
new file mode 100644
index 0000000..7261761
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp
@@ -0,0 +1,2352 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <algorithm>
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void a64_hybrid_fp32_mla_16x4_a55(const float *A, int lda, const float *B, float *C, int ldc, float beta, int M, int N, int K) {
+    const long beta0 = (beta == 0.0f);
+    const int K_stride = K;
+    const long loops_count = ((K + 4) / 8) - 1;
+    K -= loops_count * 8;
+    const long regs_count = (K / 4) - 1;
+    K -= (regs_count + 1) * 4;
+    const long blocks_count = K / 1;
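+    // K is consumed in three phases: 'loops_count' main-loop iterations of 8
+    // values each, one fixed pass of (regs_count + 1) * 4 values (the gate in
+    // gemm_fp32.cpp guarantees K >= 4), and 'blocks_count' single-value tail
+    // steps, so the original K == loops_count*8 + (regs_count+1)*4 + blocks_count.
+    // e.g. K=13: loops_count=1, regs_count=0, blocks_count=1 (8 + 4 + 1 = 13).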
+
+    for (int y=0; y<M; y+=4) {
+        const float * const a_ptr0_base = A + (y * lda);
+        const unsigned long ldab = lda * sizeof(float);
+
+        float *c_ptr0 = C + (y * ldc);
+
+        for (int x0=0; x0<N; x0+=16ul) {
+            const long width = std::min((unsigned long)N-x0, 16ul);
+            const float *betaptr = &beta;
+            long loops = loops_count;
+            long regs = regs_count;
+            long blocks = blocks_count;
+            const float *a_ptr0 = a_ptr0_base;
+            const float *b_ptr0 = B + (K_stride * x0);
+            const bool use_result_buffer = (width < 16);
+            float result_buffer[64];
+            const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(float);
+            float *c_ptr_real = c_ptr0;
+            if (use_result_buffer && !beta0) {
+                for(int cy=0; cy<std::min(M-y, 4); cy++) {
+                    for(unsigned int cx=0; cx<width; cx++) {
+                        result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
+                    }
+                }
+            }
+            if (use_result_buffer) {
+                c_ptr0 = result_buffer;
+            }
+
+            switch(M-y) {
+                case 1:
+                    __asm __volatile (
+                        "temploadreg0 .req X0\n"
+                        "temploadreg1 .req X1\n"
+                        "temploadreg2 .req X2\n"
+                        "temploadreg3 .req X3\n"
+                        "cbz %[beta0], 1f\n"
+                        "movi v16.4s, #0\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "movi v17.4s, #0\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "movi v18.4s, #0\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "movi v19.4s, #0\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1r {v15.4s}, [%[betaptr]]\n"
+                        "ldr q16, [%[c_ptr0]]\n"
+                        "ldr q17, [%[c_ptr0], #0x10]\n"
+                        "ldr q18, [%[c_ptr0], #0x20]\n"
+                        "ldr q19, [%[c_ptr0], #0x30]\n"
+                        "fmul v16.4s, v16.4s, v15.4s\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "fmul v17.4s, v17.4s, v15.4s\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmul v18.4s, v18.4s, v15.4s\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmul v19.4s, v19.4s, v15.4s\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "ldr d0, [%[a_ptr0], #-0x10]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "fmla v16.4s, v12.4s, v4.s[1]\n"
+                        "ins v0.d[1], temploadreg0\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v17.4s, v13.4s, v4.s[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v18.4s, v14.4s, v4.s[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v19.4s, v15.4s, v4.s[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        "fmla v16.4s, v12.4s, v4.s[3]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v17.4s, v13.4s, v4.s[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v18.4s, v14.4s, v4.s[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v19.4s, v15.4s, v4.s[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "cbz %[regs], 4f\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "fmla v16.4s, v12.4s, v4.s[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v17.4s, v13.4s, v4.s[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v18.4s, v14.4s, v4.s[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v19.4s, v15.4s, v4.s[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v16.4s, v12.4s, v4.s[3]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v17.4s, v13.4s, v4.s[3]\n"
+                        "fmla v18.4s, v14.4s, v4.s[3]\n"
+                        "fmla v19.4s, v15.4s, v4.s[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "5:\n"
+                        "cbz %[blocks], 6f\n"
+                        "7:\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "ldr s0, [%[a_ptr0]]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "b.ne 7b\n"
+                        "6:\n"
+                        "str q16, [%[c_ptr0]]\n"
+                        "str q17, [%[c_ptr0], #0x10]\n"
+                        "str q18, [%[c_ptr0], #0x20]\n"
+                        "str q19, [%[c_ptr0], #0x30]\n"
+                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                        ".unreq temploadreg0\n"
+                        ".unreq temploadreg1\n"
+                        ".unreq temploadreg2\n"
+                        ".unreq temploadreg3\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
+                    );
+                    break;
+                case 2:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "c_ptr1 .req X1\n"
+                        "temploadreg0 .req X2\n"
+                        "temploadreg1 .req X3\n"
+                        "temploadreg2 .req X4\n"
+                        "temploadreg3 .req X5\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "cbz %[beta0], 1f\n"
+                        "movi v16.4s, #0\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "movi v17.4s, #0\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "movi v18.4s, #0\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "movi v19.4s, #0\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "movi v20.4s, #0\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "movi v21.4s, #0\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "movi v22.4s, #0\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "movi v23.4s, #0\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1r {v15.4s}, [%[betaptr]]\n"
+                        "ldr q16, [%[c_ptr0]]\n"
+                        "ldr q17, [%[c_ptr0], #0x10]\n"
+                        "ldr q18, [%[c_ptr0], #0x20]\n"
+                        "ldr q19, [%[c_ptr0], #0x30]\n"
+                        "fmul v16.4s, v16.4s, v15.4s\n"
+                        "ldr q20, [c_ptr1]\n"
+                        "fmul v17.4s, v17.4s, v15.4s\n"
+                        "ldr q21, [c_ptr1, #0x10]\n"
+                        "fmul v18.4s, v18.4s, v15.4s\n"
+                        "ldr q22, [c_ptr1, #0x20]\n"
+                        "fmul v19.4s, v19.4s, v15.4s\n"
+                        "ldr q23, [c_ptr1, #0x30]\n"
+                        "fmul v20.4s, v20.4s, v15.4s\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "fmul v21.4s, v21.4s, v15.4s\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "fmul v22.4s, v22.4s, v15.4s\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmul v23.4s, v23.4s, v15.4s\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "ldr d5, [a_ptr1]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        "fmla v20.4s, v12.4s, v1.s[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "fmla v21.4s, v13.4s, v1.s[1]\n"
+                        "ins v5.d[1], temploadreg1\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "fmla v22.4s, v14.4s, v1.s[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "fmla v23.4s, v15.4s, v1.s[1]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v20.4s, v12.4s, v1.s[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v21.4s, v13.4s, v1.s[3]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        "fmla v22.4s, v14.4s, v1.s[3]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        "fmla v23.4s, v15.4s, v1.s[3]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        "fmla v20.4s, v8.4s, v5.s[0]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        "fmla v21.4s, v9.4s, v5.s[0]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "fmla v22.4s, v10.4s, v5.s[0]\n"
+                        "ldr d0, [%[a_ptr0], #-0x10]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+                        "fmla v23.4s, v11.4s, v5.s[0]\n"
+                        "ldr d1, [a_ptr1, #-0x10]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "fmla v16.4s, v12.4s, v4.s[1]\n"
+                        "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+                        "fmla v20.4s, v12.4s, v5.s[1]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "ins v0.d[1], temploadreg0\n"
+                        "fmla v17.4s, v13.4s, v4.s[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "fmla v21.4s, v13.4s, v5.s[1]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "ins v1.d[1], temploadreg1\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v18.4s, v14.4s, v4.s[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "fmla v22.4s, v14.4s, v5.s[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v19.4s, v15.4s, v4.s[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "fmla v23.4s, v15.4s, v5.s[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "fmla v20.4s, v8.4s, v5.s[2]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "fmla v21.4s, v9.4s, v5.s[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "fmla v22.4s, v10.4s, v5.s[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v23.4s, v11.4s, v5.s[2]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        "fmla v16.4s, v12.4s, v4.s[3]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v20.4s, v12.4s, v5.s[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        "fmla v17.4s, v13.4s, v4.s[3]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v21.4s, v13.4s, v5.s[3]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v18.4s, v14.4s, v4.s[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        "fmla v22.4s, v14.4s, v5.s[3]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v19.4s, v15.4s, v4.s[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        "fmla v23.4s, v15.4s, v5.s[3]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "prfm PSTL1KEEP, [c_ptr1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "cbz %[regs], 4f\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr d5, [a_ptr1]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "ins v5.d[1], temploadreg1\n"
+                        "fmla v20.4s, v12.4s, v1.s[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "fmla v21.4s, v13.4s, v1.s[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "fmla v22.4s, v14.4s, v1.s[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "fmla v23.4s, v15.4s, v1.s[1]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v20.4s, v12.4s, v1.s[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v21.4s, v13.4s, v1.s[3]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        "fmla v22.4s, v14.4s, v1.s[3]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        "fmla v23.4s, v15.4s, v1.s[3]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        "fmla v20.4s, v8.4s, v5.s[0]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        "fmla v21.4s, v9.4s, v5.s[0]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "fmla v22.4s, v10.4s, v5.s[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "fmla v23.4s, v11.4s, v5.s[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "fmla v16.4s, v12.4s, v4.s[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "fmla v20.4s, v12.4s, v5.s[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v17.4s, v13.4s, v4.s[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "fmla v21.4s, v13.4s, v5.s[1]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v18.4s, v14.4s, v4.s[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "fmla v22.4s, v14.4s, v5.s[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v19.4s, v15.4s, v4.s[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "fmla v23.4s, v15.4s, v5.s[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "fmla v20.4s, v8.4s, v5.s[2]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "fmla v21.4s, v9.4s, v5.s[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "fmla v22.4s, v10.4s, v5.s[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v23.4s, v11.4s, v5.s[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v16.4s, v12.4s, v4.s[3]\n"
+                        "fmla v20.4s, v12.4s, v5.s[3]\n"
+                        "fmla v17.4s, v13.4s, v4.s[3]\n"
+                        "fmla v21.4s, v13.4s, v5.s[3]\n"
+                        "fmla v18.4s, v14.4s, v4.s[3]\n"
+                        "fmla v22.4s, v14.4s, v5.s[3]\n"
+                        "fmla v19.4s, v15.4s, v4.s[3]\n"
+                        "fmla v23.4s, v15.4s, v5.s[3]\n"
+                        "b 5f\n"
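+                        // 4: shorter tail, only four K values (v0/v1) remain
+                        // before the scalar remainder loop.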
+                        "4:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v20.4s, v12.4s, v1.s[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "fmla v21.4s, v13.4s, v1.s[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v22.4s, v14.4s, v1.s[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "fmla v23.4s, v15.4s, v1.s[1]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "fmla v20.4s, v12.4s, v1.s[3]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "fmla v21.4s, v13.4s, v1.s[3]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "fmla v22.4s, v14.4s, v1.s[3]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "fmla v23.4s, v15.4s, v1.s[3]\n"
+                        "5:\n"
+                        "cbz %[blocks], 6f\n"
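+                        // 7: remainder loop, one K value per iteration: one column
+                        // block of B (q8-q11) times one float from each A row.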
+                        "7:\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "ldr s0, [%[a_ptr0]]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr s1, [a_ptr1]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x4\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "b.ne 7b\n"
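+                        // 6: store the finished 2x16 tile, four q-registers per row.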
+                        "6:\n"
+                        "str q16, [%[c_ptr0]]\n"
+                        "str q17, [%[c_ptr0], #0x10]\n"
+                        "str q18, [%[c_ptr0], #0x20]\n"
+                        "str q19, [%[c_ptr0], #0x30]\n"
+                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                        "str q20, [c_ptr1]\n"
+                        "str q21, [c_ptr1, #0x10]\n"
+                        "str q22, [c_ptr1, #0x20]\n"
+                        "str q23, [c_ptr1, #0x30]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq temploadreg0\n"
+                        ".unreq temploadreg1\n"
+                        ".unreq temploadreg2\n"
+                        ".unreq temploadreg3\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+                    );
+                    break;
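+                // Three-row variant: A rows live in v0-v2 (next K block in v4-v6),
+                // accumulators in v16-v27, with extra pointers a_ptr1/2 and c_ptr1/2.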
+                case 3:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "c_ptr1 .req X2\n"
+                        "c_ptr2 .req X3\n"
+                        "temploadreg0 .req X4\n"
+                        "temploadreg1 .req X5\n"
+                        "temploadreg2 .req X6\n"
+                        "temploadreg3 .req X7\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "cbz %[beta0], 1f\n"
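+                        // beta == 0: C is not read; start the accumulators at zero
+                        // while preloading the first A and B blocks.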
+                        "movi v16.4s, #0\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "movi v17.4s, #0\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "movi v18.4s, #0\n"
+                        "ldr q2, [a_ptr2]\n"
+                        "movi v19.4s, #0\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "movi v20.4s, #0\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "movi v21.4s, #0\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "movi v22.4s, #0\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "movi v23.4s, #0\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "movi v24.4s, #0\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "movi v25.4s, #0\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "movi v26.4s, #0\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "movi v27.4s, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
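+                        // 1: beta != 0, so load the existing C tile and pre-scale it
+                        // by beta (broadcast into v15) before accumulating.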
+                        "1:\n"
+                        "ld1r {v15.4s}, [%[betaptr]]\n"
+                        "ldr q16, [%[c_ptr0]]\n"
+                        "ldr q17, [%[c_ptr0], #0x10]\n"
+                        "ldr q18, [%[c_ptr0], #0x20]\n"
+                        "ldr q19, [%[c_ptr0], #0x30]\n"
+                        "fmul v16.4s, v16.4s, v15.4s\n"
+                        "ldr q20, [c_ptr1]\n"
+                        "fmul v17.4s, v17.4s, v15.4s\n"
+                        "ldr q21, [c_ptr1, #0x10]\n"
+                        "fmul v18.4s, v18.4s, v15.4s\n"
+                        "ldr q22, [c_ptr1, #0x20]\n"
+                        "fmul v19.4s, v19.4s, v15.4s\n"
+                        "ldr q23, [c_ptr1, #0x30]\n"
+                        "fmul v20.4s, v20.4s, v15.4s\n"
+                        "ldr q24, [c_ptr2]\n"
+                        "fmul v21.4s, v21.4s, v15.4s\n"
+                        "ldr q25, [c_ptr2, #0x10]\n"
+                        "fmul v22.4s, v22.4s, v15.4s\n"
+                        "ldr q26, [c_ptr2, #0x20]\n"
+                        "fmul v23.4s, v23.4s, v15.4s\n"
+                        "ldr q27, [c_ptr2, #0x30]\n"
+                        "fmul v24.4s, v24.4s, v15.4s\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "fmul v25.4s, v25.4s, v15.4s\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "fmul v26.4s, v26.4s, v15.4s\n"
+                        "ldr q2, [a_ptr2]\n"
+                        "fmul v27.4s, v27.4s, v15.4s\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "cbz %[loops], 2f\n"
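+                        // 3: main K loop, eight K values per iteration. B (and late A)
+                        // vectors are assembled from paired "ldr d"/"ldr x"/"ins"
+                        // sequences so no single load is wider than 64 bits, a pattern
+                        // that favours dual-issue in-order cores such as the Cortex-A55.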
+                        "3:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "ldr d5, [a_ptr1]\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr d6, [a_ptr2]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "ins v5.d[1], temploadreg1\n"
+                        "fmla v20.4s, v12.4s, v1.s[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "fmla v24.4s, v12.4s, v2.s[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "ins v6.d[1], temploadreg2\n"
+                        "fmla v21.4s, v13.4s, v1.s[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "fmla v25.4s, v13.4s, v2.s[1]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v22.4s, v14.4s, v1.s[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "fmla v26.4s, v14.4s, v2.s[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "fmla v23.4s, v15.4s, v1.s[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "fmla v27.4s, v15.4s, v2.s[1]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "fmla v24.4s, v8.4s, v2.s[2]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "fmla v25.4s, v9.4s, v2.s[2]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v26.4s, v10.4s, v2.s[2]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v27.4s, v11.4s, v2.s[2]\n"
+                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v20.4s, v12.4s, v1.s[3]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v24.4s, v12.4s, v2.s[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v21.4s, v13.4s, v1.s[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        "fmla v25.4s, v13.4s, v2.s[3]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        "fmla v22.4s, v14.4s, v1.s[3]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v26.4s, v14.4s, v2.s[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        "fmla v23.4s, v15.4s, v1.s[3]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v27.4s, v15.4s, v2.s[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v20.4s, v8.4s, v5.s[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        "fmla v24.4s, v8.4s, v6.s[0]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        "fmla v21.4s, v9.4s, v5.s[0]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v25.4s, v9.4s, v6.s[0]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "fmla v22.4s, v10.4s, v5.s[0]\n"
+                        "ldr d0, [%[a_ptr0], #-0x10]\n"
+                        "fmla v26.4s, v10.4s, v6.s[0]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+                        "fmla v23.4s, v11.4s, v5.s[0]\n"
+                        "ldr d1, [a_ptr1, #-0x10]\n"
+                        "fmla v27.4s, v11.4s, v6.s[0]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "fmla v16.4s, v12.4s, v4.s[1]\n"
+                        "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+                        "fmla v20.4s, v12.4s, v5.s[1]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v24.4s, v12.4s, v6.s[1]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v13.4s, v4.s[1]\n"
+                        "ins v0.d[1], temploadreg0\n"
+                        "fmla v21.4s, v13.4s, v5.s[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "fmla v25.4s, v13.4s, v6.s[1]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v14.4s, v4.s[1]\n"
+                        "ins v1.d[1], temploadreg1\n"
+                        "fmla v22.4s, v14.4s, v5.s[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "fmla v26.4s, v14.4s, v6.s[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                        "ldr d2, [a_ptr2, #-0x10]\n"
+                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+                        "fmla v19.4s, v15.4s, v4.s[1]\n"
+                        "ldr temploadreg2, [a_ptr2, #-0x8]\n"
+                        "fmla v23.4s, v15.4s, v5.s[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "fmla v27.4s, v15.4s, v6.s[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "ins v2.d[1], temploadreg2\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "fmla v20.4s, v8.4s, v5.s[2]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v24.4s, v8.4s, v6.s[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "fmla v21.4s, v9.4s, v5.s[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "fmla v25.4s, v9.4s, v6.s[2]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "fmla v22.4s, v10.4s, v5.s[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v26.4s, v10.4s, v6.s[2]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v23.4s, v11.4s, v5.s[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v27.4s, v11.4s, v6.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v16.4s, v12.4s, v4.s[3]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v20.4s, v12.4s, v5.s[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        "fmla v24.4s, v12.4s, v6.s[3]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v17.4s, v13.4s, v4.s[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        "fmla v21.4s, v13.4s, v5.s[3]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v25.4s, v13.4s, v6.s[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        "fmla v18.4s, v14.4s, v4.s[3]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v22.4s, v14.4s, v5.s[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        "fmla v26.4s, v14.4s, v6.s[3]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        "fmla v19.4s, v15.4s, v4.s[3]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v23.4s, v15.4s, v5.s[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        "fmla v27.4s, v15.4s, v6.s[3]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "b.ne 3b\n"
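+                        // 2: main loop done; prefetch the output rows for the store,
+                        // then choose the longer (regs) or shorter tail.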
+                        "2:\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "prfm PSTL1KEEP, [c_ptr1]\n"
+                        "prfm PSTL1KEEP, [c_ptr2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "cbz %[regs], 4f\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "ldr d5, [a_ptr1]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "ldr d6, [a_ptr2]\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "ins v5.d[1], temploadreg1\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "fmla v20.4s, v12.4s, v1.s[1]\n"
+                        "ins v6.d[1], temploadreg2\n"
+                        "fmla v24.4s, v12.4s, v2.s[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "fmla v21.4s, v13.4s, v1.s[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "fmla v25.4s, v13.4s, v2.s[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v22.4s, v14.4s, v1.s[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "fmla v26.4s, v14.4s, v2.s[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v23.4s, v15.4s, v1.s[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "fmla v27.4s, v15.4s, v2.s[1]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "fmla v24.4s, v8.4s, v2.s[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "fmla v25.4s, v9.4s, v2.s[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v26.4s, v10.4s, v2.s[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v27.4s, v11.4s, v2.s[2]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v20.4s, v12.4s, v1.s[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        "fmla v24.4s, v12.4s, v2.s[3]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        "fmla v21.4s, v13.4s, v1.s[3]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v25.4s, v13.4s, v2.s[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        "fmla v22.4s, v14.4s, v1.s[3]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v26.4s, v14.4s, v2.s[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "fmla v23.4s, v15.4s, v1.s[3]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v27.4s, v15.4s, v2.s[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "fmla v20.4s, v8.4s, v5.s[0]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v24.4s, v8.4s, v6.s[0]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v21.4s, v9.4s, v5.s[0]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v25.4s, v9.4s, v6.s[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "fmla v22.4s, v10.4s, v5.s[0]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v26.4s, v10.4s, v6.s[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "fmla v23.4s, v11.4s, v5.s[0]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "fmla v27.4s, v11.4s, v6.s[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "fmla v16.4s, v12.4s, v4.s[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "fmla v20.4s, v12.4s, v5.s[1]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v24.4s, v12.4s, v6.s[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "fmla v17.4s, v13.4s, v4.s[1]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "fmla v21.4s, v13.4s, v5.s[1]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v25.4s, v13.4s, v6.s[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "fmla v18.4s, v14.4s, v4.s[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "fmla v22.4s, v14.4s, v5.s[1]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v26.4s, v14.4s, v6.s[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "fmla v19.4s, v15.4s, v4.s[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "fmla v23.4s, v15.4s, v5.s[1]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v27.4s, v15.4s, v6.s[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "fmla v20.4s, v8.4s, v5.s[2]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v24.4s, v8.4s, v6.s[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "fmla v21.4s, v9.4s, v5.s[2]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v25.4s, v9.4s, v6.s[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v22.4s, v10.4s, v5.s[2]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "fmla v26.4s, v10.4s, v6.s[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v23.4s, v11.4s, v5.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "fmla v27.4s, v11.4s, v6.s[2]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "fmla v16.4s, v12.4s, v4.s[3]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "fmla v20.4s, v12.4s, v5.s[3]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "fmla v24.4s, v12.4s, v6.s[3]\n"
+                        "fmla v17.4s, v13.4s, v4.s[3]\n"
+                        "fmla v21.4s, v13.4s, v5.s[3]\n"
+                        "fmla v25.4s, v13.4s, v6.s[3]\n"
+                        "fmla v18.4s, v14.4s, v4.s[3]\n"
+                        "fmla v22.4s, v14.4s, v5.s[3]\n"
+                        "fmla v26.4s, v14.4s, v6.s[3]\n"
+                        "fmla v19.4s, v15.4s, v4.s[3]\n"
+                        "fmla v23.4s, v15.4s, v5.s[3]\n"
+                        "fmla v27.4s, v15.4s, v6.s[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v20.4s, v12.4s, v1.s[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "fmla v24.4s, v12.4s, v2.s[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v21.4s, v13.4s, v1.s[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "fmla v25.4s, v13.4s, v2.s[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v22.4s, v14.4s, v1.s[1]\n"
+                        "fmla v26.4s, v14.4s, v2.s[1]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "fmla v23.4s, v15.4s, v1.s[1]\n"
+                        "fmla v27.4s, v15.4s, v2.s[1]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "fmla v24.4s, v8.4s, v2.s[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "fmla v25.4s, v9.4s, v2.s[2]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "fmla v26.4s, v10.4s, v2.s[2]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "fmla v27.4s, v11.4s, v2.s[2]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "fmla v20.4s, v12.4s, v1.s[3]\n"
+                        "fmla v24.4s, v12.4s, v2.s[3]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "fmla v21.4s, v13.4s, v1.s[3]\n"
+                        "fmla v25.4s, v13.4s, v2.s[3]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "fmla v22.4s, v14.4s, v1.s[3]\n"
+                        "fmla v26.4s, v14.4s, v2.s[3]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "fmla v23.4s, v15.4s, v1.s[3]\n"
+                        "fmla v27.4s, v15.4s, v2.s[3]\n"
+                        "5:\n"
+                        "cbz %[blocks], 6f\n"
+                        "7:\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "ldr s0, [%[a_ptr0]]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr s1, [a_ptr1]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x4\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr s2, [a_ptr2]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "add a_ptr2, a_ptr2, #0x4\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "b.ne 7b\n"
+                        "6:\n"
+                        "str q16, [%[c_ptr0]]\n"
+                        "str q17, [%[c_ptr0], #0x10]\n"
+                        "str q18, [%[c_ptr0], #0x20]\n"
+                        "str q19, [%[c_ptr0], #0x30]\n"
+                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                        "str q20, [c_ptr1]\n"
+                        "str q21, [c_ptr1, #0x10]\n"
+                        "str q22, [c_ptr1, #0x20]\n"
+                        "str q23, [c_ptr1, #0x30]\n"
+                        "str q24, [c_ptr2]\n"
+                        "str q25, [c_ptr2, #0x10]\n"
+                        "str q26, [c_ptr2, #0x20]\n"
+                        "str q27, [c_ptr2, #0x30]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        ".unreq temploadreg0\n"
+                        ".unreq temploadreg1\n"
+                        ".unreq temploadreg2\n"
+                        ".unreq temploadreg3\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
+                    );
+                    break;
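+                // Default four-row variant: A rows in v0-v3 (next K block in v4-v7);
+                // the accumulators use the full v16-v31 register bank.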
+                default:
+                case 4:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "a_ptr3 .req X2\n"
+                        "c_ptr1 .req X3\n"
+                        "c_ptr2 .req X4\n"
+                        "c_ptr3 .req X5\n"
+                        "temploadreg0 .req X6\n"
+                        "temploadreg1 .req X7\n"
+                        "temploadreg2 .req X8\n"
+                        "temploadreg3 .req X9\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "add a_ptr3, a_ptr2, %[lda]\n"
+                        "add c_ptr3, c_ptr2, %[ldc]\n"
+                        "cbz %[beta0], 1f\n"
+                        "movi v16.4s, #0\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "movi v17.4s, #0\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "movi v18.4s, #0\n"
+                        "ldr q2, [a_ptr2]\n"
+                        "movi v19.4s, #0\n"
+                        "ldr q3, [a_ptr3]\n"
+                        "movi v20.4s, #0\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "movi v21.4s, #0\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "movi v22.4s, #0\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "movi v23.4s, #0\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "movi v24.4s, #0\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "movi v25.4s, #0\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "movi v26.4s, #0\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "movi v27.4s, #0\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "movi v28.4s, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "movi v29.4s, #0\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "movi v30.4s, #0\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "movi v31.4s, #0\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1r {v15.4s}, [%[betaptr]]\n"
+                        "ldr q16, [%[c_ptr0]]\n"
+                        "ldr q17, [%[c_ptr0], #0x10]\n"
+                        "ldr q18, [%[c_ptr0], #0x20]\n"
+                        "ldr q19, [%[c_ptr0], #0x30]\n"
+                        "fmul v16.4s, v16.4s, v15.4s\n"
+                        "ldr q20, [c_ptr1]\n"
+                        "fmul v17.4s, v17.4s, v15.4s\n"
+                        "ldr q21, [c_ptr1, #0x10]\n"
+                        "fmul v18.4s, v18.4s, v15.4s\n"
+                        "ldr q22, [c_ptr1, #0x20]\n"
+                        "fmul v19.4s, v19.4s, v15.4s\n"
+                        "ldr q23, [c_ptr1, #0x30]\n"
+                        "fmul v20.4s, v20.4s, v15.4s\n"
+                        "ldr q24, [c_ptr2]\n"
+                        "fmul v21.4s, v21.4s, v15.4s\n"
+                        "ldr q25, [c_ptr2, #0x10]\n"
+                        "fmul v22.4s, v22.4s, v15.4s\n"
+                        "ldr q26, [c_ptr2, #0x20]\n"
+                        "fmul v23.4s, v23.4s, v15.4s\n"
+                        "ldr q27, [c_ptr2, #0x30]\n"
+                        "fmul v24.4s, v24.4s, v15.4s\n"
+                        "ldr q28, [c_ptr3]\n"
+                        "fmul v25.4s, v25.4s, v15.4s\n"
+                        "ldr q29, [c_ptr3, #0x10]\n"
+                        "fmul v26.4s, v26.4s, v15.4s\n"
+                        "ldr q30, [c_ptr3, #0x20]\n"
+                        "fmul v27.4s, v27.4s, v15.4s\n"
+                        "ldr q31, [c_ptr3, #0x30]\n"
+                        "fmul v28.4s, v28.4s, v15.4s\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "fmul v29.4s, v29.4s, v15.4s\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "fmul v30.4s, v30.4s, v15.4s\n"
+                        "ldr q2, [a_ptr2]\n"
+                        "fmul v31.4s, v31.4s, v15.4s\n"
+                        "ldr q3, [a_ptr3]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        "fmla v28.4s, v8.4s, v3.s[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr d5, [a_ptr1]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "ldr d6, [a_ptr2]\n"
+                        "fmla v29.4s, v9.4s, v3.s[0]\n"
+                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr d7, [a_ptr3]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "ldr temploadreg3, [a_ptr3, #0x8]\n"
+                        "fmla v30.4s, v10.4s, v3.s[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "fmla v31.4s, v11.4s, v3.s[0]\n"
+                        "ins v5.d[1], temploadreg1\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "fmla v20.4s, v12.4s, v1.s[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "fmla v24.4s, v12.4s, v2.s[1]\n"
+                        "ins v6.d[1], temploadreg2\n"
+                        "fmla v28.4s, v12.4s, v3.s[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "fmla v21.4s, v13.4s, v1.s[1]\n"
+                        "ins v7.d[1], temploadreg3\n"
+                        "fmla v25.4s, v13.4s, v2.s[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "fmla v29.4s, v13.4s, v3.s[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v22.4s, v14.4s, v1.s[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "fmla v26.4s, v14.4s, v2.s[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "fmla v30.4s, v14.4s, v3.s[1]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "fmla v23.4s, v15.4s, v1.s[1]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "fmla v27.4s, v15.4s, v2.s[1]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v31.4s, v15.4s, v3.s[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v24.4s, v8.4s, v2.s[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "fmla v28.4s, v8.4s, v3.s[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v25.4s, v9.4s, v2.s[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v29.4s, v9.4s, v3.s[2]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v26.4s, v10.4s, v2.s[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v30.4s, v10.4s, v3.s[2]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        "fmla v27.4s, v11.4s, v2.s[2]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v31.4s, v11.4s, v3.s[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v20.4s, v12.4s, v1.s[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        "fmla v24.4s, v12.4s, v2.s[3]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v28.4s, v12.4s, v3.s[3]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        "fmla v21.4s, v13.4s, v1.s[3]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v25.4s, v13.4s, v2.s[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        "fmla v29.4s, v13.4s, v3.s[3]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v22.4s, v14.4s, v1.s[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        "fmla v26.4s, v14.4s, v2.s[3]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v30.4s, v14.4s, v3.s[3]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "fmla v23.4s, v15.4s, v1.s[3]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v27.4s, v15.4s, v2.s[3]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "fmla v31.4s, v15.4s, v3.s[3]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v20.4s, v8.4s, v5.s[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla v24.4s, v8.4s, v6.s[0]\n"
+                        "ldr d0, [%[a_ptr0], #-0x10]\n"
+                        "fmla v28.4s, v8.4s, v7.s[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v21.4s, v9.4s, v5.s[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "fmla v25.4s, v9.4s, v6.s[0]\n"
+                        "ins v0.d[1], temploadreg0\n"
+                        "fmla v29.4s, v9.4s, v7.s[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "fmla v22.4s, v10.4s, v5.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "fmla v26.4s, v10.4s, v6.s[0]\n"
+                        "ldr d1, [a_ptr1, #-0x10]\n"
+                        "fmla v30.4s, v10.4s, v7.s[0]\n"
+                        "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "fmla v23.4s, v11.4s, v5.s[0]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v27.4s, v11.4s, v6.s[0]\n"
+                        "ins v1.d[1], temploadreg1\n"
+                        "fmla v31.4s, v11.4s, v7.s[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "fmla v16.4s, v12.4s, v4.s[1]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "fmla v20.4s, v12.4s, v5.s[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "fmla v24.4s, v12.4s, v6.s[1]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v28.4s, v12.4s, v7.s[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v4.s[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "fmla v21.4s, v13.4s, v5.s[1]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "fmla v25.4s, v13.4s, v6.s[1]\n"
+                        "ldr d2, [a_ptr2, #-0x10]\n"
+                        "fmla v29.4s, v13.4s, v7.s[1]\n"
+                        "ldr temploadreg2, [a_ptr2, #-0x8]\n"
+                        "fmla v18.4s, v14.4s, v4.s[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "fmla v22.4s, v14.4s, v5.s[1]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v26.4s, v14.4s, v6.s[1]\n"
+                        "ins v2.d[1], temploadreg2\n"
+                        "fmla v30.4s, v14.4s, v7.s[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "fmla v19.4s, v15.4s, v4.s[1]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "fmla v23.4s, v15.4s, v5.s[1]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "fmla v27.4s, v15.4s, v6.s[1]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v31.4s, v15.4s, v7.s[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "fmla v20.4s, v8.4s, v5.s[2]\n"
+                        "add a_ptr3, a_ptr3, #0x20\n"
+                        "fmla v24.4s, v8.4s, v6.s[2]\n"
+                        "ldr d3, [a_ptr3, #-0x10]\n"
+                        "fmla v28.4s, v8.4s, v7.s[2]\n"
+                        "ldr temploadreg3, [a_ptr3, #-0x8]\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v21.4s, v9.4s, v5.s[2]\n"
+                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                        "fmla v25.4s, v9.4s, v6.s[2]\n"
+                        "ins v3.d[1], temploadreg3\n"
+                        "fmla v29.4s, v9.4s, v7.s[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+                        "fmla v22.4s, v10.4s, v5.s[2]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v26.4s, v10.4s, v6.s[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "fmla v30.4s, v10.4s, v7.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v23.4s, v11.4s, v5.s[2]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        "fmla v27.4s, v11.4s, v6.s[2]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v31.4s, v11.4s, v7.s[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        "fmla v16.4s, v12.4s, v4.s[3]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v20.4s, v12.4s, v5.s[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        "fmla v24.4s, v12.4s, v6.s[3]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v28.4s, v12.4s, v7.s[3]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v17.4s, v13.4s, v4.s[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        "fmla v21.4s, v13.4s, v5.s[3]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        "fmla v25.4s, v13.4s, v6.s[3]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v29.4s, v13.4s, v7.s[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        "fmla v18.4s, v14.4s, v4.s[3]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "fmla v22.4s, v14.4s, v5.s[3]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v26.4s, v14.4s, v6.s[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        "fmla v30.4s, v14.4s, v7.s[3]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v15.4s, v4.s[3]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v23.4s, v15.4s, v5.s[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        "fmla v27.4s, v15.4s, v6.s[3]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v31.4s, v15.4s, v7.s[3]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "b.ne 3b\n"
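+                        // Fall-through from the unrolled main loop: "2:" prefetches the
+                        // four output rows for the store and branches to the 4-wide K
+                        // tail at "4:" when no 8-wide remainder ("regs") is left.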
+                        "2:\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "prfm PSTL1KEEP, [c_ptr1]\n"
+                        "prfm PSTL1KEEP, [c_ptr2]\n"
+                        "prfm PSTL1KEEP, [c_ptr3]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "cbz %[regs], 4f\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "ldr d5, [a_ptr1]\n"
+                        "fmla v28.4s, v8.4s, v3.s[0]\n"
+                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr d6, [a_ptr2]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "ldr d7, [a_ptr3]\n"
+                        "fmla v29.4s, v9.4s, v3.s[0]\n"
+                        "ldr temploadreg3, [a_ptr3, #0x8]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "fmla v30.4s, v10.4s, v3.s[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "ins v5.d[1], temploadreg1\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "fmla v31.4s, v11.4s, v3.s[0]\n"
+                        "ins v6.d[1], temploadreg2\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "fmla v20.4s, v12.4s, v1.s[1]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "fmla v24.4s, v12.4s, v2.s[1]\n"
+                        "ins v7.d[1], temploadreg3\n"
+                        "fmla v28.4s, v12.4s, v3.s[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "fmla v21.4s, v13.4s, v1.s[1]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v25.4s, v13.4s, v2.s[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "fmla v29.4s, v13.4s, v3.s[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v22.4s, v14.4s, v1.s[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "fmla v26.4s, v14.4s, v2.s[1]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v30.4s, v14.4s, v3.s[1]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "fmla v23.4s, v15.4s, v1.s[1]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v27.4s, v15.4s, v2.s[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "fmla v31.4s, v15.4s, v3.s[1]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "fmla v24.4s, v8.4s, v2.s[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v28.4s, v8.4s, v3.s[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v25.4s, v9.4s, v2.s[2]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        "fmla v29.4s, v9.4s, v3.s[2]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        "fmla v26.4s, v10.4s, v2.s[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        "fmla v30.4s, v10.4s, v3.s[2]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        "fmla v27.4s, v11.4s, v2.s[2]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v31.4s, v11.4s, v3.s[2]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        "fmla v20.4s, v12.4s, v1.s[3]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v24.4s, v12.4s, v2.s[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        "fmla v28.4s, v12.4s, v3.s[3]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v21.4s, v13.4s, v1.s[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "fmla v25.4s, v13.4s, v2.s[3]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v29.4s, v13.4s, v3.s[3]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "fmla v22.4s, v14.4s, v1.s[3]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "fmla v26.4s, v14.4s, v2.s[3]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "fmla v30.4s, v14.4s, v3.s[3]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "fmla v23.4s, v15.4s, v1.s[3]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "fmla v27.4s, v15.4s, v2.s[3]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v31.4s, v15.4s, v3.s[3]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "fmla v20.4s, v8.4s, v5.s[0]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "fmla v24.4s, v8.4s, v6.s[0]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v28.4s, v8.4s, v7.s[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "fmla v21.4s, v9.4s, v5.s[0]\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "fmla v25.4s, v9.4s, v6.s[0]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v29.4s, v9.4s, v7.s[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "fmla v22.4s, v10.4s, v5.s[0]\n"
+                        "fmla v26.4s, v10.4s, v6.s[0]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v30.4s, v10.4s, v7.s[0]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "fmla v23.4s, v11.4s, v5.s[0]\n"
+                        "fmla v27.4s, v11.4s, v6.s[0]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v31.4s, v11.4s, v7.s[0]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v4.s[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "fmla v20.4s, v12.4s, v5.s[1]\n"
+                        "fmla v24.4s, v12.4s, v6.s[1]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v28.4s, v12.4s, v7.s[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v4.s[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "fmla v21.4s, v13.4s, v5.s[1]\n"
+                        "fmla v25.4s, v13.4s, v6.s[1]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v29.4s, v13.4s, v7.s[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v4.s[1]\n"
+                        "fmla v22.4s, v14.4s, v5.s[1]\n"
+                        "fmla v26.4s, v14.4s, v6.s[1]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "fmla v30.4s, v14.4s, v7.s[1]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v4.s[1]\n"
+                        "fmla v23.4s, v15.4s, v5.s[1]\n"
+                        "fmla v27.4s, v15.4s, v6.s[1]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v31.4s, v15.4s, v7.s[1]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "fmla v20.4s, v8.4s, v5.s[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v24.4s, v8.4s, v6.s[2]\n"
+                        "fmla v28.4s, v8.4s, v7.s[2]\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "fmla v21.4s, v9.4s, v5.s[2]\n"
+                        "fmla v25.4s, v9.4s, v6.s[2]\n"
+                        "fmla v29.4s, v9.4s, v7.s[2]\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "fmla v22.4s, v10.4s, v5.s[2]\n"
+                        "fmla v26.4s, v10.4s, v6.s[2]\n"
+                        "fmla v30.4s, v10.4s, v7.s[2]\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "fmla v23.4s, v11.4s, v5.s[2]\n"
+                        "fmla v27.4s, v11.4s, v6.s[2]\n"
+                        "fmla v31.4s, v11.4s, v7.s[2]\n"
+                        "fmla v16.4s, v12.4s, v4.s[3]\n"
+                        "fmla v20.4s, v12.4s, v5.s[3]\n"
+                        "fmla v24.4s, v12.4s, v6.s[3]\n"
+                        "fmla v28.4s, v12.4s, v7.s[3]\n"
+                        "fmla v17.4s, v13.4s, v4.s[3]\n"
+                        "fmla v21.4s, v13.4s, v5.s[3]\n"
+                        "fmla v25.4s, v13.4s, v6.s[3]\n"
+                        "fmla v29.4s, v13.4s, v7.s[3]\n"
+                        "fmla v18.4s, v14.4s, v4.s[3]\n"
+                        "fmla v22.4s, v14.4s, v5.s[3]\n"
+                        "fmla v26.4s, v14.4s, v6.s[3]\n"
+                        "fmla v30.4s, v14.4s, v7.s[3]\n"
+                        "fmla v19.4s, v15.4s, v4.s[3]\n"
+                        "fmla v23.4s, v15.4s, v5.s[3]\n"
+                        "fmla v27.4s, v15.4s, v6.s[3]\n"
+                        "fmla v31.4s, v15.4s, v7.s[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "fmla v28.4s, v8.4s, v3.s[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v29.4s, v9.4s, v3.s[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v30.4s, v10.4s, v3.s[0]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v31.4s, v11.4s, v3.s[0]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "fmla v20.4s, v12.4s, v1.s[1]\n"
+                        "fmla v24.4s, v12.4s, v2.s[1]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v28.4s, v12.4s, v3.s[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "fmla v21.4s, v13.4s, v1.s[1]\n"
+                        "fmla v25.4s, v13.4s, v2.s[1]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v29.4s, v13.4s, v3.s[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "fmla v22.4s, v14.4s, v1.s[1]\n"
+                        "fmla v26.4s, v14.4s, v2.s[1]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "fmla v30.4s, v14.4s, v3.s[1]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "fmla v23.4s, v15.4s, v1.s[1]\n"
+                        "fmla v27.4s, v15.4s, v2.s[1]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v31.4s, v15.4s, v3.s[1]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v24.4s, v8.4s, v2.s[2]\n"
+                        "fmla v28.4s, v8.4s, v3.s[2]\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "fmla v25.4s, v9.4s, v2.s[2]\n"
+                        "fmla v29.4s, v9.4s, v3.s[2]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "fmla v26.4s, v10.4s, v2.s[2]\n"
+                        "fmla v30.4s, v10.4s, v3.s[2]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "fmla v27.4s, v11.4s, v2.s[2]\n"
+                        "fmla v31.4s, v11.4s, v3.s[2]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "fmla v20.4s, v12.4s, v1.s[3]\n"
+                        "fmla v24.4s, v12.4s, v2.s[3]\n"
+                        "fmla v28.4s, v12.4s, v3.s[3]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "fmla v21.4s, v13.4s, v1.s[3]\n"
+                        "fmla v25.4s, v13.4s, v2.s[3]\n"
+                        "fmla v29.4s, v13.4s, v3.s[3]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "fmla v22.4s, v14.4s, v1.s[3]\n"
+                        "fmla v26.4s, v14.4s, v2.s[3]\n"
+                        "fmla v30.4s, v14.4s, v3.s[3]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "fmla v23.4s, v15.4s, v1.s[3]\n"
+                        "fmla v27.4s, v15.4s, v2.s[3]\n"
+                        "fmla v31.4s, v15.4s, v3.s[3]\n"
+                        "5:\n"
+                        "cbz %[blocks], 6f\n"
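+                        // "7:": leftover-K loop; each pass consumes one element from every
+                        // A row and one K step of the 16-column B panel (q8-q11).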
+                        "7:\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "ldr s0, [%[a_ptr0]]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr s1, [a_ptr1]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x4\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr s2, [a_ptr2]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "add a_ptr2, a_ptr2, #0x4\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "ldr s3, [a_ptr3]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "add a_ptr3, a_ptr3, #0x4\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "fmla v28.4s, v8.4s, v3.s[0]\n"
+                        "fmla v29.4s, v9.4s, v3.s[0]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "fmla v30.4s, v10.4s, v3.s[0]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "fmla v31.4s, v11.4s, v3.s[0]\n"
+                        "b.ne 7b\n"
+                        "6:\n"
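+                        // "6:": store the finished accumulator tile (v16-v31, four rows
+                        // of sixteen floats) to the four output row pointers.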
+                        "str q16, [%[c_ptr0]]\n"
+                        "str q17, [%[c_ptr0], #0x10]\n"
+                        "str q18, [%[c_ptr0], #0x20]\n"
+                        "str q19, [%[c_ptr0], #0x30]\n"
+                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                        "str q20, [c_ptr1]\n"
+                        "str q21, [c_ptr1, #0x10]\n"
+                        "str q22, [c_ptr1, #0x20]\n"
+                        "str q23, [c_ptr1, #0x30]\n"
+                        "str q24, [c_ptr2]\n"
+                        "str q25, [c_ptr2, #0x10]\n"
+                        "str q26, [c_ptr2, #0x20]\n"
+                        "str q27, [c_ptr2, #0x30]\n"
+                        "str q28, [c_ptr3]\n"
+                        "str q29, [c_ptr3, #0x10]\n"
+                        "str q30, [c_ptr3, #0x20]\n"
+                        "str q31, [c_ptr3, #0x30]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq a_ptr3\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        ".unreq c_ptr3\n"
+                        ".unreq temploadreg0\n"
+                        ".unreq temploadreg1\n"
+                        ".unreq temploadreg2\n"
+                        ".unreq temploadreg3\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
+                    );
+                    break;
+            }
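+            // For partial tiles the asm accumulated into result_buffer (fixed
+            // 16-column stride); copy the valid width x height region back into C.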
+            if (use_result_buffer) {
+                for(int cy=0; cy<std::min(M-y, 4); cy++) {
+                    for(unsigned int cx=0; cx<width; cx++) {
+                        c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
+                    }
+                }
+            }
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp
new file mode 100644
index 0000000..504769b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp
@@ -0,0 +1,1726 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <algorithm>
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
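+// Hybrid FP32 kernel: computes C[y..y+3][x0..x0+15] = beta * C + A * B one block
+// at a time, with B pre-arranged into contiguous 16-column panels (hence
+// b_ptr0 = B + (K_stride * x0) below). The up-to-4x16 output tile lives entirely
+// in the NEON accumulators v16-v31 while A and B stream through v0-v15.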
+void a64_hybrid_fp32_mla_16x4(const float *A, int lda, const float *B, float *C, int ldc, float beta, int M, int N, int K) {
+    const long beta0 = (beta == 0.0f);
+    const int K_stride = K;
+    const long loops_count = ((K + 4) / 8) - 1;
+    K -= loops_count * 8;
+    const long regs_count = (K / 4) - 1;
+    K -= (regs_count + 1) * 4;
+    const long blocks_count = K / 1;
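+    // K is consumed as: loops_count x 8 elements in the unrolled main loop, then
+    // (regs_count + 1) x 4 in the tail, then blocks_count single elements
+    // (assumes K >= 4). E.g. K = 21 gives loops_count = 2, regs_count = 0,
+    // blocks_count = 1: 16 + 4 + 1 = 21.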
+
+    for (int y=0; y<M; y+=4) {
+        const float * const a_ptr0_base = A + (y * lda);
+        const unsigned long ldab = lda * sizeof(float);
+
+        float *c_ptr0 = C + (y * ldc);
+
+        for (int x0=0; x0<N; x0+=16ul) {
+            const long width = std::min((unsigned long)N-x0, 16ul);
+            const float *betaptr = &beta;
+            long loops = loops_count;
+            long regs = regs_count;
+            long blocks = blocks_count;
+            const float *a_ptr0 = a_ptr0_base;
+            const float *b_ptr0 = B + (K_stride * x0);
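+            // Right-edge handling: with fewer than 16 columns left, accumulate into
+            // a local 4x16 staging buffer (ldcb then uses a fixed 16-float stride)
+            // and copy the valid region back to C after the asm block.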
+            const bool use_result_buffer = (width < 16);
+            float result_buffer[64];
+            const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(float);
+            float *c_ptr_real = c_ptr0;
+            if (use_result_buffer && !beta0) {
+                for(int cy=0; cy<std::min(M-y, 4); cy++) {
+                    for(unsigned int cx=0; cx<width; cx++) {
+                        result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
+                    }
+                }
+            }
+            if (use_result_buffer) {
+                c_ptr0 = result_buffer;
+            }
+
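+            // Dispatch on the number of rows remaining in this block; each case is
+            // a fully unrolled variant of the same kernel for that row count.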
+            switch(M-y) {
+                case 1:
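+                    // Shared asm label scheme: 1 = load-and-scale C when beta != 0,
+                    // 3 = unrolled main K loop, 2 = main-loop exit, 4 = 4-wide K tail,
+                    // 5 = tail join, 7 = single-element K loop, 6 = final store.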
+                    __asm __volatile (
+                        "cbz %[beta0], 1f\n"
+                        "movi v16.4s, #0\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "movi v17.4s, #0\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "movi v18.4s, #0\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "movi v19.4s, #0\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1r {v15.4s}, [%[betaptr]]\n"
+                        "ldr q16, [%[c_ptr0]]\n"
+                        "ldr q17, [%[c_ptr0], #0x10]\n"
+                        "ldr q18, [%[c_ptr0], #0x20]\n"
+                        "ldr q19, [%[c_ptr0], #0x30]\n"
+                        "fmul v16.4s, v16.4s, v15.4s\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "fmul v17.4s, v17.4s, v15.4s\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmul v18.4s, v18.4s, v15.4s\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmul v19.4s, v19.4s, v15.4s\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v16.4s, v12.4s, v4.s[1]\n"
+                        "ldr q0, [%[a_ptr0], #-0x10]\n"
+                        "fmla v17.4s, v13.4s, v4.s[1]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v18.4s, v14.4s, v4.s[1]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v19.4s, v15.4s, v4.s[1]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v12.4s, v4.s[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v17.4s, v13.4s, v4.s[3]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v18.4s, v14.4s, v4.s[3]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v19.4s, v15.4s, v4.s[3]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                        "cbz %[regs], 4f\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v16.4s, v12.4s, v4.s[1]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v17.4s, v13.4s, v4.s[1]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v18.4s, v14.4s, v4.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v19.4s, v15.4s, v4.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "fmla v16.4s, v12.4s, v4.s[3]\n"
+                        "fmla v17.4s, v13.4s, v4.s[3]\n"
+                        "fmla v18.4s, v14.4s, v4.s[3]\n"
+                        "fmla v19.4s, v15.4s, v4.s[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "5:\n"
+                        "cbz %[blocks], 6f\n"
+                        "7:\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "ldr s0, [%[a_ptr0]]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "b.ne 7b\n"
+                        "6:\n"
+                        "str q16, [%[c_ptr0]]\n"
+                        "str q17, [%[c_ptr0], #0x10]\n"
+                        "str q18, [%[c_ptr0], #0x20]\n"
+                        "str q19, [%[c_ptr0], #0x30]\n"
+                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
+                    );
+                    break;
+                case 2:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "c_ptr1 .req X1\n"
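+                        // .req aliases give the second row's A and C pointers symbolic
+                        // names bound to scratch registers X0/X1.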
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "cbz %[beta0], 1f\n"
+                        "movi v16.4s, #0\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "movi v17.4s, #0\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "movi v18.4s, #0\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "movi v19.4s, #0\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "movi v20.4s, #0\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "movi v21.4s, #0\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "movi v22.4s, #0\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "movi v23.4s, #0\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1r {v15.4s}, [%[betaptr]]\n"
+                        "ldr q16, [%[c_ptr0]]\n"
+                        "ldr q17, [%[c_ptr0], #0x10]\n"
+                        "ldr q18, [%[c_ptr0], #0x20]\n"
+                        "ldr q19, [%[c_ptr0], #0x30]\n"
+                        "fmul v16.4s, v16.4s, v15.4s\n"
+                        "ldr q20, [c_ptr1]\n"
+                        "fmul v17.4s, v17.4s, v15.4s\n"
+                        "ldr q21, [c_ptr1, #0x10]\n"
+                        "fmul v18.4s, v18.4s, v15.4s\n"
+                        "ldr q22, [c_ptr1, #0x20]\n"
+                        "fmul v19.4s, v19.4s, v15.4s\n"
+                        "ldr q23, [c_ptr1, #0x30]\n"
+                        "fmul v20.4s, v20.4s, v15.4s\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "fmul v21.4s, v21.4s, v15.4s\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "fmul v22.4s, v22.4s, v15.4s\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmul v23.4s, v23.4s, v15.4s\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr q5, [a_ptr1]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                        "fmla v20.4s, v12.4s, v1.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla v21.4s, v13.4s, v1.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "fmla v22.4s, v14.4s, v1.s[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                        "fmla v23.4s, v15.4s, v1.s[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "fmla v20.4s, v12.4s, v1.s[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "fmla v21.4s, v13.4s, v1.s[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "fmla v22.4s, v14.4s, v1.s[3]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "ldr q0, [%[a_ptr0], #-0x10]\n"
+                        "fmla v23.4s, v15.4s, v1.s[3]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "ldr q1, [a_ptr1, #-0x10]\n"
+                        "fmla v20.4s, v8.4s, v5.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "fmla v21.4s, v9.4s, v5.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "fmla v22.4s, v10.4s, v5.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "fmla v23.4s, v11.4s, v5.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v4.s[1]\n"
+                        "fmla v20.4s, v12.4s, v5.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v4.s[1]\n"
+                        "fmla v21.4s, v13.4s, v5.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v4.s[1]\n"
+                        "fmla v22.4s, v14.4s, v5.s[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v4.s[1]\n"
+                        "fmla v23.4s, v15.4s, v5.s[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v20.4s, v8.4s, v5.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "fmla v21.4s, v9.4s, v5.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "fmla v22.4s, v10.4s, v5.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "fmla v23.4s, v11.4s, v5.s[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v16.4s, v12.4s, v4.s[3]\n"
+                        "fmla v20.4s, v12.4s, v5.s[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        "fmla v17.4s, v13.4s, v4.s[3]\n"
+                        "fmla v21.4s, v13.4s, v5.s[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        "fmla v18.4s, v14.4s, v4.s[3]\n"
+                        "fmla v22.4s, v14.4s, v5.s[3]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v15.4s, v4.s[3]\n"
+                        "fmla v23.4s, v15.4s, v5.s[3]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                        "prfm PSTL1KEEP, [c_ptr1]\n"
+                        "cbz %[regs], 4f\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr q5, [a_ptr1]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "fmla v20.4s, v12.4s, v1.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "fmla v21.4s, v13.4s, v1.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "fmla v22.4s, v14.4s, v1.s[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "fmla v23.4s, v15.4s, v1.s[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "fmla v20.4s, v12.4s, v1.s[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "fmla v21.4s, v13.4s, v1.s[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "fmla v22.4s, v14.4s, v1.s[3]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "fmla v23.4s, v15.4s, v1.s[3]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "fmla v20.4s, v8.4s, v5.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "fmla v21.4s, v9.4s, v5.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "fmla v22.4s, v10.4s, v5.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "fmla v23.4s, v11.4s, v5.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v4.s[1]\n"
+                        "fmla v20.4s, v12.4s, v5.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v4.s[1]\n"
+                        "fmla v21.4s, v13.4s, v5.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v4.s[1]\n"
+                        "fmla v22.4s, v14.4s, v5.s[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v4.s[1]\n"
+                        "fmla v23.4s, v15.4s, v5.s[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "fmla v20.4s, v8.4s, v5.s[2]\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "fmla v21.4s, v9.4s, v5.s[2]\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "fmla v22.4s, v10.4s, v5.s[2]\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "fmla v23.4s, v11.4s, v5.s[2]\n"
+                        "fmla v16.4s, v12.4s, v4.s[3]\n"
+                        "fmla v20.4s, v12.4s, v5.s[3]\n"
+                        "fmla v17.4s, v13.4s, v4.s[3]\n"
+                        "fmla v21.4s, v13.4s, v5.s[3]\n"
+                        "fmla v18.4s, v14.4s, v4.s[3]\n"
+                        "fmla v22.4s, v14.4s, v5.s[3]\n"
+                        "fmla v19.4s, v15.4s, v4.s[3]\n"
+                        "fmla v23.4s, v15.4s, v5.s[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "fmla v20.4s, v12.4s, v1.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "fmla v21.4s, v13.4s, v1.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "fmla v22.4s, v14.4s, v1.s[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "fmla v23.4s, v15.4s, v1.s[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "fmla v20.4s, v12.4s, v1.s[3]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "fmla v21.4s, v13.4s, v1.s[3]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "fmla v22.4s, v14.4s, v1.s[3]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "fmla v23.4s, v15.4s, v1.s[3]\n"
+                        "5:\n"
+                        "cbz %[blocks], 6f\n"
+                        "7:\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "ldr s0, [%[a_ptr0]]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr s1, [a_ptr1]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x4\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "b.ne 7b\n"
+                        "6:\n"
+                        "str q16, [%[c_ptr0]]\n"
+                        "str q17, [%[c_ptr0], #0x10]\n"
+                        "str q18, [%[c_ptr0], #0x20]\n"
+                        "str q19, [%[c_ptr0], #0x30]\n"
+                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                        "str q20, [c_ptr1]\n"
+                        "str q21, [c_ptr1, #0x10]\n"
+                        "str q22, [c_ptr1, #0x20]\n"
+                        "str q23, [c_ptr1, #0x30]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq c_ptr1\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
+                    );
+                    break;
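+                // Same structure specialised for three rows: v16-v27 accumulate, with a_ptr1/a_ptr2 and c_ptr1/c_ptr2 stepped by the strides.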
+                case 3:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "c_ptr1 .req X2\n"
+                        "c_ptr2 .req X3\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "cbz %[beta0], 1f\n"
+                        "movi v16.4s, #0\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "movi v17.4s, #0\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "movi v18.4s, #0\n"
+                        "ldr q2, [a_ptr2]\n"
+                        "movi v19.4s, #0\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "movi v20.4s, #0\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "movi v21.4s, #0\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "movi v22.4s, #0\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "movi v23.4s, #0\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "movi v24.4s, #0\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "movi v25.4s, #0\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "movi v26.4s, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "movi v27.4s, #0\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1r {v15.4s}, [%[betaptr]]\n"
+                        "ldr q16, [%[c_ptr0]]\n"
+                        "ldr q17, [%[c_ptr0], #0x10]\n"
+                        "ldr q18, [%[c_ptr0], #0x20]\n"
+                        "ldr q19, [%[c_ptr0], #0x30]\n"
+                        "fmul v16.4s, v16.4s, v15.4s\n"
+                        "ldr q20, [c_ptr1]\n"
+                        "fmul v17.4s, v17.4s, v15.4s\n"
+                        "ldr q21, [c_ptr1, #0x10]\n"
+                        "fmul v18.4s, v18.4s, v15.4s\n"
+                        "ldr q22, [c_ptr1, #0x20]\n"
+                        "fmul v19.4s, v19.4s, v15.4s\n"
+                        "ldr q23, [c_ptr1, #0x30]\n"
+                        "fmul v20.4s, v20.4s, v15.4s\n"
+                        "ldr q24, [c_ptr2]\n"
+                        "fmul v21.4s, v21.4s, v15.4s\n"
+                        "ldr q25, [c_ptr2, #0x10]\n"
+                        "fmul v22.4s, v22.4s, v15.4s\n"
+                        "ldr q26, [c_ptr2, #0x20]\n"
+                        "fmul v23.4s, v23.4s, v15.4s\n"
+                        "ldr q27, [c_ptr2, #0x30]\n"
+                        "fmul v24.4s, v24.4s, v15.4s\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "fmul v25.4s, v25.4s, v15.4s\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "fmul v26.4s, v26.4s, v15.4s\n"
+                        "ldr q2, [a_ptr2]\n"
+                        "fmul v27.4s, v27.4s, v15.4s\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "ldr q5, [a_ptr1]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr q6, [a_ptr2]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "fmla v20.4s, v12.4s, v1.s[1]\n"
+                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                        "fmla v24.4s, v12.4s, v2.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+                        "fmla v21.4s, v13.4s, v1.s[1]\n"
+                        "fmla v25.4s, v13.4s, v2.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "fmla v22.4s, v14.4s, v1.s[1]\n"
+                        "fmla v26.4s, v14.4s, v2.s[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "fmla v23.4s, v15.4s, v1.s[1]\n"
+                        "fmla v27.4s, v15.4s, v2.s[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "fmla v24.4s, v8.4s, v2.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "fmla v25.4s, v9.4s, v2.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "fmla v26.4s, v10.4s, v2.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "fmla v27.4s, v11.4s, v2.s[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "fmla v20.4s, v12.4s, v1.s[3]\n"
+                        "fmla v24.4s, v12.4s, v2.s[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "fmla v21.4s, v13.4s, v1.s[3]\n"
+                        "fmla v25.4s, v13.4s, v2.s[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "fmla v22.4s, v14.4s, v1.s[3]\n"
+                        "fmla v26.4s, v14.4s, v2.s[3]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "ldr q0, [%[a_ptr0], #-0x10]\n"
+                        "fmla v23.4s, v15.4s, v1.s[3]\n"
+                        "ldr q1, [a_ptr1, #-0x10]\n"
+                        "fmla v27.4s, v15.4s, v2.s[3]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "ldr q2, [a_ptr2, #-0x10]\n"
+                        "fmla v20.4s, v8.4s, v5.s[0]\n"
+                        "fmla v24.4s, v8.4s, v6.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "fmla v21.4s, v9.4s, v5.s[0]\n"
+                        "fmla v25.4s, v9.4s, v6.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "fmla v22.4s, v10.4s, v5.s[0]\n"
+                        "fmla v26.4s, v10.4s, v6.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "fmla v23.4s, v11.4s, v5.s[0]\n"
+                        "fmla v27.4s, v11.4s, v6.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v4.s[1]\n"
+                        "fmla v20.4s, v12.4s, v5.s[1]\n"
+                        "fmla v24.4s, v12.4s, v6.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v4.s[1]\n"
+                        "fmla v21.4s, v13.4s, v5.s[1]\n"
+                        "fmla v25.4s, v13.4s, v6.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v4.s[1]\n"
+                        "fmla v22.4s, v14.4s, v5.s[1]\n"
+                        "fmla v26.4s, v14.4s, v6.s[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v4.s[1]\n"
+                        "fmla v23.4s, v15.4s, v5.s[1]\n"
+                        "fmla v27.4s, v15.4s, v6.s[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v20.4s, v8.4s, v5.s[2]\n"
+                        "fmla v24.4s, v8.4s, v6.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "fmla v21.4s, v9.4s, v5.s[2]\n"
+                        "fmla v25.4s, v9.4s, v6.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "fmla v22.4s, v10.4s, v5.s[2]\n"
+                        "fmla v26.4s, v10.4s, v6.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "fmla v23.4s, v11.4s, v5.s[2]\n"
+                        "fmla v27.4s, v11.4s, v6.s[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v16.4s, v12.4s, v4.s[3]\n"
+                        "fmla v20.4s, v12.4s, v5.s[3]\n"
+                        "fmla v24.4s, v12.4s, v6.s[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        "fmla v17.4s, v13.4s, v4.s[3]\n"
+                        "fmla v21.4s, v13.4s, v5.s[3]\n"
+                        "fmla v25.4s, v13.4s, v6.s[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        "fmla v18.4s, v14.4s, v4.s[3]\n"
+                        "fmla v22.4s, v14.4s, v5.s[3]\n"
+                        "fmla v26.4s, v14.4s, v6.s[3]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v15.4s, v4.s[3]\n"
+                        "fmla v23.4s, v15.4s, v5.s[3]\n"
+                        "fmla v27.4s, v15.4s, v6.s[3]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                        "prfm PSTL1KEEP, [c_ptr1]\n"
+                        "prfm PSTL1KEEP, [c_ptr2]\n"
+                        "cbz %[regs], 4f\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr q5, [a_ptr1]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "ldr q6, [a_ptr2]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "fmla v20.4s, v12.4s, v1.s[1]\n"
+                        "fmla v24.4s, v12.4s, v2.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "fmla v21.4s, v13.4s, v1.s[1]\n"
+                        "fmla v25.4s, v13.4s, v2.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "fmla v22.4s, v14.4s, v1.s[1]\n"
+                        "fmla v26.4s, v14.4s, v2.s[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "fmla v23.4s, v15.4s, v1.s[1]\n"
+                        "fmla v27.4s, v15.4s, v2.s[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "fmla v24.4s, v8.4s, v2.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "fmla v25.4s, v9.4s, v2.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "fmla v26.4s, v10.4s, v2.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "fmla v27.4s, v11.4s, v2.s[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "fmla v20.4s, v12.4s, v1.s[3]\n"
+                        "fmla v24.4s, v12.4s, v2.s[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "fmla v21.4s, v13.4s, v1.s[3]\n"
+                        "fmla v25.4s, v13.4s, v2.s[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "fmla v22.4s, v14.4s, v1.s[3]\n"
+                        "fmla v26.4s, v14.4s, v2.s[3]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "fmla v23.4s, v15.4s, v1.s[3]\n"
+                        "fmla v27.4s, v15.4s, v2.s[3]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "fmla v20.4s, v8.4s, v5.s[0]\n"
+                        "fmla v24.4s, v8.4s, v6.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "fmla v21.4s, v9.4s, v5.s[0]\n"
+                        "fmla v25.4s, v9.4s, v6.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "fmla v22.4s, v10.4s, v5.s[0]\n"
+                        "fmla v26.4s, v10.4s, v6.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "fmla v23.4s, v11.4s, v5.s[0]\n"
+                        "fmla v27.4s, v11.4s, v6.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v4.s[1]\n"
+                        "fmla v20.4s, v12.4s, v5.s[1]\n"
+                        "fmla v24.4s, v12.4s, v6.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v4.s[1]\n"
+                        "fmla v21.4s, v13.4s, v5.s[1]\n"
+                        "fmla v25.4s, v13.4s, v6.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v4.s[1]\n"
+                        "fmla v22.4s, v14.4s, v5.s[1]\n"
+                        "fmla v26.4s, v14.4s, v6.s[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v4.s[1]\n"
+                        "fmla v23.4s, v15.4s, v5.s[1]\n"
+                        "fmla v27.4s, v15.4s, v6.s[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "fmla v20.4s, v8.4s, v5.s[2]\n"
+                        "fmla v24.4s, v8.4s, v6.s[2]\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "fmla v21.4s, v9.4s, v5.s[2]\n"
+                        "fmla v25.4s, v9.4s, v6.s[2]\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "fmla v22.4s, v10.4s, v5.s[2]\n"
+                        "fmla v26.4s, v10.4s, v6.s[2]\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "fmla v23.4s, v11.4s, v5.s[2]\n"
+                        "fmla v27.4s, v11.4s, v6.s[2]\n"
+                        "fmla v16.4s, v12.4s, v4.s[3]\n"
+                        "fmla v20.4s, v12.4s, v5.s[3]\n"
+                        "fmla v24.4s, v12.4s, v6.s[3]\n"
+                        "fmla v17.4s, v13.4s, v4.s[3]\n"
+                        "fmla v21.4s, v13.4s, v5.s[3]\n"
+                        "fmla v25.4s, v13.4s, v6.s[3]\n"
+                        "fmla v18.4s, v14.4s, v4.s[3]\n"
+                        "fmla v22.4s, v14.4s, v5.s[3]\n"
+                        "fmla v26.4s, v14.4s, v6.s[3]\n"
+                        "fmla v19.4s, v15.4s, v4.s[3]\n"
+                        "fmla v23.4s, v15.4s, v5.s[3]\n"
+                        "fmla v27.4s, v15.4s, v6.s[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "fmla v20.4s, v12.4s, v1.s[1]\n"
+                        "fmla v24.4s, v12.4s, v2.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "fmla v21.4s, v13.4s, v1.s[1]\n"
+                        "fmla v25.4s, v13.4s, v2.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "fmla v22.4s, v14.4s, v1.s[1]\n"
+                        "fmla v26.4s, v14.4s, v2.s[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "fmla v23.4s, v15.4s, v1.s[1]\n"
+                        "fmla v27.4s, v15.4s, v2.s[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "fmla v24.4s, v8.4s, v2.s[2]\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "fmla v25.4s, v9.4s, v2.s[2]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "fmla v26.4s, v10.4s, v2.s[2]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "fmla v27.4s, v11.4s, v2.s[2]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "fmla v20.4s, v12.4s, v1.s[3]\n"
+                        "fmla v24.4s, v12.4s, v2.s[3]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "fmla v21.4s, v13.4s, v1.s[3]\n"
+                        "fmla v25.4s, v13.4s, v2.s[3]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "fmla v22.4s, v14.4s, v1.s[3]\n"
+                        "fmla v26.4s, v14.4s, v2.s[3]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "fmla v23.4s, v15.4s, v1.s[3]\n"
+                        "fmla v27.4s, v15.4s, v2.s[3]\n"
+                        "5:\n"
+                        "cbz %[blocks], 6f\n"
+                        "7:\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "ldr s0, [%[a_ptr0]]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr s1, [a_ptr1]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x4\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr s2, [a_ptr2]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "add a_ptr2, a_ptr2, #0x4\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "b.ne 7b\n"
+                        "6:\n"
+                        "str q16, [%[c_ptr0]]\n"
+                        "str q17, [%[c_ptr0], #0x10]\n"
+                        "str q18, [%[c_ptr0], #0x20]\n"
+                        "str q19, [%[c_ptr0], #0x30]\n"
+                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                        "str q20, [c_ptr1]\n"
+                        "str q21, [c_ptr1, #0x10]\n"
+                        "str q22, [c_ptr1, #0x20]\n"
+                        "str q23, [c_ptr1, #0x30]\n"
+                        "str q24, [c_ptr2]\n"
+                        "str q25, [c_ptr2, #0x10]\n"
+                        "str q26, [c_ptr2, #0x20]\n"
+                        "str q27, [c_ptr2, #0x30]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
+                    );
+                    break;
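+                // Default: four rows per block, using the full accumulator budget (v16-v31 hold the 4x16 output tile).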
+                default:
+                case 4:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "a_ptr3 .req X2\n"
+                        "c_ptr1 .req X3\n"
+                        "c_ptr2 .req X4\n"
+                        "c_ptr3 .req X5\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "add a_ptr3, a_ptr2, %[lda]\n"
+                        "add c_ptr3, c_ptr2, %[ldc]\n"
+                        "cbz %[beta0], 1f\n"
+                        "movi v16.4s, #0\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "movi v17.4s, #0\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "movi v18.4s, #0\n"
+                        "ldr q2, [a_ptr2]\n"
+                        "movi v19.4s, #0\n"
+                        "ldr q3, [a_ptr3]\n"
+                        "movi v20.4s, #0\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "movi v21.4s, #0\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "movi v22.4s, #0\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "movi v23.4s, #0\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "movi v24.4s, #0\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "movi v25.4s, #0\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "movi v26.4s, #0\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "movi v27.4s, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "movi v28.4s, #0\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "movi v29.4s, #0\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "movi v30.4s, #0\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "movi v31.4s, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1r {v15.4s}, [%[betaptr]]\n"
+                        "ldr q16, [%[c_ptr0]]\n"
+                        "ldr q17, [%[c_ptr0], #0x10]\n"
+                        "ldr q18, [%[c_ptr0], #0x20]\n"
+                        "ldr q19, [%[c_ptr0], #0x30]\n"
+                        "fmul v16.4s, v16.4s, v15.4s\n"
+                        "ldr q20, [c_ptr1]\n"
+                        "fmul v17.4s, v17.4s, v15.4s\n"
+                        "ldr q21, [c_ptr1, #0x10]\n"
+                        "fmul v18.4s, v18.4s, v15.4s\n"
+                        "ldr q22, [c_ptr1, #0x20]\n"
+                        "fmul v19.4s, v19.4s, v15.4s\n"
+                        "ldr q23, [c_ptr1, #0x30]\n"
+                        "fmul v20.4s, v20.4s, v15.4s\n"
+                        "ldr q24, [c_ptr2]\n"
+                        "fmul v21.4s, v21.4s, v15.4s\n"
+                        "ldr q25, [c_ptr2, #0x10]\n"
+                        "fmul v22.4s, v22.4s, v15.4s\n"
+                        "ldr q26, [c_ptr2, #0x20]\n"
+                        "fmul v23.4s, v23.4s, v15.4s\n"
+                        "ldr q27, [c_ptr2, #0x30]\n"
+                        "fmul v24.4s, v24.4s, v15.4s\n"
+                        "ldr q28, [c_ptr3]\n"
+                        "fmul v25.4s, v25.4s, v15.4s\n"
+                        "ldr q29, [c_ptr3, #0x10]\n"
+                        "fmul v26.4s, v26.4s, v15.4s\n"
+                        "ldr q30, [c_ptr3, #0x20]\n"
+                        "fmul v27.4s, v27.4s, v15.4s\n"
+                        "ldr q31, [c_ptr3, #0x30]\n"
+                        "fmul v28.4s, v28.4s, v15.4s\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "fmul v29.4s, v29.4s, v15.4s\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "fmul v30.4s, v30.4s, v15.4s\n"
+                        "ldr q2, [a_ptr2]\n"
+                        "fmul v31.4s, v31.4s, v15.4s\n"
+                        "ldr q3, [a_ptr3]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "ldr q5, [a_ptr1]\n"
+                        "fmla v28.4s, v8.4s, v3.s[0]\n"
+                        "ldr q6, [a_ptr2]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr q7, [a_ptr3]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla v29.4s, v9.4s, v3.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "fmla v30.4s, v10.4s, v3.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "add a_ptr3, a_ptr3, #0x20\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                        "fmla v31.4s, v11.4s, v3.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+                        "fmla v20.4s, v12.4s, v1.s[1]\n"
+                        "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+                        "fmla v24.4s, v12.4s, v2.s[1]\n"
+                        "fmla v28.4s, v12.4s, v3.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "fmla v21.4s, v13.4s, v1.s[1]\n"
+                        "fmla v25.4s, v13.4s, v2.s[1]\n"
+                        "fmla v29.4s, v13.4s, v3.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "fmla v22.4s, v14.4s, v1.s[1]\n"
+                        "fmla v26.4s, v14.4s, v2.s[1]\n"
+                        "fmla v30.4s, v14.4s, v3.s[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "fmla v23.4s, v15.4s, v1.s[1]\n"
+                        "fmla v27.4s, v15.4s, v2.s[1]\n"
+                        "fmla v31.4s, v15.4s, v3.s[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "fmla v24.4s, v8.4s, v2.s[2]\n"
+                        "fmla v28.4s, v8.4s, v3.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "fmla v25.4s, v9.4s, v2.s[2]\n"
+                        "fmla v29.4s, v9.4s, v3.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "fmla v26.4s, v10.4s, v2.s[2]\n"
+                        "fmla v30.4s, v10.4s, v3.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "fmla v27.4s, v11.4s, v2.s[2]\n"
+                        "fmla v31.4s, v11.4s, v3.s[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "fmla v20.4s, v12.4s, v1.s[3]\n"
+                        "fmla v24.4s, v12.4s, v2.s[3]\n"
+                        "fmla v28.4s, v12.4s, v3.s[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "fmla v21.4s, v13.4s, v1.s[3]\n"
+                        "fmla v25.4s, v13.4s, v2.s[3]\n"
+                        "fmla v29.4s, v13.4s, v3.s[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "fmla v22.4s, v14.4s, v1.s[3]\n"
+                        "fmla v26.4s, v14.4s, v2.s[3]\n"
+                        "fmla v30.4s, v14.4s, v3.s[3]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "ldr q0, [%[a_ptr0], #-0x10]\n"
+                        "fmla v23.4s, v15.4s, v1.s[3]\n"
+                        "ldr q1, [a_ptr1, #-0x10]\n"
+                        "fmla v27.4s, v15.4s, v2.s[3]\n"
+                        "ldr q2, [a_ptr2, #-0x10]\n"
+                        "fmla v31.4s, v15.4s, v3.s[3]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "ldr q3, [a_ptr3, #-0x10]\n"
+                        "fmla v20.4s, v8.4s, v5.s[0]\n"
+                        "fmla v24.4s, v8.4s, v6.s[0]\n"
+                        "fmla v28.4s, v8.4s, v7.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "fmla v21.4s, v9.4s, v5.s[0]\n"
+                        "fmla v25.4s, v9.4s, v6.s[0]\n"
+                        "fmla v29.4s, v9.4s, v7.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "fmla v22.4s, v10.4s, v5.s[0]\n"
+                        "fmla v26.4s, v10.4s, v6.s[0]\n"
+                        "fmla v30.4s, v10.4s, v7.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "fmla v23.4s, v11.4s, v5.s[0]\n"
+                        "fmla v27.4s, v11.4s, v6.s[0]\n"
+                        "fmla v31.4s, v11.4s, v7.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v4.s[1]\n"
+                        "fmla v20.4s, v12.4s, v5.s[1]\n"
+                        "fmla v24.4s, v12.4s, v6.s[1]\n"
+                        "fmla v28.4s, v12.4s, v7.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v4.s[1]\n"
+                        "fmla v21.4s, v13.4s, v5.s[1]\n"
+                        "fmla v25.4s, v13.4s, v6.s[1]\n"
+                        "fmla v29.4s, v13.4s, v7.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v4.s[1]\n"
+                        "fmla v22.4s, v14.4s, v5.s[1]\n"
+                        "fmla v26.4s, v14.4s, v6.s[1]\n"
+                        "fmla v30.4s, v14.4s, v7.s[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v4.s[1]\n"
+                        "fmla v23.4s, v15.4s, v5.s[1]\n"
+                        "fmla v27.4s, v15.4s, v6.s[1]\n"
+                        "fmla v31.4s, v15.4s, v7.s[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v20.4s, v8.4s, v5.s[2]\n"
+                        "fmla v24.4s, v8.4s, v6.s[2]\n"
+                        "fmla v28.4s, v8.4s, v7.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "fmla v21.4s, v9.4s, v5.s[2]\n"
+                        "fmla v25.4s, v9.4s, v6.s[2]\n"
+                        "fmla v29.4s, v9.4s, v7.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "fmla v22.4s, v10.4s, v5.s[2]\n"
+                        "fmla v26.4s, v10.4s, v6.s[2]\n"
+                        "fmla v30.4s, v10.4s, v7.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "fmla v23.4s, v11.4s, v5.s[2]\n"
+                        "fmla v27.4s, v11.4s, v6.s[2]\n"
+                        "fmla v31.4s, v11.4s, v7.s[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v16.4s, v12.4s, v4.s[3]\n"
+                        "fmla v20.4s, v12.4s, v5.s[3]\n"
+                        "fmla v24.4s, v12.4s, v6.s[3]\n"
+                        "fmla v28.4s, v12.4s, v7.s[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        "fmla v17.4s, v13.4s, v4.s[3]\n"
+                        "fmla v21.4s, v13.4s, v5.s[3]\n"
+                        "fmla v25.4s, v13.4s, v6.s[3]\n"
+                        "fmla v29.4s, v13.4s, v7.s[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        "fmla v18.4s, v14.4s, v4.s[3]\n"
+                        "fmla v22.4s, v14.4s, v5.s[3]\n"
+                        "fmla v26.4s, v14.4s, v6.s[3]\n"
+                        "fmla v30.4s, v14.4s, v7.s[3]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v15.4s, v4.s[3]\n"
+                        "fmla v23.4s, v15.4s, v5.s[3]\n"
+                        "fmla v27.4s, v15.4s, v6.s[3]\n"
+                        "fmla v31.4s, v15.4s, v7.s[3]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                        "prfm PSTL1KEEP, [c_ptr1]\n"
+                        "prfm PSTL1KEEP, [c_ptr2]\n"
+                        "prfm PSTL1KEEP, [c_ptr3]\n"
+                        "cbz %[regs], 4f\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr q5, [a_ptr1]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "ldr q6, [a_ptr2]\n"
+                        "fmla v28.4s, v8.4s, v3.s[0]\n"
+                        "ldr q7, [a_ptr3]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "fmla v29.4s, v9.4s, v3.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "fmla v30.4s, v10.4s, v3.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "fmla v31.4s, v11.4s, v3.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "fmla v20.4s, v12.4s, v1.s[1]\n"
+                        "fmla v24.4s, v12.4s, v2.s[1]\n"
+                        "fmla v28.4s, v12.4s, v3.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "fmla v21.4s, v13.4s, v1.s[1]\n"
+                        "fmla v25.4s, v13.4s, v2.s[1]\n"
+                        "fmla v29.4s, v13.4s, v3.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "fmla v22.4s, v14.4s, v1.s[1]\n"
+                        "fmla v26.4s, v14.4s, v2.s[1]\n"
+                        "fmla v30.4s, v14.4s, v3.s[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "fmla v23.4s, v15.4s, v1.s[1]\n"
+                        "fmla v27.4s, v15.4s, v2.s[1]\n"
+                        "fmla v31.4s, v15.4s, v3.s[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "fmla v24.4s, v8.4s, v2.s[2]\n"
+                        "fmla v28.4s, v8.4s, v3.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "fmla v25.4s, v9.4s, v2.s[2]\n"
+                        "fmla v29.4s, v9.4s, v3.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "fmla v26.4s, v10.4s, v2.s[2]\n"
+                        "fmla v30.4s, v10.4s, v3.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "fmla v27.4s, v11.4s, v2.s[2]\n"
+                        "fmla v31.4s, v11.4s, v3.s[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "fmla v20.4s, v12.4s, v1.s[3]\n"
+                        "fmla v24.4s, v12.4s, v2.s[3]\n"
+                        "fmla v28.4s, v12.4s, v3.s[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "fmla v21.4s, v13.4s, v1.s[3]\n"
+                        "fmla v25.4s, v13.4s, v2.s[3]\n"
+                        "fmla v29.4s, v13.4s, v3.s[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "fmla v22.4s, v14.4s, v1.s[3]\n"
+                        "fmla v26.4s, v14.4s, v2.s[3]\n"
+                        "fmla v30.4s, v14.4s, v3.s[3]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "fmla v23.4s, v15.4s, v1.s[3]\n"
+                        "fmla v27.4s, v15.4s, v2.s[3]\n"
+                        "fmla v31.4s, v15.4s, v3.s[3]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "fmla v20.4s, v8.4s, v5.s[0]\n"
+                        "fmla v24.4s, v8.4s, v6.s[0]\n"
+                        "fmla v28.4s, v8.4s, v7.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "fmla v21.4s, v9.4s, v5.s[0]\n"
+                        "fmla v25.4s, v9.4s, v6.s[0]\n"
+                        "fmla v29.4s, v9.4s, v7.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "fmla v22.4s, v10.4s, v5.s[0]\n"
+                        "fmla v26.4s, v10.4s, v6.s[0]\n"
+                        "fmla v30.4s, v10.4s, v7.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "fmla v23.4s, v11.4s, v5.s[0]\n"
+                        "fmla v27.4s, v11.4s, v6.s[0]\n"
+                        "fmla v31.4s, v11.4s, v7.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v4.s[1]\n"
+                        "fmla v20.4s, v12.4s, v5.s[1]\n"
+                        "fmla v24.4s, v12.4s, v6.s[1]\n"
+                        "fmla v28.4s, v12.4s, v7.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v4.s[1]\n"
+                        "fmla v21.4s, v13.4s, v5.s[1]\n"
+                        "fmla v25.4s, v13.4s, v6.s[1]\n"
+                        "fmla v29.4s, v13.4s, v7.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v4.s[1]\n"
+                        "fmla v22.4s, v14.4s, v5.s[1]\n"
+                        "fmla v26.4s, v14.4s, v6.s[1]\n"
+                        "fmla v30.4s, v14.4s, v7.s[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v4.s[1]\n"
+                        "fmla v23.4s, v15.4s, v5.s[1]\n"
+                        "fmla v27.4s, v15.4s, v6.s[1]\n"
+                        "fmla v31.4s, v15.4s, v7.s[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "fmla v20.4s, v8.4s, v5.s[2]\n"
+                        "fmla v24.4s, v8.4s, v6.s[2]\n"
+                        "fmla v28.4s, v8.4s, v7.s[2]\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "fmla v21.4s, v9.4s, v5.s[2]\n"
+                        "fmla v25.4s, v9.4s, v6.s[2]\n"
+                        "fmla v29.4s, v9.4s, v7.s[2]\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "fmla v22.4s, v10.4s, v5.s[2]\n"
+                        "fmla v26.4s, v10.4s, v6.s[2]\n"
+                        "fmla v30.4s, v10.4s, v7.s[2]\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "fmla v23.4s, v11.4s, v5.s[2]\n"
+                        "fmla v27.4s, v11.4s, v6.s[2]\n"
+                        "fmla v31.4s, v11.4s, v7.s[2]\n"
+                        "fmla v16.4s, v12.4s, v4.s[3]\n"
+                        "fmla v20.4s, v12.4s, v5.s[3]\n"
+                        "fmla v24.4s, v12.4s, v6.s[3]\n"
+                        "fmla v28.4s, v12.4s, v7.s[3]\n"
+                        "fmla v17.4s, v13.4s, v4.s[3]\n"
+                        "fmla v21.4s, v13.4s, v5.s[3]\n"
+                        "fmla v25.4s, v13.4s, v6.s[3]\n"
+                        "fmla v29.4s, v13.4s, v7.s[3]\n"
+                        "fmla v18.4s, v14.4s, v4.s[3]\n"
+                        "fmla v22.4s, v14.4s, v5.s[3]\n"
+                        "fmla v26.4s, v14.4s, v6.s[3]\n"
+                        "fmla v30.4s, v14.4s, v7.s[3]\n"
+                        "fmla v19.4s, v15.4s, v4.s[3]\n"
+                        "fmla v23.4s, v15.4s, v5.s[3]\n"
+                        "fmla v27.4s, v15.4s, v6.s[3]\n"
+                        "fmla v31.4s, v15.4s, v7.s[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "fmla v28.4s, v8.4s, v3.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "fmla v29.4s, v9.4s, v3.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "fmla v30.4s, v10.4s, v3.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "fmla v31.4s, v11.4s, v3.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "fmla v20.4s, v12.4s, v1.s[1]\n"
+                        "fmla v24.4s, v12.4s, v2.s[1]\n"
+                        "fmla v28.4s, v12.4s, v3.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "fmla v21.4s, v13.4s, v1.s[1]\n"
+                        "fmla v25.4s, v13.4s, v2.s[1]\n"
+                        "fmla v29.4s, v13.4s, v3.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "fmla v22.4s, v14.4s, v1.s[1]\n"
+                        "fmla v26.4s, v14.4s, v2.s[1]\n"
+                        "fmla v30.4s, v14.4s, v3.s[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "fmla v23.4s, v15.4s, v1.s[1]\n"
+                        "fmla v27.4s, v15.4s, v2.s[1]\n"
+                        "fmla v31.4s, v15.4s, v3.s[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "fmla v24.4s, v8.4s, v2.s[2]\n"
+                        "fmla v28.4s, v8.4s, v3.s[2]\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "fmla v25.4s, v9.4s, v2.s[2]\n"
+                        "fmla v29.4s, v9.4s, v3.s[2]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "fmla v26.4s, v10.4s, v2.s[2]\n"
+                        "fmla v30.4s, v10.4s, v3.s[2]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "fmla v27.4s, v11.4s, v2.s[2]\n"
+                        "fmla v31.4s, v11.4s, v3.s[2]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "fmla v20.4s, v12.4s, v1.s[3]\n"
+                        "fmla v24.4s, v12.4s, v2.s[3]\n"
+                        "fmla v28.4s, v12.4s, v3.s[3]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "fmla v21.4s, v13.4s, v1.s[3]\n"
+                        "fmla v25.4s, v13.4s, v2.s[3]\n"
+                        "fmla v29.4s, v13.4s, v3.s[3]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "fmla v22.4s, v14.4s, v1.s[3]\n"
+                        "fmla v26.4s, v14.4s, v2.s[3]\n"
+                        "fmla v30.4s, v14.4s, v3.s[3]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "fmla v23.4s, v15.4s, v1.s[3]\n"
+                        "fmla v27.4s, v15.4s, v2.s[3]\n"
+                        "fmla v31.4s, v15.4s, v3.s[3]\n"
+                        "5:\n"
+                        "cbz %[blocks], 6f\n"
+                        "7:\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "ldr s0, [%[a_ptr0]]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr s1, [a_ptr1]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x4\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr s2, [a_ptr2]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "add a_ptr2, a_ptr2, #0x4\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "ldr s3, [a_ptr3]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "add a_ptr3, a_ptr3, #0x4\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "fmla v28.4s, v8.4s, v3.s[0]\n"
+                        "fmla v29.4s, v9.4s, v3.s[0]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "fmla v30.4s, v10.4s, v3.s[0]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "fmla v31.4s, v11.4s, v3.s[0]\n"
+                        "b.ne 7b\n"
+                        "6:\n"
+                        "str q16, [%[c_ptr0]]\n"
+                        "str q17, [%[c_ptr0], #0x10]\n"
+                        "str q18, [%[c_ptr0], #0x20]\n"
+                        "str q19, [%[c_ptr0], #0x30]\n"
+                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                        "str q20, [c_ptr1]\n"
+                        "str q21, [c_ptr1, #0x10]\n"
+                        "str q22, [c_ptr1, #0x20]\n"
+                        "str q23, [c_ptr1, #0x30]\n"
+                        "str q24, [c_ptr2]\n"
+                        "str q25, [c_ptr2, #0x10]\n"
+                        "str q26, [c_ptr2, #0x20]\n"
+                        "str q27, [c_ptr2, #0x30]\n"
+                        "str q28, [c_ptr3]\n"
+                        "str q29, [c_ptr3, #0x10]\n"
+                        "str q30, [c_ptr3, #0x20]\n"
+                        "str q31, [c_ptr3, #0x30]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq a_ptr3\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        ".unreq c_ptr3\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+                    );
+                    break;
+            }
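+            // Partial-width tile: copy the staged results out of the local buffer back into the real C matrix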
+            if (use_result_buffer) {
+                for(int cy=0; cy<std::min(M-y, 4); cy++) {
+                    for(unsigned int cx=0; cx<width; cx++) {
+                        c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
+                    }
+                }
+            }
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp
index 48bf842..17f6e57 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp
@@ -37,2235 +37,2432 @@
     const long loops_count = ((K + 16) / 32) - 1;
     K -= loops_count * 32;
     const long regs_count = (K / 16) - 1;
+    K -= (regs_count + 1) * 16;
+    const long blocks_count = K / 4;
+    const long odds_count = K - (blocks_count * 4);
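+    // K is consumed as 32-wide main-loop iterations, 16-wide "regs" passes, 4-wide dot-product blocks, then an odd tail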
 
     for (int y=0; y<M; y+=4) {
         const int8_t * const a_ptr0_base = A + (y * lda);
         const unsigned long ldab = lda * sizeof(int8_t);
 
         int32_t *c_ptr0 = C + (y * ldc);
-        const unsigned long ldcb = ldc * sizeof(int32_t);
 
         for (int x0=0; x0<N; x0+=16ul) {
             const long width = std::min((unsigned long)N-x0, 16ul);
             const int32_t *betaptr = &beta;
             long loops = loops_count;
             long regs = regs_count;
+            long blocks = blocks_count;
+            long odds = odds_count;
             const int8_t *a_ptr0 = a_ptr0_base;
             const int8_t *b_ptr0 = B + (K_stride * x0);
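+            // Narrow trailing tiles (width < 16) are staged in a local buffer so the kernel's full-width stores stay in bounds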
+            const bool use_result_buffer = (width < 16);
+            int32_t result_buffer[64];
+            const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(int32_t);
+            int32_t *c_ptr_real = c_ptr0;
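+            // When beta is non-zero the existing C values are staged too, so the kernel can scale them before accumulating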
+            if (use_result_buffer && !beta0) {
+                for(int cy=0; cy<std::min(M-y, 4); cy++) {
+                    for(unsigned int cx=0; cx<width; cx++) {
+                        result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
+                    }
+                }
+            }
+            if (use_result_buffer) {
+                c_ptr0 = result_buffer;
+            }
 
             switch(M-y) {
                 case 1:
                     __asm __volatile (
-                        "temploadreg0 .req X0\n"
-                        "temploadreg1 .req X1\n"
-                        "temploadreg2 .req X2\n"
-                        "temploadreg3 .req X3\n"
-                        "cbz %[beta0], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1r {v15.4s}, [%[betaptr]]\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "mul v16.4s, v16.4s, v15.4s\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mul v17.4s, v17.4s, v15.4s\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "mul v18.4s, v18.4s, v15.4s\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "mul v19.4s, v19.4s, v15.4s\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "ldr d0, [%[a_ptr0], #-0x10]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "cbz %[regs], 4f\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "ldr d0, [%[a_ptr0], #0x10]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "5:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        ".unreq temploadreg0\n"
-                        ".unreq temploadreg1\n"
-                        ".unreq temploadreg2\n"
-                        ".unreq temploadreg3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
+                    "temploadreg0 .req X0\n"
+                    "temploadreg1 .req X1\n"
+                    "temploadreg2 .req X2\n"
+                    "temploadreg3 .req X3\n"
+                    "cbz %[beta0], 1f\n"
+                    "movi v16.4s, #0\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "movi v17.4s, #0\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "movi v18.4s, #0\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "movi v19.4s, #0\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "b 3f\n"
+                    "1:\n"
+                    "ld1r {v15.4s}, [%[betaptr]]\n"
+                    "ldr q16, [%[c_ptr0]]\n"
+                    "ldr q17, [%[c_ptr0], #0x10]\n"
+                    "ldr q18, [%[c_ptr0], #0x20]\n"
+                    "ldr q19, [%[c_ptr0], #0x30]\n"
+                    "mul v16.4s, v16.4s, v15.4s\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "mul v17.4s, v17.4s, v15.4s\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "mul v18.4s, v18.4s, v15.4s\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "mul v19.4s, v19.4s, v15.4s\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "3:\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr d4, [%[a_ptr0]]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ins v4.d[1], temploadreg0\n"
+                    "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    "ldr d0, [%[a_ptr0], #-0x10]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ins v0.d[1], temploadreg0\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "b.ne 3b\n"
+                    "2:\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    "cbz %[regs], 4f\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr d4, [%[a_ptr0]]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ins v4.d[1], temploadreg0\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                    "b 5f\n"
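+                    // Label 4: %[regs] == 0, so only the 16-byte block of A already held in
+                    // v0 remains; finish its four sdot lanes, then fall through to the tails.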
+                    "4:\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
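+                    // Label 5: all unrolled blocks consumed; only the %[blocks] 4-byte
+                    // groups and %[odds] stray bytes of A are left to handle below.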
+                    "5:\n"
+                    "cbz %[blocks], 6f\n"
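+                    // Label 7: per iteration, load 4 bytes of A (s0) and 64 bytes of B
+                    // (q8-q11), then issue one lane-0 sdot per accumulator.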
+                    "7:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "subs %[blocks], %[blocks], #0x1\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr s0, [%[a_ptr0]]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "b.ne 7b\n"
+                    "6:\n"
+                    "cbz %[odds], 8f\n"
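+                    // %[odds] leftover bytes (at most three) are inserted one at a time
+                    // into v0 before a final round of lane-0 sdots at label 9.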
+                    "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+                    "9:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "8:\n"
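+                    // Label 8: write the four accumulator registers back to this row of C.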
+                    "str q16, [%[c_ptr0]]\n"
+                    "str q17, [%[c_ptr0], #0x10]\n"
+                    "str q18, [%[c_ptr0], #0x20]\n"
+                    "str q19, [%[c_ptr0], #0x30]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                    ".unreq temploadreg0\n"
+                    ".unreq temploadreg1\n"
+                    ".unreq temploadreg2\n"
+                    ".unreq temploadreg3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
                     );
                     break;
                 case 2:
                     __asm __volatile (
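+                    // This variant works on two rows of A per pass: a_ptr1/c_ptr1 are
+                    // offset by %[lda]/%[ldc], with row 0 accumulating in v16-v19 and
+                    // row 1 in v20-v23.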
-                        "a_ptr1 .req X0\n"
-                        "c_ptr1 .req X1\n"
-                        "temploadreg0 .req X2\n"
-                        "temploadreg1 .req X3\n"
-                        "temploadreg2 .req X4\n"
-                        "temploadreg3 .req X5\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "cbz %[beta0], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1r {v15.4s}, [%[betaptr]]\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "mul v16.4s, v16.4s, v15.4s\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "mul v17.4s, v17.4s, v15.4s\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "mul v18.4s, v18.4s, v15.4s\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "mul v19.4s, v19.4s, v15.4s\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "mul v20.4s, v20.4s, v15.4s\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mul v21.4s, v21.4s, v15.4s\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mul v22.4s, v22.4s, v15.4s\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "mul v23.4s, v23.4s, v15.4s\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ldr d0, [%[a_ptr0], #-0x10]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
-                        ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ldr d1, [a_ptr1, #-0x10]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr temploadreg1, [a_ptr1, #-0x8]\n"
-                        ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "ins v1.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "cbz %[regs], 4f\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ldr d0, [%[a_ptr0], #0x10]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
-                        ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ldr d1, [a_ptr1, #0x10]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x18]\n"
-                        ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "ins v1.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "5:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq temploadreg0\n"
-                        ".unreq temploadreg1\n"
-                        ".unreq temploadreg2\n"
-                        ".unreq temploadreg3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+                    "a_ptr1 .req X0\n"
+                    "c_ptr1 .req X1\n"
+                    "temploadreg0 .req X2\n"
+                    "temploadreg1 .req X3\n"
+                    "temploadreg2 .req X4\n"
+                    "temploadreg3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "cbz %[beta0], 1f\n"
+                    "movi v16.4s, #0\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "movi v17.4s, #0\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "movi v18.4s, #0\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "movi v19.4s, #0\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "movi v20.4s, #0\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "movi v21.4s, #0\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "movi v22.4s, #0\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "movi v23.4s, #0\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "b 3f\n"
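+                    // Label 1: taken when %[beta0] is zero - reload the existing C tile
+                    // and pre-scale it by the beta value broadcast into v15.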
+                    "1:\n"
+                    "ld1r {v15.4s}, [%[betaptr]]\n"
+                    "ldr q16, [%[c_ptr0]]\n"
+                    "ldr q17, [%[c_ptr0], #0x10]\n"
+                    "ldr q18, [%[c_ptr0], #0x20]\n"
+                    "ldr q19, [%[c_ptr0], #0x30]\n"
+                    "mul v16.4s, v16.4s, v15.4s\n"
+                    "ldr q20, [c_ptr1]\n"
+                    "mul v17.4s, v17.4s, v15.4s\n"
+                    "ldr q21, [c_ptr1, #0x10]\n"
+                    "mul v18.4s, v18.4s, v15.4s\n"
+                    "ldr q22, [c_ptr1, #0x20]\n"
+                    "mul v19.4s, v19.4s, v15.4s\n"
+                    "ldr q23, [c_ptr1, #0x30]\n"
+                    "mul v20.4s, v20.4s, v15.4s\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "mul v21.4s, v21.4s, v15.4s\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "mul v22.4s, v22.4s, v15.4s\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "mul v23.4s, v23.4s, v15.4s\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
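+                    // Label 3: main loop - each iteration consumes two 16-byte blocks from
+                    // each A row (v0/v1, then v4/v5) against the next 512 bytes of B.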
+                    "3:\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr d4, [%[a_ptr0]]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ldr d5, [a_ptr1]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ins v4.d[1], temploadreg0\n"
+                    ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ins v5.d[1], temploadreg1\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    "add a_ptr1, a_ptr1, #0x20\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+                    "ldr d0, [%[a_ptr0], #-0x10]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+                    ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+                    "ldr d1, [a_ptr1, #-0x10]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+                    ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    "ins v0.d[1], temploadreg0\n"
+                    ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    "ins v1.d[1], temploadreg1\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "b.ne 3b\n"
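+                    // Label 2: loop exit - prefetch both output rows for the stores and
+                    // fall into the %[regs] tail.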
+                    "2:\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    "prfm PSTL1KEEP, [c_ptr1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    "cbz %[regs], 4f\n"
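+                    // %[regs] != 0: the pair of A blocks already resident in v0/v1 is
+                    // processed together with one freshly loaded v4/v5 pair.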
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr d4, [%[a_ptr0]]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr d5, [a_ptr1]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ins v4.d[1], temploadreg0\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ins v5.d[1], temploadreg1\n"
+                    ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+                    ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+                    ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+                    ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+                    "b 5f\n"
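+                    // regs == 0: only the preloaded 16-byte K block per row is left; no further A loads are needed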
+                    "4:\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                    "5:\n"
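+                    // Tail loop: consume any remaining whole 4-byte K groups one at a time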
+                    "cbz %[blocks], 6f\n"
+                    "7:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "subs %[blocks], %[blocks], #0x1\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr s0, [%[a_ptr0]]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr s1, [a_ptr1]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x4\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    "b.ne 7b\n"
+                    "6:\n"
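+                    // 1-3 odd trailing K bytes: insert them byte by byte, then run one final sdot pass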
+                    "cbz %[odds], 8f\n"
+                    "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[0], [a_ptr1], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[1], [a_ptr1], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+                    "ld1 {v1.b}[2], [a_ptr1]\n"
+                    "9:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    "8:\n"
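+                    // Write the accumulated 32-bit results back to both rows of C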
+                    "str q16, [%[c_ptr0]]\n"
+                    "str q17, [%[c_ptr0], #0x10]\n"
+                    "str q18, [%[c_ptr0], #0x20]\n"
+                    "str q19, [%[c_ptr0], #0x30]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                    "str q20, [c_ptr1]\n"
+                    "str q21, [c_ptr1, #0x10]\n"
+                    "str q22, [c_ptr1, #0x20]\n"
+                    "str q23, [c_ptr1, #0x30]\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq temploadreg0\n"
+                    ".unreq temploadreg1\n"
+                    ".unreq temploadreg2\n"
+                    ".unreq temploadreg3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
                     );
                     break;
                 case 3:
                     __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "c_ptr1 .req X2\n"
-                        "c_ptr2 .req X3\n"
-                        "temploadreg0 .req X4\n"
-                        "temploadreg1 .req X5\n"
-                        "temploadreg2 .req X6\n"
-                        "temploadreg3 .req X7\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "cbz %[beta0], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v24.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "movi v25.4s, #0\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "movi v26.4s, #0\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "movi v27.4s, #0\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1r {v15.4s}, [%[betaptr]]\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "mul v16.4s, v16.4s, v15.4s\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "mul v17.4s, v17.4s, v15.4s\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "mul v18.4s, v18.4s, v15.4s\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "mul v19.4s, v19.4s, v15.4s\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "mul v20.4s, v20.4s, v15.4s\n"
-                        "ldr q24, [c_ptr2]\n"
-                        "mul v21.4s, v21.4s, v15.4s\n"
-                        "ldr q25, [c_ptr2, #0x10]\n"
-                        "mul v22.4s, v22.4s, v15.4s\n"
-                        "ldr q26, [c_ptr2, #0x20]\n"
-                        "mul v23.4s, v23.4s, v15.4s\n"
-                        "ldr q27, [c_ptr2, #0x30]\n"
-                        "mul v24.4s, v24.4s, v15.4s\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mul v25.4s, v25.4s, v15.4s\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mul v26.4s, v26.4s, v15.4s\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "mul v27.4s, v27.4s, v15.4s\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d6, [a_ptr2]\n"
-                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
-                        ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ins v6.d[1], temploadreg2\n"
-                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ldr d0, [%[a_ptr0], #-0x10]\n"
-                        ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
-                        ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ldr d1, [a_ptr1, #-0x10]\n"
-                        ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr temploadreg1, [a_ptr1, #-0x8]\n"
-                        ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ins v1.d[1], temploadreg1\n"
-                        ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        "ldr d2, [a_ptr2, #-0x10]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg2, [a_ptr2, #-0x8]\n"
-                        ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ins v2.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "cbz %[regs], 4f\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr d6, [a_ptr2]\n"
-                        ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ins v6.d[1], temploadreg2\n"
-                        ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr d0, [%[a_ptr0], #0x10]\n"
-                        ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
-                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr d1, [a_ptr1, #0x10]\n"
-                        ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x18]\n"
-                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr d2, [a_ptr2, #0x10]\n"
-                        ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x18]\n"
-                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ins v1.d[1], temploadreg1\n"
-                        ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ins v2.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
-                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
-                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
-                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
-                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr d6, [a_ptr2]\n"
-                        ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ins v6.d[1], temploadreg2\n"
-                        ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
-                        "5:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        "str q24, [c_ptr2]\n"
-                        "str q25, [c_ptr2, #0x10]\n"
-                        "str q26, [c_ptr2, #0x20]\n"
-                        "str q27, [c_ptr2, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq temploadreg0\n"
-                        ".unreq temploadreg1\n"
-                        ".unreq temploadreg2\n"
-                        ".unreq temploadreg3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "c_ptr1 .req X2\n"
+                    "c_ptr2 .req X3\n"
+                    "temploadreg0 .req X4\n"
+                    "temploadreg1 .req X5\n"
+                    "temploadreg2 .req X6\n"
+                    "temploadreg3 .req X7\n"
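+                    // Rows 1-2 of A and C are addressed through aliased scratch registers; the temploadregs
+                    // split each 128-bit load into a 64-bit vector load plus a 64-bit GP load whose value is
+                    // ins'd into the high half, a pattern that suits in-order cores such as the Cortex-A55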
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "cbz %[beta0], 1f\n"
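+                    // beta0 flags beta == 0: zero the accumulators directly and preload the first A/B blocks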
+                    "movi v16.4s, #0\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "movi v17.4s, #0\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "movi v18.4s, #0\n"
+                    "ldr q2, [a_ptr2]\n"
+                    "movi v19.4s, #0\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "movi v20.4s, #0\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "movi v21.4s, #0\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "movi v22.4s, #0\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "movi v23.4s, #0\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "movi v24.4s, #0\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "movi v25.4s, #0\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "movi v26.4s, #0\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "movi v27.4s, #0\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "b 3f\n"
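+                    // beta != 0: load the existing C tile and pre-scale it by beta before accumulating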
+                    "1:\n"
+                    "ld1r {v15.4s}, [%[betaptr]]\n"
+                    "ldr q16, [%[c_ptr0]]\n"
+                    "ldr q17, [%[c_ptr0], #0x10]\n"
+                    "ldr q18, [%[c_ptr0], #0x20]\n"
+                    "ldr q19, [%[c_ptr0], #0x30]\n"
+                    "mul v16.4s, v16.4s, v15.4s\n"
+                    "ldr q20, [c_ptr1]\n"
+                    "mul v17.4s, v17.4s, v15.4s\n"
+                    "ldr q21, [c_ptr1, #0x10]\n"
+                    "mul v18.4s, v18.4s, v15.4s\n"
+                    "ldr q22, [c_ptr1, #0x20]\n"
+                    "mul v19.4s, v19.4s, v15.4s\n"
+                    "ldr q23, [c_ptr1, #0x30]\n"
+                    "mul v20.4s, v20.4s, v15.4s\n"
+                    "ldr q24, [c_ptr2]\n"
+                    "mul v21.4s, v21.4s, v15.4s\n"
+                    "ldr q25, [c_ptr2, #0x10]\n"
+                    "mul v22.4s, v22.4s, v15.4s\n"
+                    "ldr q26, [c_ptr2, #0x20]\n"
+                    "mul v23.4s, v23.4s, v15.4s\n"
+                    "ldr q27, [c_ptr2, #0x30]\n"
+                    "mul v24.4s, v24.4s, v15.4s\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "mul v25.4s, v25.4s, v15.4s\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "mul v26.4s, v26.4s, v15.4s\n"
+                    "ldr q2, [a_ptr2]\n"
+                    "mul v27.4s, v27.4s, v15.4s\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "cbz %[loops], 2f\n"
+                    "3:\n"
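+                    // main loop: two 16-byte blocks of A per row and 512 bytes of B per iteration;
+                    // 128-bit B loads are split into ldr d / ldr x pairs and recombined with ins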
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr d4, [%[a_ptr0]]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr d5, [a_ptr1]\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr d6, [a_ptr2]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ldr temploadreg2, [a_ptr2, #0x8]\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ins v4.d[1], temploadreg0\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ins v5.d[1], temploadreg1\n"
+                    ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ins v6.d[1], temploadreg2\n"
+                    ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+                    "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                    ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    "add a_ptr1, a_ptr1, #0x20\n"
+                    ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+                    "ldr d0, [%[a_ptr0], #-0x10]\n"
+                    ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+                    ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+                    "ldr d1, [a_ptr1, #-0x10]\n"
+                    ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+                    ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ins v0.d[1], temploadreg0\n"
+                    ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                    "ins v1.d[1], temploadreg1\n"
+                    ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    "add a_ptr2, a_ptr2, #0x20\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                    "ldr d2, [a_ptr2, #-0x10]\n"
+                    "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+                    ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                    "ldr temploadreg2, [a_ptr2, #-0x8]\n"
+                    ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    "ins v2.d[1], temploadreg2\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "b.ne 3b\n"
+                    "2:\n"
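+                    // main loop done: prefetch the three output rows ahead of the stores below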
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    "prfm PSTL1KEEP, [c_ptr1]\n"
+                    "prfm PSTL1KEEP, [c_ptr2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    "cbz %[regs], 4f\n"
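+                    // %[regs] set: consume the resident A block plus one further 16-byte block per row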
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr d4, [%[a_ptr0]]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr d5, [a_ptr1]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr d6, [a_ptr2]\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    "ldr temploadreg2, [a_ptr2, #0x8]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ins v4.d[1], temploadreg0\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ins v5.d[1], temploadreg1\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ins v6.d[1], temploadreg2\n"
+                    ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+                    ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+                    ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+                    ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+                    "b 5f\n"
+                    "4:\n"
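+                    // only the resident A block (v0-v2) remains to be processed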
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                    ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                    ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                    ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+                    "5:\n"
+                    "cbz %[blocks], 6f\n"
+                    "7:\n"
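+                    // 4-byte tail loop: 64 bytes of B against lane 0 of each row's 4-byte load per iteration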
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "subs %[blocks], %[blocks], #0x1\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr s0, [%[a_ptr0]]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr s1, [a_ptr1]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x4\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr s2, [a_ptr2]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "add a_ptr2, a_ptr2, #0x4\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    "b.ne 7b\n"
+                    "6:\n"
+                    "cbz %[odds], 8f\n"
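+                    // odd bytes: insert the final 1-3 bytes of each row into byte lanes of v0-v2,
+                    // then run one last round of sdots at 9: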
+                    "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[0], [a_ptr1], #1\n"
+                    "ld1 {v2.b}[0], [a_ptr2], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[1], [a_ptr1], #1\n"
+                    "ld1 {v2.b}[1], [a_ptr2], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+                    "ld1 {v1.b}[2], [a_ptr1]\n"
+                    "ld1 {v2.b}[2], [a_ptr2]\n"
+                    "9:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    "8:\n"
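+                    // write out the 3x16 block of int32 accumulators, one row per c_ptr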
+                    "str q16, [%[c_ptr0]]\n"
+                    "str q17, [%[c_ptr0], #0x10]\n"
+                    "str q18, [%[c_ptr0], #0x20]\n"
+                    "str q19, [%[c_ptr0], #0x30]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                    "str q20, [c_ptr1]\n"
+                    "str q21, [c_ptr1, #0x10]\n"
+                    "str q22, [c_ptr1, #0x20]\n"
+                    "str q23, [c_ptr1, #0x30]\n"
+                    "str q24, [c_ptr2]\n"
+                    "str q25, [c_ptr2, #0x10]\n"
+                    "str q26, [c_ptr2, #0x20]\n"
+                    "str q27, [c_ptr2, #0x30]\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq temploadreg0\n"
+                    ".unreq temploadreg1\n"
+                    ".unreq temploadreg2\n"
+                    ".unreq temploadreg3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
                     );
                     break;
                 default:
                 case 4:
                     __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "c_ptr1 .req X3\n"
-                        "c_ptr2 .req X4\n"
-                        "c_ptr3 .req X5\n"
-                        "temploadreg0 .req X6\n"
-                        "temploadreg1 .req X7\n"
-                        "temploadreg2 .req X8\n"
-                        "temploadreg3 .req X9\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "cbz %[beta0], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q3, [a_ptr3]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v24.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v25.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "movi v26.4s, #0\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "movi v27.4s, #0\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "movi v28.4s, #0\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "movi v29.4s, #0\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "movi v30.4s, #0\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "movi v31.4s, #0\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1r {v15.4s}, [%[betaptr]]\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "mul v16.4s, v16.4s, v15.4s\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "mul v17.4s, v17.4s, v15.4s\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "mul v18.4s, v18.4s, v15.4s\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "mul v19.4s, v19.4s, v15.4s\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "mul v20.4s, v20.4s, v15.4s\n"
-                        "ldr q24, [c_ptr2]\n"
-                        "mul v21.4s, v21.4s, v15.4s\n"
-                        "ldr q25, [c_ptr2, #0x10]\n"
-                        "mul v22.4s, v22.4s, v15.4s\n"
-                        "ldr q26, [c_ptr2, #0x20]\n"
-                        "mul v23.4s, v23.4s, v15.4s\n"
-                        "ldr q27, [c_ptr2, #0x30]\n"
-                        "mul v24.4s, v24.4s, v15.4s\n"
-                        "ldr q28, [c_ptr3]\n"
-                        "mul v25.4s, v25.4s, v15.4s\n"
-                        "ldr q29, [c_ptr3, #0x10]\n"
-                        "mul v26.4s, v26.4s, v15.4s\n"
-                        "ldr q30, [c_ptr3, #0x20]\n"
-                        "mul v27.4s, v27.4s, v15.4s\n"
-                        "ldr q31, [c_ptr3, #0x30]\n"
-                        "mul v28.4s, v28.4s, v15.4s\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mul v29.4s, v29.4s, v15.4s\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mul v30.4s, v30.4s, v15.4s\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "mul v31.4s, v31.4s, v15.4s\n"
-                        "ldr q3, [a_ptr3]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr d6, [a_ptr2]\n"
-                        ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d7, [a_ptr3]\n"
-                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr temploadreg3, [a_ptr3, #0x8]\n"
-                        ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ins v6.d[1], temploadreg2\n"
-                        ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ins v7.d[1], temploadreg3\n"
-                        ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
-                        "ldr d0, [%[a_ptr0], #-0x10]\n"
-                        ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
-                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
-                        "ldr d1, [a_ptr1, #-0x10]\n"
-                        ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #-0x8]\n"
-                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
-                        "ins v1.d[1], temploadreg1\n"
-                        ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
-                        "ldr d2, [a_ptr2, #-0x10]\n"
-                        ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
-                        "ldr temploadreg2, [a_ptr2, #-0x8]\n"
-                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
-                        "ins v2.d[1], temploadreg2\n"
-                        ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
-                        "ldr d3, [a_ptr3, #-0x10]\n"
-                        ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
-                        "ldr temploadreg3, [a_ptr3, #-0x8]\n"
-                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
-                        "ins v3.d[1], temploadreg3\n"
-                        ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "prfm PSTL1KEEP, [c_ptr3]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "cbz %[regs], 4f\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr d6, [a_ptr2]\n"
-                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
-                        ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr d7, [a_ptr3]\n"
-                        ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
-                        "ldr temploadreg3, [a_ptr3, #0x8]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
-                        "ins v6.d[1], temploadreg2\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ins v7.d[1], temploadreg3\n"
-                        ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
-                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr d0, [%[a_ptr0], #0x10]\n"
-                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr d1, [a_ptr1, #0x10]\n"
-                        ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x18]\n"
-                        ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr d2, [a_ptr2, #0x10]\n"
-                        ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x18]\n"
-                        ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
-                        "ldr d3, [a_ptr3, #0x10]\n"
-                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr temploadreg3, [a_ptr3, #0x18]\n"
-                        ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
-                        "ins v1.d[1], temploadreg1\n"
-                        ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ins v2.d[1], temploadreg2\n"
-                        ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ins v3.d[1], temploadreg3\n"
-                        ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
-                        ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
-                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
-                        ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
-                        ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
-                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
-                        ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
-                        ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
-                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
-                        ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
-                        ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
-                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
-                        ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
-                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
-                        ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
-                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
-                        ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
-                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
-                        ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr d6, [a_ptr2]\n"
-                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
-                        ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr d7, [a_ptr3]\n"
-                        ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
-                        "ldr temploadreg3, [a_ptr3, #0x8]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
-                        "ins v6.d[1], temploadreg2\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ins v7.d[1], temploadreg3\n"
-                        ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
-                        ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
-                        ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
-                        ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
-                        ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
-                        ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
-                        ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
-                        ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
-                        "5:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        "str q24, [c_ptr2]\n"
-                        "str q25, [c_ptr2, #0x10]\n"
-                        "str q26, [c_ptr2, #0x20]\n"
-                        "str q27, [c_ptr2, #0x30]\n"
-                        "str q28, [c_ptr3]\n"
-                        "str q29, [c_ptr3, #0x10]\n"
-                        "str q30, [c_ptr3, #0x20]\n"
-                        "str q31, [c_ptr3, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        ".unreq temploadreg0\n"
-                        ".unreq temploadreg1\n"
-                        ".unreq temploadreg2\n"
-                        ".unreq temploadreg3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "temploadreg0 .req X6\n"
+                    "temploadreg1 .req X7\n"
+                    "temploadreg2 .req X8\n"
+                    "temploadreg3 .req X9\n"
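+                    // temploadreg0-3 are scratch GPRs for split vector loads: the low
+                    // 64 bits of each operand come in via "ldr d", the high 64 bits
+                    // via "ldr x" into a GPR, then "ins vN.d[1], xM" merges the halves.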
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
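+                    // Dispatch on beta: %[beta0] is zero when beta != 0, so branch to
+                    // 1f to rescale the existing C tile; otherwise fall through and
+                    // zero-initialise accumulators v16-v31 while preloading A and B.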
+                    "cbz %[beta0], 1f\n"
+                    "movi v16.4s, #0\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "movi v17.4s, #0\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "movi v18.4s, #0\n"
+                    "ldr q2, [a_ptr2]\n"
+                    "movi v19.4s, #0\n"
+                    "ldr q3, [a_ptr3]\n"
+                    "movi v20.4s, #0\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "movi v21.4s, #0\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "movi v22.4s, #0\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "movi v23.4s, #0\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "movi v24.4s, #0\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "movi v25.4s, #0\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "movi v26.4s, #0\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "movi v27.4s, #0\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "movi v28.4s, #0\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "movi v29.4s, #0\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "movi v30.4s, #0\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "movi v31.4s, #0\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    "add a_ptr3, a_ptr3, #0x10\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "b 3f\n"
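+                    // 1: beta != 0 path - broadcast beta from betaptr, load the 4x16
+                    // s32 C tile and pre-scale it before entering the K loop.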
+                    "1:\n"
+                    "ld1r {v15.4s}, [%[betaptr]]\n"
+                    "ldr q16, [%[c_ptr0]]\n"
+                    "ldr q17, [%[c_ptr0], #0x10]\n"
+                    "ldr q18, [%[c_ptr0], #0x20]\n"
+                    "ldr q19, [%[c_ptr0], #0x30]\n"
+                    "mul v16.4s, v16.4s, v15.4s\n"
+                    "ldr q20, [c_ptr1]\n"
+                    "mul v17.4s, v17.4s, v15.4s\n"
+                    "ldr q21, [c_ptr1, #0x10]\n"
+                    "mul v18.4s, v18.4s, v15.4s\n"
+                    "ldr q22, [c_ptr1, #0x20]\n"
+                    "mul v19.4s, v19.4s, v15.4s\n"
+                    "ldr q23, [c_ptr1, #0x30]\n"
+                    "mul v20.4s, v20.4s, v15.4s\n"
+                    "ldr q24, [c_ptr2]\n"
+                    "mul v21.4s, v21.4s, v15.4s\n"
+                    "ldr q25, [c_ptr2, #0x10]\n"
+                    "mul v22.4s, v22.4s, v15.4s\n"
+                    "ldr q26, [c_ptr2, #0x20]\n"
+                    "mul v23.4s, v23.4s, v15.4s\n"
+                    "ldr q27, [c_ptr2, #0x30]\n"
+                    "mul v24.4s, v24.4s, v15.4s\n"
+                    "ldr q28, [c_ptr3]\n"
+                    "mul v25.4s, v25.4s, v15.4s\n"
+                    "ldr q29, [c_ptr3, #0x10]\n"
+                    "mul v26.4s, v26.4s, v15.4s\n"
+                    "ldr q30, [c_ptr3, #0x20]\n"
+                    "mul v27.4s, v27.4s, v15.4s\n"
+                    "ldr q31, [c_ptr3, #0x30]\n"
+                    "mul v28.4s, v28.4s, v15.4s\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "mul v29.4s, v29.4s, v15.4s\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "mul v30.4s, v30.4s, v15.4s\n"
+                    "ldr q2, [a_ptr2]\n"
+                    "mul v31.4s, v31.4s, v15.4s\n"
+                    "ldr q3, [a_ptr3]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add a_ptr3, a_ptr3, #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "cbz %[loops], 2f\n"
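+                    // 3: main unrolled K loop - each pass consumes 32 bytes of every
+                    // A row and 0x200 bytes of B, overlapping the split loads with
+                    // the sdot accumulation.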
+                    "3:\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr d4, [%[a_ptr0]]\n"
+                    ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr d5, [a_ptr1]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    "ldr d6, [a_ptr2]\n"
+                    ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+                    "ldr temploadreg2, [a_ptr2, #0x8]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr d7, [a_ptr3]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    "ldr temploadreg3, [a_ptr3, #0x8]\n"
+                    ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ins v4.d[1], temploadreg0\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+                    "ins v5.d[1], temploadreg1\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+                    "ins v6.d[1], temploadreg2\n"
+                    ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ins v7.d[1], temploadreg3\n"
+                    ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                    ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                    ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+                    "ldr d0, [%[a_ptr0], #-0x10]\n"
+                    ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+                    ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+                    "ins v0.d[1], temploadreg0\n"
+                    ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x20\n"
+                    ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+                    "ldr d1, [a_ptr1, #-0x10]\n"
+                    ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
+                    "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+                    ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+                    "ins v1.d[1], temploadreg1\n"
+                    ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+                    "add a_ptr2, a_ptr2, #0x20\n"
+                    ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+                    "ldr d2, [a_ptr2, #-0x10]\n"
+                    ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
+                    "ldr temploadreg2, [a_ptr2, #-0x8]\n"
+                    ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+                    "ins v2.d[1], temploadreg2\n"
+                    ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+                    "add a_ptr3, a_ptr3, #0x20\n"
+                    ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+                    "ldr d3, [a_ptr3, #-0x10]\n"
+                    ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
+                    "ldr temploadreg3, [a_ptr3, #-0x8]\n"
+                    ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+                    "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                    ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+                    "ins v3.d[1], temploadreg3\n"
+                    ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                    "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+                    ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "b.ne 3b\n"
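+                    // 2: loop exit - prefetch the four C rows ahead of the stores.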
+                    "2:\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    "prfm PSTL1KEEP, [c_ptr1]\n"
+                    "prfm PSTL1KEEP, [c_ptr2]\n"
+                    "prfm PSTL1KEEP, [c_ptr3]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    "cbz %[regs], 4f\n"
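+                    // Tail: a non-zero %[regs] means two K blocks remain (fall
+                    // through), zero means a single block (branch to 4f).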
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr d4, [%[a_ptr0]]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr d5, [a_ptr1]\n"
+                    ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+                    "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr d6, [a_ptr2]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr temploadreg2, [a_ptr2, #0x8]\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    "ldr d7, [a_ptr3]\n"
+                    ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+                    "ldr temploadreg3, [a_ptr3, #0x8]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ins v4.d[1], temploadreg0\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ins v5.d[1], temploadreg1\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+                    "ins v6.d[1], temploadreg2\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+                    "ins v7.d[1], temploadreg3\n"
+                    ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+                    "add a_ptr3, a_ptr3, #0x10\n"
+                    ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+                    ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+                    ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+                    ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
+                    ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                    ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+                    ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+                    ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
+                    ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                    ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+                    ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+                    ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
+                    ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                    ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+                    ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+                    ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
+                    ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+                    ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
+                    ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+                    ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
+                    ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+                    ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
+                    ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+                    ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
+                    "b 5f\n"
+                    "4:\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+                    ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                    ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+                    ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                    ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+                    ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                    ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+                    ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+                    ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+                    ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+                    ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+                    ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+                    "5:\n"
+                    "cbz %[blocks], 6f\n"
+                    "7:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "subs %[blocks], %[blocks], #0x1\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr s0, [%[a_ptr0]]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr s1, [a_ptr1]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x4\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr s2, [a_ptr2]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "add a_ptr2, a_ptr2, #0x4\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr s3, [a_ptr3]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    "add a_ptr3, a_ptr3, #0x4\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+                    ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+                    "b.ne 7b\n"
+                    "6:\n"
+                    "cbz %[odds], 8f\n"
+                    "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[0], [a_ptr1], #1\n"
+                    "ld1 {v2.b}[0], [a_ptr2], #1\n"
+                    "ld1 {v3.b}[0], [a_ptr3], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[1], [a_ptr1], #1\n"
+                    "ld1 {v2.b}[1], [a_ptr2], #1\n"
+                    "ld1 {v3.b}[1], [a_ptr3], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+                    "ld1 {v1.b}[2], [a_ptr1]\n"
+                    "ld1 {v2.b}[2], [a_ptr2]\n"
+                    "ld1 {v3.b}[2], [a_ptr3]\n"
+                    "9:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+                    "8:\n"
+                    "str q16, [%[c_ptr0]]\n"
+                    "str q17, [%[c_ptr0], #0x10]\n"
+                    "str q18, [%[c_ptr0], #0x20]\n"
+                    "str q19, [%[c_ptr0], #0x30]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                    "str q20, [c_ptr1]\n"
+                    "str q21, [c_ptr1, #0x10]\n"
+                    "str q22, [c_ptr1, #0x20]\n"
+                    "str q23, [c_ptr1, #0x30]\n"
+                    "str q24, [c_ptr2]\n"
+                    "str q25, [c_ptr2, #0x10]\n"
+                    "str q26, [c_ptr2, #0x20]\n"
+                    "str q27, [c_ptr2, #0x30]\n"
+                    "str q28, [c_ptr3]\n"
+                    "str q29, [c_ptr3, #0x10]\n"
+                    "str q30, [c_ptr3, #0x20]\n"
+                    "str q31, [c_ptr3, #0x30]\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    ".unreq temploadreg0\n"
+                    ".unreq temploadreg1\n"
+                    ".unreq temploadreg2\n"
+                    ".unreq temploadreg3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
                     );
                     break;
             }
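+            // Copy the staged 16-wide rows back into the real, narrower C tile.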
+            if (use_result_buffer) {
+                for(int cy=0; cy<std::min(M-y, 4); cy++) {
+                    for(unsigned int cx=0; cx<width; cx++) {
+                        c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
+                    }
+                }
+            }
         }
     }
 }
 
 } // namespace arm_gemm
 
-#endif // __aarch64__
+#endif // __aarch64__
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp
index 0179139..fdd45a0 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp
@@ -37,1569 +37,1806 @@
     const long loops_count = ((K + 16) / 32) - 1;
     K -= loops_count * 32;
     const long regs_count = (K / 16) - 1;
+    K -= (regs_count + 1) * 16;
+    const long blocks_count = K / 4;
+    const long odds_count = K - (blocks_count * 4);
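+    // The assembly consumes K in four stages: each main-loop pass eats 32 int8
+    // values per row, the "regs" epilogue eats 16, each "blocks" pass eats 4,
+    // and "odds" covers the final 1-3 bytes. A worked example (illustrative,
+    // not from the source): K = 53 gives loops = 1 (32 bytes), regs = 0 (the
+    // one mandatory 16-byte slice), blocks = 1 (4 bytes), odds = 1, and
+    // 32 + 16 + 4 + 1 = 53.
+    // The ".word 0x4f80e110 // sdot ..." lines below are pre-encoded SDOT
+    // instructions (sdot Vd.4s, Vn.16b, Vm.4b[i]); each 32-bit lane of Vd
+    // accumulates a signed dot product over four int8 pairs. Emitting them as
+    // raw words presumably lets the file assemble on toolchains that lack the
+    // dot-product extension.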
 
     for (int y=0; y<M; y+=4) {
         const int8_t * const a_ptr0_base = A + (y * lda);
         const unsigned long ldab = lda * sizeof(int8_t);
 
         int32_t *c_ptr0 = C + (y * ldc);
-        const unsigned long ldcb = ldc * sizeof(int32_t);
 
         for (int x0=0; x0<N; x0+=16ul) {
             const long width = std::min((unsigned long)N-x0, 16ul);
             const int32_t *betaptr = &beta;
             long loops = loops_count;
             long regs = regs_count;
+            long blocks = blocks_count;
+            long odds = odds_count;
             const int8_t *a_ptr0 = a_ptr0_base;
             const int8_t *b_ptr0 = B + (K_stride * x0);
+            const bool use_result_buffer = (width < 16);
+            int32_t result_buffer[64];
+            const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(int32_t);
+            int32_t *c_ptr_real = c_ptr0;
+            if (use_result_buffer && !beta0) {
+                for(int cy=0; cy<std::min(M-y, 4); cy++) {
+                    for(unsigned int cx=0; cx<width; cx++) {
+                        result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
+                    }
+                }
+            }
+            if (use_result_buffer) {
+                c_ptr0 = result_buffer;
+            }
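+            // Partial-width tiles: the stores in the assembly always write full
+            // 16-column rows, so for width < 16 they are redirected into
+            // result_buffer (up to 4 rows x 16 columns; ldcb is then the fixed
+            // 16 * sizeof(int32_t) staging stride). When beta != 0 the live C
+            // values are staged in first so the beta-scaling path reads valid
+            // data.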
 
             switch(M-y) {
                 case 1:
                     __asm __volatile (
-                        "cbz %[beta0], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1r {v15.4s}, [%[betaptr]]\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "mul v16.4s, v16.4s, v15.4s\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mul v17.4s, v17.4s, v15.4s\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "mul v18.4s, v18.4s, v15.4s\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "mul v19.4s, v19.4s, v15.4s\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "cbz %[regs], 4f\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr q0, [%[a_ptr0], #0x10]\n"
-                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "5:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
+                    "cbz %[beta0], 1f\n"
+                    "movi v16.4s, #0\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "movi v17.4s, #0\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "movi v18.4s, #0\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "movi v19.4s, #0\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "b 3f\n"
+                    "1:\n"
+                    "ld1r {v15.4s}, [%[betaptr]]\n"
+                    "ldr q16, [%[c_ptr0]]\n"
+                    "ldr q17, [%[c_ptr0], #0x10]\n"
+                    "ldr q18, [%[c_ptr0], #0x20]\n"
+                    "ldr q19, [%[c_ptr0], #0x30]\n"
+                    "mul v16.4s, v16.4s, v15.4s\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "mul v17.4s, v17.4s, v15.4s\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "mul v18.4s, v18.4s, v15.4s\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "mul v19.4s, v19.4s, v15.4s\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "3:\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr q4, [%[a_ptr0]]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                    ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ldr q0, [%[a_ptr0], #-0x10]\n"
+                    ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    "b.ne 3b\n"
+                    "2:\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                    "cbz %[regs], 4f\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr q4, [%[a_ptr0]]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                    ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                    "b 5f\n"
+                    "4:\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    "5:\n"
+                    "cbz %[blocks], 6f\n"
+                    "7:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "subs %[blocks], %[blocks], #0x1\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr s0, [%[a_ptr0]]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "b.ne 7b\n"
+                    "6:\n"
+                    "cbz %[odds], 8f\n"
+                    "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+                    "9:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "8:\n"
+                    "str q16, [%[c_ptr0]]\n"
+                    "str q17, [%[c_ptr0], #0x10]\n"
+                    "str q18, [%[c_ptr0], #0x20]\n"
+                    "str q19, [%[c_ptr0], #0x30]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
                     );
                     break;
                 case 2:
                     __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "c_ptr1 .req X1\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "cbz %[beta0], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1r {v15.4s}, [%[betaptr]]\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "mul v16.4s, v16.4s, v15.4s\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "mul v17.4s, v17.4s, v15.4s\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "mul v18.4s, v18.4s, v15.4s\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "mul v19.4s, v19.4s, v15.4s\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "mul v20.4s, v20.4s, v15.4s\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mul v21.4s, v21.4s, v15.4s\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mul v22.4s, v22.4s, v15.4s\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "mul v23.4s, v23.4s, v15.4s\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q1, [a_ptr1, #-0x10]\n"
-                        ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "cbz %[regs], 4f\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q0, [%[a_ptr0], #0x10]\n"
-                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q1, [a_ptr1, #0x10]\n"
-                        ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
-                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
-                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
-                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
-                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "5:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq c_ptr1\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
+                    "a_ptr1 .req X0\n"
+                    "c_ptr1 .req X1\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "cbz %[beta0], 1f\n"
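+                    // If %[beta0] is zero, branch to label 1, which loads the existing C tile
+                    // and scales it by beta (broadcast into v15); otherwise fall through and
+                    // start the accumulators at zero.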
+                    "movi v16.4s, #0\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "movi v17.4s, #0\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "movi v18.4s, #0\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "movi v19.4s, #0\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "movi v20.4s, #0\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "movi v21.4s, #0\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "movi v22.4s, #0\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "movi v23.4s, #0\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "b 3f\n"
+                    "1:\n"
+                    "ld1r {v15.4s}, [%[betaptr]]\n"
+                    "ldr q16, [%[c_ptr0]]\n"
+                    "ldr q17, [%[c_ptr0], #0x10]\n"
+                    "ldr q18, [%[c_ptr0], #0x20]\n"
+                    "ldr q19, [%[c_ptr0], #0x30]\n"
+                    "mul v16.4s, v16.4s, v15.4s\n"
+                    "ldr q20, [c_ptr1]\n"
+                    "mul v17.4s, v17.4s, v15.4s\n"
+                    "ldr q21, [c_ptr1, #0x10]\n"
+                    "mul v18.4s, v18.4s, v15.4s\n"
+                    "ldr q22, [c_ptr1, #0x20]\n"
+                    "mul v19.4s, v19.4s, v15.4s\n"
+                    "ldr q23, [c_ptr1, #0x30]\n"
+                    "mul v20.4s, v20.4s, v15.4s\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "mul v21.4s, v21.4s, v15.4s\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "mul v22.4s, v22.4s, v15.4s\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "mul v23.4s, v23.4s, v15.4s\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "3:\n"
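+                    // Label 3: main K loop, unrolled 2x - each iteration consumes 32 K bytes
+                    // per A row and 0x200 bytes of the packed B panel, interleaving loads
+                    // with the sdots to hide memory latency.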
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr q4, [%[a_ptr0]]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr q5, [a_ptr1]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                    ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                    ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    "add a_ptr1, a_ptr1, #0x20\n"
+                    ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                    ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr q0, [%[a_ptr0], #-0x10]\n"
+                    ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr q1, [a_ptr1, #-0x10]\n"
+                    ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                    ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                    ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                    ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                    ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                    ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                    ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+                    "b.ne 3b\n"
+                    "2:\n"
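+                    // Label 2: K epilogue - %[regs] selects between finishing two preloaded
+                    // 16-byte A chunks per row (fall-through) or just one (label 4).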
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                    "prfm PSTL1KEEP, [c_ptr1]\n"
+                    "cbz %[regs], 4f\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr q4, [%[a_ptr0]]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr q5, [a_ptr1]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                    ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                    ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                    ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                    ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+                    ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                    ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+                    ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                    ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+                    ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                    ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+                    ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+                    ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+                    ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+                    ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+                    "b 5f\n"
+                    "4:\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                    "5:\n"
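+                    // K tail, part 1: %[blocks] counts the remaining whole 4-byte groups of
+                    // K; loop 7 consumes one group per iteration with lane-0 sdots.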
+                    "cbz %[blocks], 6f\n"
+                    "7:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "subs %[blocks], %[blocks], #0x1\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr s0, [%[a_ptr0]]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr s1, [a_ptr1]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x4\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    "b.ne 7b\n"
+                    "6:\n"
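+                    // K tail, part 2: %[odds] covers the last 1-3 bytes of K, inserted byte
+                    // by byte into the low lanes of v0/v1 before a final round of sdots at
+                    // label 9.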
+                    "cbz %[odds], 8f\n"
+                    "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[0], [a_ptr1], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[1], [a_ptr1], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+                    "ld1 {v1.b}[2], [a_ptr1]\n"
+                    "9:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    "8:\n"
+                    "str q16, [%[c_ptr0]]\n"
+                    "str q17, [%[c_ptr0], #0x10]\n"
+                    "str q18, [%[c_ptr0], #0x20]\n"
+                    "str q19, [%[c_ptr0], #0x30]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                    "str q20, [c_ptr1]\n"
+                    "str q21, [c_ptr1, #0x10]\n"
+                    "str q22, [c_ptr1, #0x20]\n"
+                    "str q23, [c_ptr1, #0x30]\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq c_ptr1\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
                     );
                     break;
                 case 3:
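                     // Three-row variant: a_ptr1/a_ptr2 and c_ptr1/c_ptr2 alias X0-X3, with
                     // accumulators extended to v24-v27 for the third row.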
                     __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "c_ptr1 .req X2\n"
-                        "c_ptr2 .req X3\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "cbz %[beta0], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v24.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "movi v25.4s, #0\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "movi v26.4s, #0\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "movi v27.4s, #0\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1r {v15.4s}, [%[betaptr]]\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "mul v16.4s, v16.4s, v15.4s\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "mul v17.4s, v17.4s, v15.4s\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "mul v18.4s, v18.4s, v15.4s\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "mul v19.4s, v19.4s, v15.4s\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "mul v20.4s, v20.4s, v15.4s\n"
-                        "ldr q24, [c_ptr2]\n"
-                        "mul v21.4s, v21.4s, v15.4s\n"
-                        "ldr q25, [c_ptr2, #0x10]\n"
-                        "mul v22.4s, v22.4s, v15.4s\n"
-                        "ldr q26, [c_ptr2, #0x20]\n"
-                        "mul v23.4s, v23.4s, v15.4s\n"
-                        "ldr q27, [c_ptr2, #0x30]\n"
-                        "mul v24.4s, v24.4s, v15.4s\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mul v25.4s, v25.4s, v15.4s\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mul v26.4s, v26.4s, v15.4s\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "mul v27.4s, v27.4s, v15.4s\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr q1, [a_ptr1, #-0x10]\n"
-                        ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q2, [a_ptr2, #-0x10]\n"
-                        ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
-                        ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
-                        ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
-                        ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
-                        ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
-                        ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
-                        ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
-                        ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
-                        ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
-                        ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
-                        ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
-                        ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
-                        ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "cbz %[regs], 4f\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q0, [%[a_ptr0], #0x10]\n"
-                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr q1, [a_ptr1, #0x10]\n"
-                        ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q2, [a_ptr2, #0x10]\n"
-                        ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
-                        ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
-                        ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
-                        ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
-                        ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
-                        ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
-                        ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
-                        ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
-                        ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
-                        ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
-                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
-                        ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
-                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
-                        ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
-                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
-                        ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
-                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
-                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
-                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
-                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
-                        "5:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        "str q24, [c_ptr2]\n"
-                        "str q25, [c_ptr2, #0x10]\n"
-                        "str q26, [c_ptr2, #0x20]\n"
-                        "str q27, [c_ptr2, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
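+                    // Three-row variant: alias x0/x1 to the second and third A-row
+                    // pointers and x2/x3 to the matching C-row pointers, each formed
+                    // by stepping the base pointer by the leading dimension.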
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "c_ptr1 .req X2\n"
+                    "c_ptr2 .req X3\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
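+                    // %[beta0] is set when beta == 0: zero the 3x16 accumulator tile
+                    // (v16-v27) while preloading the first A/B vectors; otherwise take
+                    // the path at 1 and scale the existing C values by beta first.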
+                    "cbz %[beta0], 1f\n"
+                    "movi v16.4s, #0\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "movi v17.4s, #0\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "movi v18.4s, #0\n"
+                    "ldr q2, [a_ptr2]\n"
+                    "movi v19.4s, #0\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "movi v20.4s, #0\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "movi v21.4s, #0\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "movi v22.4s, #0\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "movi v23.4s, #0\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "movi v24.4s, #0\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "movi v25.4s, #0\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    "movi v26.4s, #0\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "movi v27.4s, #0\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "b 3f\n"
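+                    // 1: beta != 0 - broadcast beta once (ld1r) and multiply the
+                    // current C tile by it before any products are accumulated.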
+                    "1:\n"
+                    "ld1r {v15.4s}, [%[betaptr]]\n"
+                    "ldr q16, [%[c_ptr0]]\n"
+                    "ldr q17, [%[c_ptr0], #0x10]\n"
+                    "ldr q18, [%[c_ptr0], #0x20]\n"
+                    "ldr q19, [%[c_ptr0], #0x30]\n"
+                    "mul v16.4s, v16.4s, v15.4s\n"
+                    "ldr q20, [c_ptr1]\n"
+                    "mul v17.4s, v17.4s, v15.4s\n"
+                    "ldr q21, [c_ptr1, #0x10]\n"
+                    "mul v18.4s, v18.4s, v15.4s\n"
+                    "ldr q22, [c_ptr1, #0x20]\n"
+                    "mul v19.4s, v19.4s, v15.4s\n"
+                    "ldr q23, [c_ptr1, #0x30]\n"
+                    "mul v20.4s, v20.4s, v15.4s\n"
+                    "ldr q24, [c_ptr2]\n"
+                    "mul v21.4s, v21.4s, v15.4s\n"
+                    "ldr q25, [c_ptr2, #0x10]\n"
+                    "mul v22.4s, v22.4s, v15.4s\n"
+                    "ldr q26, [c_ptr2, #0x20]\n"
+                    "mul v23.4s, v23.4s, v15.4s\n"
+                    "ldr q27, [c_ptr2, #0x30]\n"
+                    "mul v24.4s, v24.4s, v15.4s\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "mul v25.4s, v25.4s, v15.4s\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "mul v26.4s, v26.4s, v15.4s\n"
+                    "ldr q2, [a_ptr2]\n"
+                    "mul v27.4s, v27.4s, v15.4s\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
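+                    // 3: main K loop, unrolled x2 - each pass consumes 32 bytes of A
+                    // per row and 512 bytes of B. The .word literals are raw SDOT
+                    // encodings (kept as literals so assemblers without the dot-product
+                    // extension can still build this file); each folds four signed
+                    // 8-bit products into every 32-bit accumulator lane.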
+                    "3:\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr q4, [%[a_ptr0]]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr q5, [a_ptr1]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr q6, [a_ptr2]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x20\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    "add a_ptr2, a_ptr2, #0x20\n"
+                    ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                    "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                    ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+                    ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                    ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                    ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                    ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                    ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr q0, [%[a_ptr0], #-0x10]\n"
+                    ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                    "ldr q1, [a_ptr1, #-0x10]\n"
+                    ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr q2, [a_ptr2, #-0x10]\n"
+                    ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+                    ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                    ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+                    ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                    ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+                    ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                    ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+                    ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+                    ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                    ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+                    ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                    ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+                    ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                    ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+                    ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+                    "b.ne 3b\n"
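+                    // 2: main loop done - prefetch the output rows ahead of the stores.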
+                    "2:\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                    "prfm PSTL1KEEP, [c_ptr1]\n"
+                    "prfm PSTL1KEEP, [c_ptr2]\n"
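+                    // %[regs] distinguishes a 32-byte K tail (fall through) from the
+                    // 16-byte tail handled at 4.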
+                    "cbz %[regs], 4f\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr q4, [%[a_ptr0]]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr q5, [a_ptr1]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr q6, [a_ptr2]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                    ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                    ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                    ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                    ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                    ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+                    ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                    ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+                    ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                    ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+                    ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                    ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+                    ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+                    ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+                    ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                    ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+                    ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+                    ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                    ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+                    ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+                    ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                    ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+                    ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+                    ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+                    ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+                    ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+                    ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+                    "b 5f\n"
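+                    // 4: short K tail - only the 16 bytes already held in v0-v2 remain.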
+                    "4:\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                    ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                    ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                    ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                    ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+                    "5:\n"
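+                    // Drain any remaining whole 4-byte K groups: the loop at 7 issues
+                    // one SDOT per accumulator register each pass.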
+                    "cbz %[blocks], 6f\n"
+                    "7:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "subs %[blocks], %[blocks], #0x1\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr s0, [%[a_ptr0]]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr s1, [a_ptr1]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x4\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr s2, [a_ptr2]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "add a_ptr2, a_ptr2, #0x4\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    "b.ne 7b\n"
+                    "6:\n"
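+                    // 1-3 leftover K bytes: insert them lane by lane into v0-v2, then
+                    // 9: one final SDOT block covers the partial group.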
+                    "cbz %[odds], 8f\n"
+                    "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[0], [a_ptr1], #1\n"
+                    "ld1 {v2.b}[0], [a_ptr2], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[1], [a_ptr1], #1\n"
+                    "ld1 {v2.b}[1], [a_ptr2], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+                    "ld1 {v1.b}[2], [a_ptr1]\n"
+                    "ld1 {v2.b}[2], [a_ptr2]\n"
+                    "9:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    "8:\n"
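+                    // 8: store the finished 3x16 int32 tile back to the three C rows.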
+                    "str q16, [%[c_ptr0]]\n"
+                    "str q17, [%[c_ptr0], #0x10]\n"
+                    "str q18, [%[c_ptr0], #0x20]\n"
+                    "str q19, [%[c_ptr0], #0x30]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                    "str q20, [c_ptr1]\n"
+                    "str q21, [c_ptr1, #0x10]\n"
+                    "str q22, [c_ptr1, #0x20]\n"
+                    "str q23, [c_ptr1, #0x30]\n"
+                    "str q24, [c_ptr2]\n"
+                    "str q25, [c_ptr2, #0x10]\n"
+                    "str q26, [c_ptr2, #0x20]\n"
+                    "str q27, [c_ptr2, #0x30]\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
                     );
                     break;
                 default:
                 case 4:
                     __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "c_ptr1 .req X3\n"
-                        "c_ptr2 .req X4\n"
-                        "c_ptr3 .req X5\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "cbz %[beta0], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q3, [a_ptr3]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v24.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v25.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "movi v26.4s, #0\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "movi v27.4s, #0\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "movi v28.4s, #0\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "movi v29.4s, #0\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "movi v30.4s, #0\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "movi v31.4s, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1r {v15.4s}, [%[betaptr]]\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "mul v16.4s, v16.4s, v15.4s\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "mul v17.4s, v17.4s, v15.4s\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "mul v18.4s, v18.4s, v15.4s\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "mul v19.4s, v19.4s, v15.4s\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "mul v20.4s, v20.4s, v15.4s\n"
-                        "ldr q24, [c_ptr2]\n"
-                        "mul v21.4s, v21.4s, v15.4s\n"
-                        "ldr q25, [c_ptr2, #0x10]\n"
-                        "mul v22.4s, v22.4s, v15.4s\n"
-                        "ldr q26, [c_ptr2, #0x20]\n"
-                        "mul v23.4s, v23.4s, v15.4s\n"
-                        "ldr q27, [c_ptr2, #0x30]\n"
-                        "mul v24.4s, v24.4s, v15.4s\n"
-                        "ldr q28, [c_ptr3]\n"
-                        "mul v25.4s, v25.4s, v15.4s\n"
-                        "ldr q29, [c_ptr3, #0x10]\n"
-                        "mul v26.4s, v26.4s, v15.4s\n"
-                        "ldr q30, [c_ptr3, #0x20]\n"
-                        "mul v27.4s, v27.4s, v15.4s\n"
-                        "ldr q31, [c_ptr3, #0x30]\n"
-                        "mul v28.4s, v28.4s, v15.4s\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mul v29.4s, v29.4s, v15.4s\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mul v30.4s, v30.4s, v15.4s\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "mul v31.4s, v31.4s, v15.4s\n"
-                        "ldr q3, [a_ptr3]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q7, [a_ptr3]\n"
-                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
-                        ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
-                        ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
-                        ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
-                        ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
-                        ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
-                        ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
-                        ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
-                        ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
-                        ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
-                        ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
-                        ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
-                        ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr q1, [a_ptr1, #-0x10]\n"
-                        ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr q2, [a_ptr2, #-0x10]\n"
-                        ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q3, [a_ptr3, #-0x10]\n"
-                        ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
-                        ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
-                        ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
-                        ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
-                        ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
-                        ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
-                        ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
-                        ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
-                        ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
-                        ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
-                        ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
-                        ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
-                        ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
-                        ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
-                        ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
-                        ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
-                        ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
-                        ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
-                        ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
-                        ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
-                        ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
-                        ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
-                        ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
-                        ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
-                        ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
-                        ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
-                        ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
-                        ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
-                        ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "prfm PSTL1KEEP, [c_ptr3]\n"
-                        "cbz %[regs], 4f\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
-                        "ldr q7, [a_ptr3]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
-                        ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
-                        ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
-                        ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
-                        ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
-                        ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
-                        ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
-                        ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
-                        ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
-                        ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
-                        ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
-                        ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q0, [%[a_ptr0], #0x10]\n"
-                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr q1, [a_ptr1, #0x10]\n"
-                        ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr q2, [a_ptr2, #0x10]\n"
-                        ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q3, [a_ptr3, #0x10]\n"
-                        ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
-                        ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
-                        ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
-                        ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
-                        ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
-                        ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
-                        ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
-                        ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
-                        ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
-                        ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
-                        ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
-                        ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
-                        ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
-                        ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
-                        ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
-                        ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
-                        ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
-                        ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
-                        ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
-                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
-                        ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
-                        ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
-                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
-                        ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
-                        ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
-                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
-                        ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
-                        ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
-                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
-                        ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
-                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
-                        ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
-                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
-                        ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
-                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
-                        ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
-                        "ldr q7, [a_ptr3]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
-                        ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
-                        ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
-                        ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
-                        ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
-                        ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
-                        ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
-                        ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
-                        ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
-                        ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
-                        ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
-                        ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
-                        ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
-                        "5:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        "str q24, [c_ptr2]\n"
-                        "str q25, [c_ptr2, #0x10]\n"
-                        "str q26, [c_ptr2, #0x20]\n"
-                        "str q27, [c_ptr2, #0x30]\n"
-                        "str q28, [c_ptr3]\n"
-                        "str q29, [c_ptr3, #0x10]\n"
-                        "str q30, [c_ptr3, #0x20]\n"
-                        "str q31, [c_ptr3, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "cbz %[beta0], 1f\n"
+                    "movi v16.4s, #0\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "movi v17.4s, #0\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "movi v18.4s, #0\n"
+                    "ldr q2, [a_ptr2]\n"
+                    "movi v19.4s, #0\n"
+                    "ldr q3, [a_ptr3]\n"
+                    "movi v20.4s, #0\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "movi v21.4s, #0\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "movi v22.4s, #0\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "movi v23.4s, #0\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "movi v24.4s, #0\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "movi v25.4s, #0\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "movi v26.4s, #0\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    "movi v27.4s, #0\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "movi v28.4s, #0\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "movi v29.4s, #0\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    "movi v30.4s, #0\n"
+                    "add a_ptr3, a_ptr3, #0x10\n"
+                    "movi v31.4s, #0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "b 3f\n"
+                    "1:\n"
+                    "ld1r {v15.4s}, [%[betaptr]]\n"
+                    "ldr q16, [%[c_ptr0]]\n"
+                    "ldr q17, [%[c_ptr0], #0x10]\n"
+                    "ldr q18, [%[c_ptr0], #0x20]\n"
+                    "ldr q19, [%[c_ptr0], #0x30]\n"
+                    "mul v16.4s, v16.4s, v15.4s\n"
+                    "ldr q20, [c_ptr1]\n"
+                    "mul v17.4s, v17.4s, v15.4s\n"
+                    "ldr q21, [c_ptr1, #0x10]\n"
+                    "mul v18.4s, v18.4s, v15.4s\n"
+                    "ldr q22, [c_ptr1, #0x20]\n"
+                    "mul v19.4s, v19.4s, v15.4s\n"
+                    "ldr q23, [c_ptr1, #0x30]\n"
+                    "mul v20.4s, v20.4s, v15.4s\n"
+                    "ldr q24, [c_ptr2]\n"
+                    "mul v21.4s, v21.4s, v15.4s\n"
+                    "ldr q25, [c_ptr2, #0x10]\n"
+                    "mul v22.4s, v22.4s, v15.4s\n"
+                    "ldr q26, [c_ptr2, #0x20]\n"
+                    "mul v23.4s, v23.4s, v15.4s\n"
+                    "ldr q27, [c_ptr2, #0x30]\n"
+                    "mul v24.4s, v24.4s, v15.4s\n"
+                    "ldr q28, [c_ptr3]\n"
+                    "mul v25.4s, v25.4s, v15.4s\n"
+                    "ldr q29, [c_ptr3, #0x10]\n"
+                    "mul v26.4s, v26.4s, v15.4s\n"
+                    "ldr q30, [c_ptr3, #0x20]\n"
+                    "mul v27.4s, v27.4s, v15.4s\n"
+                    "ldr q31, [c_ptr3, #0x30]\n"
+                    "mul v28.4s, v28.4s, v15.4s\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "mul v29.4s, v29.4s, v15.4s\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "mul v30.4s, v30.4s, v15.4s\n"
+                    "ldr q2, [a_ptr2]\n"
+                    "mul v31.4s, v31.4s, v15.4s\n"
+                    "ldr q3, [a_ptr3]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add a_ptr3, a_ptr3, #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "3:\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr q4, [%[a_ptr0]]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr q5, [a_ptr1]\n"
+                    ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+                    "ldr q6, [a_ptr2]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr q7, [a_ptr3]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x20\n"
+                    ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "add a_ptr2, a_ptr2, #0x20\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    "add a_ptr3, a_ptr3, #0x20\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                    ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+                    ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                    "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+                    ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+                    ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+                    ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+                    ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+                    ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                    ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+                    ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                    ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+                    ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                    ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+                    ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                    ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+                    ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+                    ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+                    ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+                    ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr q0, [%[a_ptr0], #-0x10]\n"
+                    ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                    "ldr q1, [a_ptr1, #-0x10]\n"
+                    ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+                    "ldr q2, [a_ptr2, #-0x10]\n"
+                    ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr q3, [a_ptr3, #-0x10]\n"
+                    ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+                    ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+                    ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                    ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+                    ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+                    ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                    ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+                    ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+                    ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                    ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+                    ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+                    ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+                    ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+                    ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+                    ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+                    ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+                    ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+                    ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                    ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+                    ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+                    ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                    ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+                    ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+                    ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                    ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+                    ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+                    ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+                    ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+                    ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+                    ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+                    ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
+                    "b.ne 3b\n"
+                    "2:\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                    "prfm PSTL1KEEP, [c_ptr1]\n"
+                    "prfm PSTL1KEEP, [c_ptr2]\n"
+                    "prfm PSTL1KEEP, [c_ptr3]\n"
+                    "cbz %[regs], 4f\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr q4, [%[a_ptr0]]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr q5, [a_ptr1]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr q6, [a_ptr2]\n"
+                    ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+                    "ldr q7, [a_ptr3]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    "add a_ptr3, a_ptr3, #0x10\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+                    ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+                    ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+                    ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+                    ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                    ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+                    ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                    ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+                    ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                    ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+                    ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                    ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+                    ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+                    ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+                    ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+                    ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+                    ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                    ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+                    ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+                    ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                    ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+                    ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+                    ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                    ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+                    ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+                    ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                    ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+                    ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+                    ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+                    ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+                    ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+                    ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+                    ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+                    ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+                    ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
+                    ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                    ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+                    ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+                    ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
+                    ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                    ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+                    ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+                    ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
+                    ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                    ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+                    ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+                    ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
+                    ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+                    ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
+                    ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+                    ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
+                    ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+                    ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
+                    ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+                    ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
+                    "b 5f\n"
+                    "4:\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+                    ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+                    ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+                    ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+                    ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                    ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+                    ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                    ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+                    ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                    ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+                    ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                    ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+                    ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+                    ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+                    ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+                    ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+                    ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+                    "5:\n"
+                    "cbz %[blocks], 6f\n"
+                    "7:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "subs %[blocks], %[blocks], #0x1\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr s0, [%[a_ptr0]]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr s1, [a_ptr1]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x4\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr s2, [a_ptr2]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "add a_ptr2, a_ptr2, #0x4\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr s3, [a_ptr3]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    "add a_ptr3, a_ptr3, #0x4\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+                    ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+                    "b.ne 7b\n"
+                    "6:\n"
+                    "cbz %[odds], 8f\n"
+                    "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[0], [a_ptr1], #1\n"
+                    "ld1 {v2.b}[0], [a_ptr2], #1\n"
+                    "ld1 {v3.b}[0], [a_ptr3], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[1], [a_ptr1], #1\n"
+                    "ld1 {v2.b}[1], [a_ptr2], #1\n"
+                    "ld1 {v3.b}[1], [a_ptr3], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+                    "ld1 {v1.b}[2], [a_ptr1]\n"
+                    "ld1 {v2.b}[2], [a_ptr2]\n"
+                    "ld1 {v3.b}[2], [a_ptr3]\n"
+                    "9:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+                    "8:\n"
+                    "str q16, [%[c_ptr0]]\n"
+                    "str q17, [%[c_ptr0], #0x10]\n"
+                    "str q18, [%[c_ptr0], #0x20]\n"
+                    "str q19, [%[c_ptr0], #0x30]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                    "str q20, [c_ptr1]\n"
+                    "str q21, [c_ptr1, #0x10]\n"
+                    "str q22, [c_ptr1, #0x20]\n"
+                    "str q23, [c_ptr1, #0x30]\n"
+                    "str q24, [c_ptr2]\n"
+                    "str q25, [c_ptr2, #0x10]\n"
+                    "str q26, [c_ptr2, #0x20]\n"
+                    "str q27, [c_ptr2, #0x30]\n"
+                    "str q28, [c_ptr3]\n"
+                    "str q29, [c_ptr3, #0x10]\n"
+                    "str q30, [c_ptr3, #0x20]\n"
+                    "str q31, [c_ptr3, #0x30]\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
                     );
                     break;
             }
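+            // Partial tile: copy the staged rows back from result_buffer into the real C buffer,
+            // writing only the valid columns.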
+            if (use_result_buffer) {
+                for(int cy=0; cy<std::min(M-y, 4); cy++) {
+                    for(unsigned int cx=0; cx<width; cx++) {
+                        c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
+                    }
+                }
+            }
         }
     }
 }
 
 } // namespace arm_gemm
 
-#endif // __aarch64__
+#endif // __aarch64__
\ No newline at end of file
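Both hybrid dot-product kernels touched here (the signed sdot kernel above and the a64_hybrid_u8u32_dot_16x4/a55 kernel below) gain the same tail handling: after the unrolled main loop and the register-resident chunk, the remaining K is consumed as 4-byte "blocks" (one dot product each) plus 1-3 "odds" bytes loaded lane by lane, and output tiles narrower than 16 columns are staged through a local result_buffer so the assembly can always issue full 16-column stores. The ".word 0x..." lines encode the SDOT/UDOT instructions directly, which lets these files assemble on toolchains without dot-product support. A minimal C++ sketch of the staging scheme follows, with the assembly body elided (illustrative only; the names mirror the patch, but this exact helper does not exist in the library):

    #include <algorithm>
    #include <cstdint>

    // Sketch: run one <=4 x <=16 output tile through a fixed-stride staging buffer.
    void run_tile(uint32_t *C, unsigned long ldc, int rows, unsigned int width, bool beta0) {
        const bool use_result_buffer = (width < 16); // partial tile?
        uint32_t result_buffer[64];                  // 4 rows x 16 columns
        uint32_t *c_ptr0 = C;
        if (use_result_buffer && !beta0) {
            // Preload the existing C values so the kernel's beta-scaling sees real data.
            for (int cy = 0; cy < std::min(rows, 4); cy++)
                for (unsigned int cx = 0; cx < width; cx++)
                    result_buffer[cy * 16 + cx] = C[cy * ldc + cx];
        }
        if (use_result_buffer) {
            c_ptr0 = result_buffer;                  // kernel stores full 16-wide rows here
        }
        // ... assembly kernel runs here with an effective ldc of 16 ...
        (void)c_ptr0;
        if (use_result_buffer) {
            // Copy only the valid columns back out to C.
            for (int cy = 0; cy < std::min(rows, 4); cy++)
                for (unsigned int cx = 0; cx < width; cx++)
                    C[cy * ldc + cx] = result_buffer[cy * 16 + cx];
        }
    }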
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp
index 230ecdc..487cfa0 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp
@@ -37,2235 +37,2432 @@
     const long loops_count = ((K + 16) / 32) - 1;
     K -= loops_count * 32;
     const long regs_count = (K / 16) - 1;
+    K -= (regs_count + 1) * 16;
+    const long blocks_count = K / 4;
+    const long odds_count = K - (blocks_count * 4);
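+    // K is consumed in four stages: loops x 32 bytes (double-unrolled main loop),
+    // (regs + 1) x 16 bytes, blocks x 4 bytes (one dot product each), then 1-3 odd bytes.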
 
     for (int y=0; y<M; y+=4) {
         const uint8_t * const a_ptr0_base = A + (y * lda);
         const unsigned long ldab = lda * sizeof(uint8_t);
 
         uint32_t *c_ptr0 = C + (y * ldc);
-        const unsigned long ldcb = ldc * sizeof(uint32_t);
 
         for (int x0=0; x0<N; x0+=16ul) {
             const long width = std::min((unsigned long)N-x0, 16ul);
             const uint32_t *betaptr = &beta;
             long loops = loops_count;
             long regs = regs_count;
+            long blocks = blocks_count;
+            long odds = odds_count;
             const uint8_t *a_ptr0 = a_ptr0_base;
             const uint8_t *b_ptr0 = B + (K_stride * x0);
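+            // Tiles narrower than 16 columns are computed into a local staging buffer with a
+            // fixed stride of 16, so the assembly can always issue full-width loads and stores.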
+            const bool use_result_buffer = (width < 16);
+            uint32_t result_buffer[64];
+            const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(uint32_t);
+            uint32_t *c_ptr_real = c_ptr0;
+            if (use_result_buffer && !beta0) {
+                for(int cy=0; cy<std::min(M-y, 4); cy++) {
+                    for(unsigned int cx=0; cx<width; cx++) {
+                        result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
+                    }
+                }
+            }
+            if (use_result_buffer) {
+                c_ptr0 = result_buffer;
+            }
 
             switch(M-y) {
                 case 1:
                     __asm __volatile (
-                        "temploadreg0 .req X0\n"
-                        "temploadreg1 .req X1\n"
-                        "temploadreg2 .req X2\n"
-                        "temploadreg3 .req X3\n"
-                        "cbz %[beta0], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1r {v15.4s}, [%[betaptr]]\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "mul v16.4s, v16.4s, v15.4s\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mul v17.4s, v17.4s, v15.4s\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "mul v18.4s, v18.4s, v15.4s\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "mul v19.4s, v19.4s, v15.4s\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "ldr d0, [%[a_ptr0], #-0x10]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "cbz %[regs], 4f\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "ldr d0, [%[a_ptr0], #0x10]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "5:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        ".unreq temploadreg0\n"
-                        ".unreq temploadreg1\n"
-                        ".unreq temploadreg2\n"
-                        ".unreq temploadreg3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
+                    "temploadreg0 .req X0\n"
+                    "temploadreg1 .req X1\n"
+                    "temploadreg2 .req X2\n"
+                    "temploadreg3 .req X3\n"
+                    "cbz %[beta0], 1f\n"
+                    "movi v16.4s, #0\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "movi v17.4s, #0\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "movi v18.4s, #0\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "movi v19.4s, #0\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "b 3f\n"
+                    "1:\n"
+                    "ld1r {v15.4s}, [%[betaptr]]\n"
+                    "ldr q16, [%[c_ptr0]]\n"
+                    "ldr q17, [%[c_ptr0], #0x10]\n"
+                    "ldr q18, [%[c_ptr0], #0x20]\n"
+                    "ldr q19, [%[c_ptr0], #0x30]\n"
+                    "mul v16.4s, v16.4s, v15.4s\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "mul v17.4s, v17.4s, v15.4s\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "mul v18.4s, v18.4s, v15.4s\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "mul v19.4s, v19.4s, v15.4s\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "3:\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr d4, [%[a_ptr0]]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ins v4.d[1], temploadreg0\n"
+                    "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    "ldr d0, [%[a_ptr0], #-0x10]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ins v0.d[1], temploadreg0\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "b.ne 3b\n"
+                    "2:\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    "cbz %[regs], 4f\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr d4, [%[a_ptr0]]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ins v4.d[1], temploadreg0\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                    "b 5f\n"
+                    "4:\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    "5:\n"
+                    "cbz %[blocks], 6f\n"
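+                    // 7: per-block tail loop; each pass consumes one 4-byte column of A
+                    // (ldr s0) against 0x40 bytes of B, decrementing %[blocks].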
+                    "7:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "subs %[blocks], %[blocks], #0x1\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr s0, [%[a_ptr0]]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "b.ne 7b\n"
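+                    // 6: final 1-3 K bytes (%[odds]) are loaded into v0 one byte at a
+                    // time, then multiplied in with a last round of udots.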
+                    "6:\n"
+                    "cbz %[odds], 8f\n"
+                    "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+                    "9:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "8:\n"
+                    "str q16, [%[c_ptr0]]\n"
+                    "str q17, [%[c_ptr0], #0x10]\n"
+                    "str q18, [%[c_ptr0], #0x20]\n"
+                    "str q19, [%[c_ptr0], #0x30]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x40\n"
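+                    // Release the scratch-register aliases so the other row-count
+                    // cases can re-declare them.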
+                    ".unreq temploadreg0\n"
+                    ".unreq temploadreg1\n"
+                    ".unreq temploadreg2\n"
+                    ".unreq temploadreg3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
                     );
                     break;
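+                // Two output rows per pass: a_ptr1/c_ptr1 track the second row and
+                // v20-v23 hold its accumulators.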
                 case 2:
                     __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "c_ptr1 .req X1\n"
-                        "temploadreg0 .req X2\n"
-                        "temploadreg1 .req X3\n"
-                        "temploadreg2 .req X4\n"
-                        "temploadreg3 .req X5\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "cbz %[beta0], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1r {v15.4s}, [%[betaptr]]\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "mul v16.4s, v16.4s, v15.4s\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "mul v17.4s, v17.4s, v15.4s\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "mul v18.4s, v18.4s, v15.4s\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "mul v19.4s, v19.4s, v15.4s\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "mul v20.4s, v20.4s, v15.4s\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mul v21.4s, v21.4s, v15.4s\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mul v22.4s, v22.4s, v15.4s\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "mul v23.4s, v23.4s, v15.4s\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ldr d0, [%[a_ptr0], #-0x10]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
-                        ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ldr d1, [a_ptr1, #-0x10]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr temploadreg1, [a_ptr1, #-0x8]\n"
-                        ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "ins v1.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "cbz %[regs], 4f\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ldr d0, [%[a_ptr0], #0x10]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
-                        ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ldr d1, [a_ptr1, #0x10]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x18]\n"
-                        ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "ins v1.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "5:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq temploadreg0\n"
-                        ".unreq temploadreg1\n"
-                        ".unreq temploadreg2\n"
-                        ".unreq temploadreg3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+                    "a_ptr1 .req X0\n"
+                    "c_ptr1 .req X1\n"
+                    "temploadreg0 .req X2\n"
+                    "temploadreg1 .req X3\n"
+                    "temploadreg2 .req X4\n"
+                    "temploadreg3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "cbz %[beta0], 1f\n"
+                    "movi v16.4s, #0\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "movi v17.4s, #0\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "movi v18.4s, #0\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "movi v19.4s, #0\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "movi v20.4s, #0\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "movi v21.4s, #0\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "movi v22.4s, #0\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "movi v23.4s, #0\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "b 3f\n"
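+                    // 1: beta path (reached when %[beta0] is zero): seed the
+                    // accumulators with the existing C values scaled by beta.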
+                    "1:\n"
+                    "ld1r {v15.4s}, [%[betaptr]]\n"
+                    "ldr q16, [%[c_ptr0]]\n"
+                    "ldr q17, [%[c_ptr0], #0x10]\n"
+                    "ldr q18, [%[c_ptr0], #0x20]\n"
+                    "ldr q19, [%[c_ptr0], #0x30]\n"
+                    "mul v16.4s, v16.4s, v15.4s\n"
+                    "ldr q20, [c_ptr1]\n"
+                    "mul v17.4s, v17.4s, v15.4s\n"
+                    "ldr q21, [c_ptr1, #0x10]\n"
+                    "mul v18.4s, v18.4s, v15.4s\n"
+                    "ldr q22, [c_ptr1, #0x20]\n"
+                    "mul v19.4s, v19.4s, v15.4s\n"
+                    "ldr q23, [c_ptr1, #0x30]\n"
+                    "mul v20.4s, v20.4s, v15.4s\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "mul v21.4s, v21.4s, v15.4s\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "mul v22.4s, v22.4s, v15.4s\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "mul v23.4s, v23.4s, v15.4s\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
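+                    // 3: main K loop; each pass consumes 32 bytes of both A rows
+                    // (v0/v1, then v4/v5) against the packed B panel.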
+                    "3:\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr d4, [%[a_ptr0]]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ldr d5, [a_ptr1]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ins v4.d[1], temploadreg0\n"
+                    ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ins v5.d[1], temploadreg1\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    "add a_ptr1, a_ptr1, #0x20\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+                    "ldr d0, [%[a_ptr0], #-0x10]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+                    ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+                    "ldr d1, [a_ptr1, #-0x10]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+                    ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    "ins v0.d[1], temploadreg0\n"
+                    ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    "ins v1.d[1], temploadreg1\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "b.ne 3b\n"
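+                    // 2: epilogue; prefetch the C destinations and finish the
+                    // outstanding d-load/ins pairs for v14/v15.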
+                    "2:\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    "prfm PSTL1KEEP, [c_ptr1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    "cbz %[regs], 4f\n"
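+                    // %[regs] set: compute one more pair of K blocks before storing;
+                    // otherwise branch to 4 for the single-block tail.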
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr d4, [%[a_ptr0]]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr d5, [a_ptr1]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ins v4.d[1], temploadreg0\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ins v5.d[1], temploadreg1\n"
+                    ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+                    ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+                    ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+                    ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+                    "b 5f\n"
+                    "4:\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                    "5:\n"
+                    "cbz %[blocks], 6f\n"
+                    "7:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "subs %[blocks], %[blocks], #0x1\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr s0, [%[a_ptr0]]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr s1, [a_ptr1]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x4\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    "b.ne 7b\n"
+                    "6:\n"
+                    "cbz %[odds], 8f\n"
+                    "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[0], [a_ptr1], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[1], [a_ptr1], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+                    "ld1 {v1.b}[2], [a_ptr1]\n"
+                    "9:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    "8:\n"
+                    "str q16, [%[c_ptr0]]\n"
+                    "str q17, [%[c_ptr0], #0x10]\n"
+                    "str q18, [%[c_ptr0], #0x20]\n"
+                    "str q19, [%[c_ptr0], #0x30]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                    "str q20, [c_ptr1]\n"
+                    "str q21, [c_ptr1, #0x10]\n"
+                    "str q22, [c_ptr1, #0x20]\n"
+                    "str q23, [c_ptr1, #0x30]\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq temploadreg0\n"
+                    ".unreq temploadreg1\n"
+                    ".unreq temploadreg2\n"
+                    ".unreq temploadreg3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
                     );
                     break;
                 case 3:
                     __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "c_ptr1 .req X2\n"
-                        "c_ptr2 .req X3\n"
-                        "temploadreg0 .req X4\n"
-                        "temploadreg1 .req X5\n"
-                        "temploadreg2 .req X6\n"
-                        "temploadreg3 .req X7\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "cbz %[beta0], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v24.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "movi v25.4s, #0\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "movi v26.4s, #0\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "movi v27.4s, #0\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1r {v15.4s}, [%[betaptr]]\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "mul v16.4s, v16.4s, v15.4s\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "mul v17.4s, v17.4s, v15.4s\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "mul v18.4s, v18.4s, v15.4s\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "mul v19.4s, v19.4s, v15.4s\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "mul v20.4s, v20.4s, v15.4s\n"
-                        "ldr q24, [c_ptr2]\n"
-                        "mul v21.4s, v21.4s, v15.4s\n"
-                        "ldr q25, [c_ptr2, #0x10]\n"
-                        "mul v22.4s, v22.4s, v15.4s\n"
-                        "ldr q26, [c_ptr2, #0x20]\n"
-                        "mul v23.4s, v23.4s, v15.4s\n"
-                        "ldr q27, [c_ptr2, #0x30]\n"
-                        "mul v24.4s, v24.4s, v15.4s\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mul v25.4s, v25.4s, v15.4s\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mul v26.4s, v26.4s, v15.4s\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "mul v27.4s, v27.4s, v15.4s\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d6, [a_ptr2]\n"
-                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
-                        ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ins v6.d[1], temploadreg2\n"
-                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ldr d0, [%[a_ptr0], #-0x10]\n"
-                        ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
-                        ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ldr d1, [a_ptr1, #-0x10]\n"
-                        ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr temploadreg1, [a_ptr1, #-0x8]\n"
-                        ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ins v1.d[1], temploadreg1\n"
-                        ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        "ldr d2, [a_ptr2, #-0x10]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg2, [a_ptr2, #-0x8]\n"
-                        ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ins v2.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "cbz %[regs], 4f\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr d6, [a_ptr2]\n"
-                        ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ins v6.d[1], temploadreg2\n"
-                        ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr d0, [%[a_ptr0], #0x10]\n"
-                        ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
-                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr d1, [a_ptr1, #0x10]\n"
-                        ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x18]\n"
-                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr d2, [a_ptr2, #0x10]\n"
-                        ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x18]\n"
-                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ins v1.d[1], temploadreg1\n"
-                        ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ins v2.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
-                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
-                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
-                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
-                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr d6, [a_ptr2]\n"
-                        ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ins v6.d[1], temploadreg2\n"
-                        ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
-                        "5:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        "str q24, [c_ptr2]\n"
-                        "str q25, [c_ptr2, #0x10]\n"
-                        "str q26, [c_ptr2, #0x20]\n"
-                        "str q27, [c_ptr2, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq temploadreg0\n"
-                        ".unreq temploadreg1\n"
-                        ".unreq temploadreg2\n"
-                        ".unreq temploadreg3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "c_ptr1 .req X2\n"
+                    "c_ptr2 .req X3\n"
+                    "temploadreg0 .req X4\n"
+                    "temploadreg1 .req X5\n"
+                    "temploadreg2 .req X6\n"
+                    "temploadreg3 .req X7\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "cbz %[beta0], 1f\n"
+                    "movi v16.4s, #0\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "movi v17.4s, #0\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "movi v18.4s, #0\n"
+                    "ldr q2, [a_ptr2]\n"
+                    "movi v19.4s, #0\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "movi v20.4s, #0\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "movi v21.4s, #0\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "movi v22.4s, #0\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "movi v23.4s, #0\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "movi v24.4s, #0\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "movi v25.4s, #0\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "movi v26.4s, #0\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "movi v27.4s, #0\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "b 3f\n"
+                    "1:\n"
+                    "ld1r {v15.4s}, [%[betaptr]]\n"
+                    "ldr q16, [%[c_ptr0]]\n"
+                    "ldr q17, [%[c_ptr0], #0x10]\n"
+                    "ldr q18, [%[c_ptr0], #0x20]\n"
+                    "ldr q19, [%[c_ptr0], #0x30]\n"
+                    "mul v16.4s, v16.4s, v15.4s\n"
+                    "ldr q20, [c_ptr1]\n"
+                    "mul v17.4s, v17.4s, v15.4s\n"
+                    "ldr q21, [c_ptr1, #0x10]\n"
+                    "mul v18.4s, v18.4s, v15.4s\n"
+                    "ldr q22, [c_ptr1, #0x20]\n"
+                    "mul v19.4s, v19.4s, v15.4s\n"
+                    "ldr q23, [c_ptr1, #0x30]\n"
+                    "mul v20.4s, v20.4s, v15.4s\n"
+                    "ldr q24, [c_ptr2]\n"
+                    "mul v21.4s, v21.4s, v15.4s\n"
+                    "ldr q25, [c_ptr2, #0x10]\n"
+                    "mul v22.4s, v22.4s, v15.4s\n"
+                    "ldr q26, [c_ptr2, #0x20]\n"
+                    "mul v23.4s, v23.4s, v15.4s\n"
+                    "ldr q27, [c_ptr2, #0x30]\n"
+                    "mul v24.4s, v24.4s, v15.4s\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "mul v25.4s, v25.4s, v15.4s\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "mul v26.4s, v26.4s, v15.4s\n"
+                    "ldr q2, [a_ptr2]\n"
+                    "mul v27.4s, v27.4s, v15.4s\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "cbz %[loops], 2f\n"
+                    "3:\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr d4, [%[a_ptr0]]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr d5, [a_ptr1]\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr d6, [a_ptr2]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ldr temploadreg2, [a_ptr2, #0x8]\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ins v4.d[1], temploadreg0\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ins v5.d[1], temploadreg1\n"
+                    ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ins v6.d[1], temploadreg2\n"
+                    ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+                    "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                    ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    "add a_ptr1, a_ptr1, #0x20\n"
+                    ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+                    "ldr d0, [%[a_ptr0], #-0x10]\n"
+                    ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+                    ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+                    "ldr d1, [a_ptr1, #-0x10]\n"
+                    ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+                    ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ins v0.d[1], temploadreg0\n"
+                    ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                    "ins v1.d[1], temploadreg1\n"
+                    ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    "add a_ptr2, a_ptr2, #0x20\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                    "ldr d2, [a_ptr2, #-0x10]\n"
+                    "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+                    ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                    "ldr temploadreg2, [a_ptr2, #-0x8]\n"
+                    ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    "ins v2.d[1], temploadreg2\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "b.ne 3b\n"
+                    "2:\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    "prfm PSTL1KEEP, [c_ptr1]\n"
+                    "prfm PSTL1KEEP, [c_ptr2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    "cbz %[regs], 4f\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr d4, [%[a_ptr0]]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr d5, [a_ptr1]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr d6, [a_ptr2]\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    "ldr temploadreg2, [a_ptr2, #0x8]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ins v4.d[1], temploadreg0\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ins v5.d[1], temploadreg1\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ins v6.d[1], temploadreg2\n"
+                    ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+                    ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+                    ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+                    ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+                    "b 5f\n"
+                    "4:\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                    ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                    ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                    ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+                    "5:\n"
+                    "cbz %[blocks], 6f\n"
+                    "7:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "subs %[blocks], %[blocks], #0x1\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr s0, [%[a_ptr0]]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr s1, [a_ptr1]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x4\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr s2, [a_ptr2]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "add a_ptr2, a_ptr2, #0x4\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    "b.ne 7b\n"
+                    "6:\n"
+                    "cbz %[odds], 8f\n"
+                    "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[0], [a_ptr1], #1\n"
+                    "ld1 {v2.b}[0], [a_ptr2], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[1], [a_ptr1], #1\n"
+                    "ld1 {v2.b}[1], [a_ptr2], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+                    "ld1 {v1.b}[2], [a_ptr1]\n"
+                    "ld1 {v2.b}[2], [a_ptr2]\n"
+                    "9:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    "8:\n"
+                    "str q16, [%[c_ptr0]]\n"
+                    "str q17, [%[c_ptr0], #0x10]\n"
+                    "str q18, [%[c_ptr0], #0x20]\n"
+                    "str q19, [%[c_ptr0], #0x30]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                    "str q20, [c_ptr1]\n"
+                    "str q21, [c_ptr1, #0x10]\n"
+                    "str q22, [c_ptr1, #0x20]\n"
+                    "str q23, [c_ptr1, #0x30]\n"
+                    "str q24, [c_ptr2]\n"
+                    "str q25, [c_ptr2, #0x10]\n"
+                    "str q26, [c_ptr2, #0x20]\n"
+                    "str q27, [c_ptr2, #0x30]\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq temploadreg0\n"
+                    ".unreq temploadreg1\n"
+                    ".unreq temploadreg2\n"
+                    ".unreq temploadreg3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
                     );
                     break;
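                 /* Editor's note (annotation only; not part of the upstream diff):
                  * the ".word 0x6f8.e1.. // udot ..." lines above are hand-encoded
                  * UDOT instructions, emitted as raw instruction words so the kernel
                  * still assembles on toolchains without ARMv8.2-A dot-product
                  * support; the trailing comment on each line gives the intended
                  * mnemonic. Each UDOT accumulates a 4-way uint8 dot product into a
                  * uint32 lane, so one B register covers 4 output columns and the
                  * four B registers per step cover a 16-column tile. A minimal
                  * scalar sketch of the per-tile arithmetic, with hypothetical
                  * names (A is row-major, B is pre-interleaved depth-by-column):
                  *
                  *   // acc: rows x 16 uint32_t accumulators (v16..v27 for 3 rows)
                  *   for (int k = 0; k < K; k += 4)           // depth in blocks of 4
                  *       for (int r = 0; r < rows; ++r)       // A rows (3 in this case)
                  *           for (int c = 0; c < 16; ++c)     // columns in the tile
                  *               for (int b = 0; b < 4; ++b)  // the 4-way dot product
                  *                   acc[r][c] += uint32_t(A[r][k + b])
                  *                              * uint32_t(B[k + b][c]);
                  *
                  * The temploadregX general registers split each 128-bit B load into
                  * a 64-bit "ldr d" plus a scalar "ldr"/"ins v.d[1]" pair, a common
                  * scheduling trick for in-order cores where paired 64-bit loads
                  * dual-issue better than a single "ldr q".
                  */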
                 default:
                 case 4:
                     __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "c_ptr1 .req X3\n"
-                        "c_ptr2 .req X4\n"
-                        "c_ptr3 .req X5\n"
-                        "temploadreg0 .req X6\n"
-                        "temploadreg1 .req X7\n"
-                        "temploadreg2 .req X8\n"
-                        "temploadreg3 .req X9\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "cbz %[beta0], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q3, [a_ptr3]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v24.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v25.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "movi v26.4s, #0\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "movi v27.4s, #0\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "movi v28.4s, #0\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "movi v29.4s, #0\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "movi v30.4s, #0\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "movi v31.4s, #0\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1r {v15.4s}, [%[betaptr]]\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "mul v16.4s, v16.4s, v15.4s\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "mul v17.4s, v17.4s, v15.4s\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "mul v18.4s, v18.4s, v15.4s\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "mul v19.4s, v19.4s, v15.4s\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "mul v20.4s, v20.4s, v15.4s\n"
-                        "ldr q24, [c_ptr2]\n"
-                        "mul v21.4s, v21.4s, v15.4s\n"
-                        "ldr q25, [c_ptr2, #0x10]\n"
-                        "mul v22.4s, v22.4s, v15.4s\n"
-                        "ldr q26, [c_ptr2, #0x20]\n"
-                        "mul v23.4s, v23.4s, v15.4s\n"
-                        "ldr q27, [c_ptr2, #0x30]\n"
-                        "mul v24.4s, v24.4s, v15.4s\n"
-                        "ldr q28, [c_ptr3]\n"
-                        "mul v25.4s, v25.4s, v15.4s\n"
-                        "ldr q29, [c_ptr3, #0x10]\n"
-                        "mul v26.4s, v26.4s, v15.4s\n"
-                        "ldr q30, [c_ptr3, #0x20]\n"
-                        "mul v27.4s, v27.4s, v15.4s\n"
-                        "ldr q31, [c_ptr3, #0x30]\n"
-                        "mul v28.4s, v28.4s, v15.4s\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mul v29.4s, v29.4s, v15.4s\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mul v30.4s, v30.4s, v15.4s\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "mul v31.4s, v31.4s, v15.4s\n"
-                        "ldr q3, [a_ptr3]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr d6, [a_ptr2]\n"
-                        ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d7, [a_ptr3]\n"
-                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr temploadreg3, [a_ptr3, #0x8]\n"
-                        ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ins v6.d[1], temploadreg2\n"
-                        ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ins v7.d[1], temploadreg3\n"
-                        ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
-                        "ldr d0, [%[a_ptr0], #-0x10]\n"
-                        ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
-                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
-                        "ldr d1, [a_ptr1, #-0x10]\n"
-                        ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #-0x8]\n"
-                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
-                        "ins v1.d[1], temploadreg1\n"
-                        ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
-                        "ldr d2, [a_ptr2, #-0x10]\n"
-                        ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
-                        "ldr temploadreg2, [a_ptr2, #-0x8]\n"
-                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
-                        "ins v2.d[1], temploadreg2\n"
-                        ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
-                        "ldr d3, [a_ptr3, #-0x10]\n"
-                        ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
-                        "ldr temploadreg3, [a_ptr3, #-0x8]\n"
-                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
-                        "ins v3.d[1], temploadreg3\n"
-                        ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "prfm PSTL1KEEP, [c_ptr3]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "cbz %[regs], 4f\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr d6, [a_ptr2]\n"
-                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
-                        ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr d7, [a_ptr3]\n"
-                        ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
-                        "ldr temploadreg3, [a_ptr3, #0x8]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
-                        "ins v6.d[1], temploadreg2\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ins v7.d[1], temploadreg3\n"
-                        ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
-                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr d0, [%[a_ptr0], #0x10]\n"
-                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr d1, [a_ptr1, #0x10]\n"
-                        ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x18]\n"
-                        ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr d2, [a_ptr2, #0x10]\n"
-                        ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x18]\n"
-                        ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
-                        "ldr d3, [a_ptr3, #0x10]\n"
-                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr temploadreg3, [a_ptr3, #0x18]\n"
-                        ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
-                        "ins v1.d[1], temploadreg1\n"
-                        ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ins v2.d[1], temploadreg2\n"
-                        ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ins v3.d[1], temploadreg3\n"
-                        ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
-                        ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
-                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
-                        ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
-                        ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
-                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
-                        ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
-                        ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
-                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
-                        ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
-                        ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
-                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
-                        ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
-                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
-                        ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
-                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
-                        ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
-                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
-                        ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr d6, [a_ptr2]\n"
-                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
-                        ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr d7, [a_ptr3]\n"
-                        ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
-                        "ldr temploadreg3, [a_ptr3, #0x8]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
-                        "ins v6.d[1], temploadreg2\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ins v7.d[1], temploadreg3\n"
-                        ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
-                        ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
-                        ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
-                        ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
-                        ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
-                        ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
-                        ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
-                        ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
-                        "5:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        "str q24, [c_ptr2]\n"
-                        "str q25, [c_ptr2, #0x10]\n"
-                        "str q26, [c_ptr2, #0x20]\n"
-                        "str q27, [c_ptr2, #0x30]\n"
-                        "str q28, [c_ptr3]\n"
-                        "str q29, [c_ptr3, #0x10]\n"
-                        "str q30, [c_ptr3, #0x20]\n"
-                        "str q31, [c_ptr3, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        ".unreq temploadreg0\n"
-                        ".unreq temploadreg1\n"
-                        ".unreq temploadreg2\n"
-                        ".unreq temploadreg3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
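The hunk below re-indents the same inline-assembly kernel body. Two idioms recur throughout it: the dot-product instructions are emitted as raw `.word` encodings (e.g. `0x6f80e110` is `udot v16.4s, v8.16b, v0.4b[0]`) so the file still assembles on toolchains that lack ARMv8.2 `+dotprod` support, and 128-bit operands are built from a 64-bit `ldr d` / `ldr x` pair recombined with `ins v<n>.d[1]`, a pattern aimed at cores where two 64-bit loads issue more cheaply than one 128-bit `ldr q`. A minimal, self-contained sketch of the raw-encoding trick follows; the helper name is hypothetical, it uses the vector rather than the by-element form of the instruction, and the encoding `0x6e819402` (`udot v2.4s, v0.16b, v1.16b`) is derived from the same ARMv8.2 dot-product encoding scheme as the words above, not taken from the patch itself:

    #include <cstdint>

    // Per 32-bit lane i (0..3): acc[i] += sum over j<4 of a[4*i+j] * b[4*i+j].
    static inline void udot_u8(uint32_t acc[4], const uint8_t a[16], const uint8_t b[16])
    {
        __asm__ volatile(
            "ldr q0, [%[a]]\n"
            "ldr q1, [%[b]]\n"
            "ldr q2, [%[acc]]\n"
            // udot v2.4s, v0.16b, v1.16b, spelled as a raw word so that
            // assemblers without dot-product support still accept the file.
            ".word 0x6e819402\n"
            "str q2, [%[acc]]\n"
            :
            : [acc] "r"(acc), [a] "r"(a), [b] "r"(b)
            : "v0", "v1", "v2", "memory");
    }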
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "temploadreg0 .req X6\n"
+                    "temploadreg1 .req X7\n"
+                    "temploadreg2 .req X8\n"
+                    "temploadreg3 .req X9\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "cbz %[beta0], 1f\n"
+                    "movi v16.4s, #0\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "movi v17.4s, #0\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "movi v18.4s, #0\n"
+                    "ldr q2, [a_ptr2]\n"
+                    "movi v19.4s, #0\n"
+                    "ldr q3, [a_ptr3]\n"
+                    "movi v20.4s, #0\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "movi v21.4s, #0\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "movi v22.4s, #0\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "movi v23.4s, #0\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "movi v24.4s, #0\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "movi v25.4s, #0\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "movi v26.4s, #0\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "movi v27.4s, #0\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "movi v28.4s, #0\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "movi v29.4s, #0\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "movi v30.4s, #0\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "movi v31.4s, #0\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    "add a_ptr3, a_ptr3, #0x10\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "b 3f\n"
+                    "1:\n"
+                    "ld1r {v15.4s}, [%[betaptr]]\n"
+                    "ldr q16, [%[c_ptr0]]\n"
+                    "ldr q17, [%[c_ptr0], #0x10]\n"
+                    "ldr q18, [%[c_ptr0], #0x20]\n"
+                    "ldr q19, [%[c_ptr0], #0x30]\n"
+                    "mul v16.4s, v16.4s, v15.4s\n"
+                    "ldr q20, [c_ptr1]\n"
+                    "mul v17.4s, v17.4s, v15.4s\n"
+                    "ldr q21, [c_ptr1, #0x10]\n"
+                    "mul v18.4s, v18.4s, v15.4s\n"
+                    "ldr q22, [c_ptr1, #0x20]\n"
+                    "mul v19.4s, v19.4s, v15.4s\n"
+                    "ldr q23, [c_ptr1, #0x30]\n"
+                    "mul v20.4s, v20.4s, v15.4s\n"
+                    "ldr q24, [c_ptr2]\n"
+                    "mul v21.4s, v21.4s, v15.4s\n"
+                    "ldr q25, [c_ptr2, #0x10]\n"
+                    "mul v22.4s, v22.4s, v15.4s\n"
+                    "ldr q26, [c_ptr2, #0x20]\n"
+                    "mul v23.4s, v23.4s, v15.4s\n"
+                    "ldr q27, [c_ptr2, #0x30]\n"
+                    "mul v24.4s, v24.4s, v15.4s\n"
+                    "ldr q28, [c_ptr3]\n"
+                    "mul v25.4s, v25.4s, v15.4s\n"
+                    "ldr q29, [c_ptr3, #0x10]\n"
+                    "mul v26.4s, v26.4s, v15.4s\n"
+                    "ldr q30, [c_ptr3, #0x20]\n"
+                    "mul v27.4s, v27.4s, v15.4s\n"
+                    "ldr q31, [c_ptr3, #0x30]\n"
+                    "mul v28.4s, v28.4s, v15.4s\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "mul v29.4s, v29.4s, v15.4s\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "mul v30.4s, v30.4s, v15.4s\n"
+                    "ldr q2, [a_ptr2]\n"
+                    "mul v31.4s, v31.4s, v15.4s\n"
+                    "ldr q3, [a_ptr3]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add a_ptr3, a_ptr3, #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "cbz %[loops], 2f\n"
+                    "3:\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr d4, [%[a_ptr0]]\n"
+                    ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr d5, [a_ptr1]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    "ldr d6, [a_ptr2]\n"
+                    ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+                    "ldr temploadreg2, [a_ptr2, #0x8]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr d7, [a_ptr3]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    "ldr temploadreg3, [a_ptr3, #0x8]\n"
+                    ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ins v4.d[1], temploadreg0\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+                    "ins v5.d[1], temploadreg1\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+                    "ins v6.d[1], temploadreg2\n"
+                    ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ins v7.d[1], temploadreg3\n"
+                    ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                    ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                    ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+                    "ldr d0, [%[a_ptr0], #-0x10]\n"
+                    ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+                    ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+                    "ins v0.d[1], temploadreg0\n"
+                    ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x20\n"
+                    ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+                    "ldr d1, [a_ptr1, #-0x10]\n"
+                    ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
+                    "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+                    ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+                    "ins v1.d[1], temploadreg1\n"
+                    ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+                    "add a_ptr2, a_ptr2, #0x20\n"
+                    ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+                    "ldr d2, [a_ptr2, #-0x10]\n"
+                    ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
+                    "ldr temploadreg2, [a_ptr2, #-0x8]\n"
+                    ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+                    "ins v2.d[1], temploadreg2\n"
+                    ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+                    "add a_ptr3, a_ptr3, #0x20\n"
+                    ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+                    "ldr d3, [a_ptr3, #-0x10]\n"
+                    ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
+                    "ldr temploadreg3, [a_ptr3, #-0x8]\n"
+                    ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+                    "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                    ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+                    "ins v3.d[1], temploadreg3\n"
+                    ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                    "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+                    ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "b.ne 3b\n"
+                    "2:\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    "prfm PSTL1KEEP, [c_ptr1]\n"
+                    "prfm PSTL1KEEP, [c_ptr2]\n"
+                    "prfm PSTL1KEEP, [c_ptr3]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    "cbz %[regs], 4f\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr d4, [%[a_ptr0]]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr d5, [a_ptr1]\n"
+                    ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+                    "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr d6, [a_ptr2]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr temploadreg2, [a_ptr2, #0x8]\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    "ldr d7, [a_ptr3]\n"
+                    ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+                    "ldr temploadreg3, [a_ptr3, #0x8]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ins v4.d[1], temploadreg0\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ins v5.d[1], temploadreg1\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+                    "ins v6.d[1], temploadreg2\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+                    "ins v7.d[1], temploadreg3\n"
+                    ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+                    "add a_ptr3, a_ptr3, #0x10\n"
+                    ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+                    ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+                    ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+                    ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
+                    ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                    ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+                    ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+                    ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
+                    ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                    ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+                    ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+                    ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
+                    ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                    ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+                    ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+                    ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
+                    ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+                    ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
+                    ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+                    ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
+                    ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+                    ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
+                    ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+                    ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
+                    "b 5f\n"
+                    "4:\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+                    ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                    ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+                    ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                    ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+                    ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                    ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+                    ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+                    ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+                    ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+                    ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+                    ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+                    "5:\n"
+                    "cbz %[blocks], 6f\n"
+                    "7:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "subs %[blocks], %[blocks], #0x1\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr s0, [%[a_ptr0]]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr s1, [a_ptr1]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x4\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr s2, [a_ptr2]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "add a_ptr2, a_ptr2, #0x4\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr s3, [a_ptr3]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    "add a_ptr3, a_ptr3, #0x4\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+                    ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+                    "b.ne 7b\n"
+                    "6:\n"
+                    "cbz %[odds], 8f\n"
+                    "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[0], [a_ptr1], #1\n"
+                    "ld1 {v2.b}[0], [a_ptr2], #1\n"
+                    "ld1 {v3.b}[0], [a_ptr3], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[1], [a_ptr1], #1\n"
+                    "ld1 {v2.b}[1], [a_ptr2], #1\n"
+                    "ld1 {v3.b}[1], [a_ptr3], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+                    "ld1 {v1.b}[2], [a_ptr1]\n"
+                    "ld1 {v2.b}[2], [a_ptr2]\n"
+                    "ld1 {v3.b}[2], [a_ptr3]\n"
+                    "9:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+                    "8:\n"
+                    "str q16, [%[c_ptr0]]\n"
+                    "str q17, [%[c_ptr0], #0x10]\n"
+                    "str q18, [%[c_ptr0], #0x20]\n"
+                    "str q19, [%[c_ptr0], #0x30]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                    "str q20, [c_ptr1]\n"
+                    "str q21, [c_ptr1, #0x10]\n"
+                    "str q22, [c_ptr1, #0x20]\n"
+                    "str q23, [c_ptr1, #0x30]\n"
+                    "str q24, [c_ptr2]\n"
+                    "str q25, [c_ptr2, #0x10]\n"
+                    "str q26, [c_ptr2, #0x20]\n"
+                    "str q27, [c_ptr2, #0x30]\n"
+                    "str q28, [c_ptr3]\n"
+                    "str q29, [c_ptr3, #0x10]\n"
+                    "str q30, [c_ptr3, #0x20]\n"
+                    "str q31, [c_ptr3, #0x30]\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    ".unreq temploadreg0\n"
+                    ".unreq temploadreg1\n"
+                    ".unreq temploadreg2\n"
+                    ".unreq temploadreg3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
                     );
                     break;
             }
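+            // Partial-width tiles: copy the valid columns from the staging buffer back to C.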
+            if (use_result_buffer) {
+                for(int cy=0; cy<std::min(M-y, 4); cy++) {
+                    for(unsigned int cx=0; cx<width; cx++) {
+                        c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
+                    }
+                }
+            }
         }
     }
 }
 
 } // namespace arm_gemm
 
-#endif // __aarch64__
+#endif // __aarch64__
\ No newline at end of file
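The change above, mirrored in the generic-tuned kernel below, reworks the hybrid u8->u32 dot-product GEMM so ragged edges are handled in-kernel: K is split into 32-byte main-loop iterations plus 16-byte, 4-byte ('blocks') and 1-3-byte ('odds') tails, and output tiles narrower than 16 columns are staged through a local buffer. A minimal sketch of that staging pattern, reusing the names from the surrounding code (result_buffer, c_ptr_real, ldcb, beta0) and assuming the 4-row-by-16-column tile these kernels produce:

    uint32_t result_buffer[64];                   // 4 rows x 16 columns of staging space
    const bool use_result_buffer = (width < 16);  // only stage partial-width tiles
    const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(uint32_t);
    uint32_t *c_ptr_real = c_ptr0;
    if (use_result_buffer && !beta0) {
        // Preload live C values so the beta-scaling path reads valid data.
        for (int cy = 0; cy < std::min(M - y, 4); cy++) {
            for (unsigned int cx = 0; cx < width; cx++) {
                result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
            }
        }
    }
    if (use_result_buffer) {
        c_ptr0 = result_buffer;  // the kernel now stores full 16-wide rows harmlessly
    }
    // ... run the assembly micro-kernel ...
    if (use_result_buffer) {
        // Copy only the valid columns back out.
        for (int cy = 0; cy < std::min(M - y, 4); cy++) {
            for (unsigned int cx = 0; cx < width; cx++) {
                c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
            }
        }
    }

The assembly always writes four q-registers (64 bytes) per row, so without the buffer a tile with width < 16 would store past the end of each output row.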
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp
index dbef029..87f46bb 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp
@@ -37,1569 +37,1806 @@
     const long loops_count = ((K + 16) / 32) - 1;
     K -= loops_count * 32;
     const long regs_count = (K / 16) - 1;
+    K -= (regs_count + 1) * 16;
+    const long blocks_count = K / 4;
+    const long odds_count = K - (blocks_count * 4);
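+    // Original K per row = loops_count*32 + (regs_count + 1)*16 + blocks_count*4 + odds_count,
+    // so ragged K is now handled in-kernel: whole 4-byte groups via 'blocks', the last 1-3 bytes via 'odds'.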
 
     for (int y=0; y<M; y+=4) {
         const uint8_t * const a_ptr0_base = A + (y * lda);
         const unsigned long ldab = lda * sizeof(uint8_t);
 
         uint32_t *c_ptr0 = C + (y * ldc);
-        const unsigned long ldcb = ldc * sizeof(uint32_t);
 
         for (int x0=0; x0<N; x0+=16ul) {
             const long width = std::min((unsigned long)N-x0, 16ul);
             const uint32_t *betaptr = &beta;
             long loops = loops_count;
             long regs = regs_count;
+            long blocks = blocks_count;
+            long odds = odds_count;
             const uint8_t *a_ptr0 = a_ptr0_base;
             const uint8_t *b_ptr0 = B + (K_stride * x0);
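+            // Output tiles narrower than 16 columns are staged in result_buffer so the
+            // kernel can always store full 16-element rows; ldcb picks the matching row stride.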
+            const bool use_result_buffer = (width < 16);
+            uint32_t result_buffer[64];
+            const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(uint32_t);
+            uint32_t *c_ptr_real = c_ptr0;
+            if (use_result_buffer && !beta0) {
+                for(int cy=0; cy<std::min(M-y, 4); cy++) {
+                    for(unsigned int cx=0; cx<width; cx++) {
+                        result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
+                    }
+                }
+            }
+            if (use_result_buffer) {
+                c_ptr0 = result_buffer;
+            }
 
             switch(M-y) {
                 case 1:
                     __asm __volatile (
-                        "cbz %[beta0], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1r {v15.4s}, [%[betaptr]]\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "mul v16.4s, v16.4s, v15.4s\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mul v17.4s, v17.4s, v15.4s\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "mul v18.4s, v18.4s, v15.4s\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "mul v19.4s, v19.4s, v15.4s\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "cbz %[regs], 4f\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr q0, [%[a_ptr0], #0x10]\n"
-                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "5:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
+                    "cbz %[beta0], 1f\n"
+                    "movi v16.4s, #0\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "movi v17.4s, #0\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "movi v18.4s, #0\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "movi v19.4s, #0\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "b 3f\n"
+                    "1:\n"
+                    "ld1r {v15.4s}, [%[betaptr]]\n"
+                    "ldr q16, [%[c_ptr0]]\n"
+                    "ldr q17, [%[c_ptr0], #0x10]\n"
+                    "ldr q18, [%[c_ptr0], #0x20]\n"
+                    "ldr q19, [%[c_ptr0], #0x30]\n"
+                    "mul v16.4s, v16.4s, v15.4s\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "mul v17.4s, v17.4s, v15.4s\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "mul v18.4s, v18.4s, v15.4s\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "mul v19.4s, v19.4s, v15.4s\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "3:\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr q4, [%[a_ptr0]]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                    ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ldr q0, [%[a_ptr0], #-0x10]\n"
+                    ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    "b.ne 3b\n"
+                    "2:\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                    "cbz %[regs], 4f\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr q4, [%[a_ptr0]]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                    ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                    "b 5f\n"
+                    "4:\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    "5:\n"
+                    "cbz %[blocks], 6f\n"
+                    "7:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "subs %[blocks], %[blocks], #0x1\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr s0, [%[a_ptr0]]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "b.ne 7b\n"
+                    "6:\n"
+                    "cbz %[odds], 8f\n"
+                    "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+                    "9:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "8:\n"
+                    "str q16, [%[c_ptr0]]\n"
+                    "str q17, [%[c_ptr0], #0x10]\n"
+                    "str q18, [%[c_ptr0], #0x20]\n"
+                    "str q19, [%[c_ptr0], #0x30]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
                     );
                     break;
                 case 2:
                     __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "c_ptr1 .req X1\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "cbz %[beta0], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1r {v15.4s}, [%[betaptr]]\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "mul v16.4s, v16.4s, v15.4s\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "mul v17.4s, v17.4s, v15.4s\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "mul v18.4s, v18.4s, v15.4s\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "mul v19.4s, v19.4s, v15.4s\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "mul v20.4s, v20.4s, v15.4s\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mul v21.4s, v21.4s, v15.4s\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mul v22.4s, v22.4s, v15.4s\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "mul v23.4s, v23.4s, v15.4s\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q1, [a_ptr1, #-0x10]\n"
-                        ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "cbz %[regs], 4f\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q0, [%[a_ptr0], #0x10]\n"
-                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q1, [a_ptr1, #0x10]\n"
-                        ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
-                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
-                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
-                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
-                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "5:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq c_ptr1\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
+                    "a_ptr1 .req X0\n"
+                    "c_ptr1 .req X1\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "cbz %[beta0], 1f\n"
+                    "movi v16.4s, #0\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "movi v17.4s, #0\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "movi v18.4s, #0\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "movi v19.4s, #0\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "movi v20.4s, #0\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "movi v21.4s, #0\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "movi v22.4s, #0\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "movi v23.4s, #0\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "b 3f\n"
+                    "1:\n"
+                    "ld1r {v15.4s}, [%[betaptr]]\n"
+                    "ldr q16, [%[c_ptr0]]\n"
+                    "ldr q17, [%[c_ptr0], #0x10]\n"
+                    "ldr q18, [%[c_ptr0], #0x20]\n"
+                    "ldr q19, [%[c_ptr0], #0x30]\n"
+                    "mul v16.4s, v16.4s, v15.4s\n"
+                    "ldr q20, [c_ptr1]\n"
+                    "mul v17.4s, v17.4s, v15.4s\n"
+                    "ldr q21, [c_ptr1, #0x10]\n"
+                    "mul v18.4s, v18.4s, v15.4s\n"
+                    "ldr q22, [c_ptr1, #0x20]\n"
+                    "mul v19.4s, v19.4s, v15.4s\n"
+                    "ldr q23, [c_ptr1, #0x30]\n"
+                    "mul v20.4s, v20.4s, v15.4s\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "mul v21.4s, v21.4s, v15.4s\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "mul v22.4s, v22.4s, v15.4s\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "mul v23.4s, v23.4s, v15.4s\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "3:\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr q4, [%[a_ptr0]]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr q5, [a_ptr1]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                    ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                    ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    "add a_ptr1, a_ptr1, #0x20\n"
+                    ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                    ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr q0, [%[a_ptr0], #-0x10]\n"
+                    ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr q1, [a_ptr1, #-0x10]\n"
+                    ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                    ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                    ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                    ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                    ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                    ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                    ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+                    "b.ne 3b\n"
+                    "2:\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                    "prfm PSTL1KEEP, [c_ptr1]\n"
+                    "cbz %[regs], 4f\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr q4, [%[a_ptr0]]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr q5, [a_ptr1]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                    ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                    ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                    ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                    ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+                    ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                    ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+                    ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                    ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+                    ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                    ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+                    ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+                    ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+                    ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+                    ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+                    "b 5f\n"
+                    "4:\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                    "5:\n"
+                    "cbz %[blocks], 6f\n"
+                    "7:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "subs %[blocks], %[blocks], #0x1\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr s0, [%[a_ptr0]]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr s1, [a_ptr1]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x4\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    "b.ne 7b\n"
+                    "6:\n"
+                    "cbz %[odds], 8f\n"
+                    "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[0], [a_ptr1], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[1], [a_ptr1], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+                    "ld1 {v1.b}[2], [a_ptr1]\n"
+                    "9:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    "8:\n"
+                    "str q16, [%[c_ptr0]]\n"
+                    "str q17, [%[c_ptr0], #0x10]\n"
+                    "str q18, [%[c_ptr0], #0x20]\n"
+                    "str q19, [%[c_ptr0], #0x30]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                    "str q20, [c_ptr1]\n"
+                    "str q21, [c_ptr1, #0x10]\n"
+                    "str q22, [c_ptr1, #0x20]\n"
+                    "str q23, [c_ptr1, #0x30]\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq c_ptr1\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
                     );
                     break;
                 case 3:
                     __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "c_ptr1 .req X2\n"
-                        "c_ptr2 .req X3\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "cbz %[beta0], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v24.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "movi v25.4s, #0\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "movi v26.4s, #0\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "movi v27.4s, #0\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1r {v15.4s}, [%[betaptr]]\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "mul v16.4s, v16.4s, v15.4s\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "mul v17.4s, v17.4s, v15.4s\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "mul v18.4s, v18.4s, v15.4s\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "mul v19.4s, v19.4s, v15.4s\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "mul v20.4s, v20.4s, v15.4s\n"
-                        "ldr q24, [c_ptr2]\n"
-                        "mul v21.4s, v21.4s, v15.4s\n"
-                        "ldr q25, [c_ptr2, #0x10]\n"
-                        "mul v22.4s, v22.4s, v15.4s\n"
-                        "ldr q26, [c_ptr2, #0x20]\n"
-                        "mul v23.4s, v23.4s, v15.4s\n"
-                        "ldr q27, [c_ptr2, #0x30]\n"
-                        "mul v24.4s, v24.4s, v15.4s\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mul v25.4s, v25.4s, v15.4s\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mul v26.4s, v26.4s, v15.4s\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "mul v27.4s, v27.4s, v15.4s\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr q1, [a_ptr1, #-0x10]\n"
-                        ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q2, [a_ptr2, #-0x10]\n"
-                        ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
-                        ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
-                        ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
-                        ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
-                        ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
-                        ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
-                        ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
-                        ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
-                        ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
-                        ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
-                        ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
-                        ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
-                        ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "cbz %[regs], 4f\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q0, [%[a_ptr0], #0x10]\n"
-                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr q1, [a_ptr1, #0x10]\n"
-                        ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q2, [a_ptr2, #0x10]\n"
-                        ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
-                        ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
-                        ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
-                        ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
-                        ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
-                        ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
-                        ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
-                        ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
-                        ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
-                        ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
-                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
-                        ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
-                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
-                        ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
-                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
-                        ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
-                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
-                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
-                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
-                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
-                        "5:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        "str q24, [c_ptr2]\n"
-                        "str q25, [c_ptr2, #0x10]\n"
-                        "str q26, [c_ptr2, #0x20]\n"
-                        "str q27, [c_ptr2, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "c_ptr1 .req X2\n"
+                    "c_ptr2 .req X3\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "cbz %[beta0], 1f\n"
+                    "movi v16.4s, #0\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "movi v17.4s, #0\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "movi v18.4s, #0\n"
+                    "ldr q2, [a_ptr2]\n"
+                    "movi v19.4s, #0\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "movi v20.4s, #0\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "movi v21.4s, #0\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "movi v22.4s, #0\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "movi v23.4s, #0\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "movi v24.4s, #0\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "movi v25.4s, #0\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    "movi v26.4s, #0\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "movi v27.4s, #0\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "b 3f\n"
+                    "1:\n"
+                    "ld1r {v15.4s}, [%[betaptr]]\n"
+                    "ldr q16, [%[c_ptr0]]\n"
+                    "ldr q17, [%[c_ptr0], #0x10]\n"
+                    "ldr q18, [%[c_ptr0], #0x20]\n"
+                    "ldr q19, [%[c_ptr0], #0x30]\n"
+                    "mul v16.4s, v16.4s, v15.4s\n"
+                    "ldr q20, [c_ptr1]\n"
+                    "mul v17.4s, v17.4s, v15.4s\n"
+                    "ldr q21, [c_ptr1, #0x10]\n"
+                    "mul v18.4s, v18.4s, v15.4s\n"
+                    "ldr q22, [c_ptr1, #0x20]\n"
+                    "mul v19.4s, v19.4s, v15.4s\n"
+                    "ldr q23, [c_ptr1, #0x30]\n"
+                    "mul v20.4s, v20.4s, v15.4s\n"
+                    "ldr q24, [c_ptr2]\n"
+                    "mul v21.4s, v21.4s, v15.4s\n"
+                    "ldr q25, [c_ptr2, #0x10]\n"
+                    "mul v22.4s, v22.4s, v15.4s\n"
+                    "ldr q26, [c_ptr2, #0x20]\n"
+                    "mul v23.4s, v23.4s, v15.4s\n"
+                    "ldr q27, [c_ptr2, #0x30]\n"
+                    "mul v24.4s, v24.4s, v15.4s\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "mul v25.4s, v25.4s, v15.4s\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "mul v26.4s, v26.4s, v15.4s\n"
+                    "ldr q2, [a_ptr2]\n"
+                    "mul v27.4s, v27.4s, v15.4s\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "3:\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr q4, [%[a_ptr0]]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr q5, [a_ptr1]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr q6, [a_ptr2]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x20\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    "add a_ptr2, a_ptr2, #0x20\n"
+                    ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                    "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                    ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+                    ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                    ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                    ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                    ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                    ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr q0, [%[a_ptr0], #-0x10]\n"
+                    ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                    "ldr q1, [a_ptr1, #-0x10]\n"
+                    ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr q2, [a_ptr2, #-0x10]\n"
+                    ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+                    ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                    ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+                    ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                    ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+                    ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                    ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+                    ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+                    ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                    ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+                    ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                    ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+                    ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                    ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+                    ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+                    "b.ne 3b\n"
+                    "2:\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                    "prfm PSTL1KEEP, [c_ptr1]\n"
+                    "prfm PSTL1KEEP, [c_ptr2]\n"
+                    "cbz %[regs], 4f\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr q4, [%[a_ptr0]]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr q5, [a_ptr1]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr q6, [a_ptr2]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                    ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                    ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                    ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                    ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                    ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+                    ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                    ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+                    ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                    ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+                    ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                    ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+                    ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+                    ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+                    ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                    ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+                    ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+                    ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                    ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+                    ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+                    ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                    ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+                    ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+                    ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+                    ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+                    ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+                    ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+                    "b 5f\n"
+                    "4:\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                    ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                    ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                    ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                    ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+                    "5:\n"
+                    "cbz %[blocks], 6f\n"
+                    "7:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "subs %[blocks], %[blocks], #0x1\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr s0, [%[a_ptr0]]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr s1, [a_ptr1]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x4\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr s2, [a_ptr2]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "add a_ptr2, a_ptr2, #0x4\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    "b.ne 7b\n"
+                    "6:\n"
+                    "cbz %[odds], 8f\n"
+                    "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[0], [a_ptr1], #1\n"
+                    "ld1 {v2.b}[0], [a_ptr2], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[1], [a_ptr1], #1\n"
+                    "ld1 {v2.b}[1], [a_ptr2], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+                    "ld1 {v1.b}[2], [a_ptr1]\n"
+                    "ld1 {v2.b}[2], [a_ptr2]\n"
+                    "9:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    "8:\n"
+                    "str q16, [%[c_ptr0]]\n"
+                    "str q17, [%[c_ptr0], #0x10]\n"
+                    "str q18, [%[c_ptr0], #0x20]\n"
+                    "str q19, [%[c_ptr0], #0x30]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                    "str q20, [c_ptr1]\n"
+                    "str q21, [c_ptr1, #0x10]\n"
+                    "str q22, [c_ptr1, #0x20]\n"
+                    "str q23, [c_ptr1, #0x30]\n"
+                    "str q24, [c_ptr2]\n"
+                    "str q25, [c_ptr2, #0x10]\n"
+                    "str q26, [c_ptr2, #0x20]\n"
+                    "str q27, [c_ptr2, #0x30]\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
                     );
                     break;
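[Note on the hunks above and below: throughout these kernels the ARMv8.2-A dot-product instructions are emitted as raw ".word" encodings with the intended mnemonic in a trailing comment (e.g. 0x6f80e110 is "udot v16.4s, v8.16b, v0.4b[0]"), presumably so the files still assemble with toolchains that predate the dot-product extension. Semantically, each such udot adds, into every 32-bit lane of the accumulator, the dot product of four unsigned bytes of the first source with one broadcast 4-byte group of the second source selected by the lane index; in the 3-row case above, v16-v27 hold a 3x16 tile of u32 accumulators (four q-registers per output row), so each udot folds four consecutive u8 terms of the K dimension into every output element. The following is a minimal scalar sketch of that semantics, not part of the patch; the names VecU8x16, VecU32x4 and udot_lane_u8 are hypothetical.

    #include <array>
    #include <cstdint>

    using VecU8x16 = std::array<uint8_t, 16>;  // one 128-bit v-register of bytes
    using VecU32x4 = std::array<uint32_t, 4>;  // one 128-bit v-register of u32 lanes

    // Scalar model of "udot vd.4s, vn.16b, vm.4b[lane]":
    // for each 32-bit destination lane d, accumulate the dot product of
    // bytes 4d..4d+3 of vn with the broadcast group 4*lane..4*lane+3 of vm.
    void udot_lane_u8(VecU32x4 &acc, const VecU8x16 &vn, const VecU8x16 &vm, int lane)
    {
        for(int d = 0; d < 4; ++d)
        {
            uint32_t sum = 0;
            for(int k = 0; k < 4; ++k)
            {
                sum += static_cast<uint32_t>(vn[4 * d + k]) * static_cast<uint32_t>(vm[4 * lane + k]);
            }
            acc[d] += sum; // udot accumulates into the existing lane value
        }
    }

    int main()
    {
        VecU32x4 acc{}; // zeroed accumulators, as the beta0 path does with movi
        VecU8x16 vn;
        VecU8x16 vm;
        vn.fill(1);
        vm.fill(2);
        udot_lane_u8(acc, vn, vm, 0); // each lane gains 4 * (1 * 2) = 8
        return acc[0] == 8 ? 0 : 1;
    }

Under this reading, the main loop's pattern of four udot groups per a-register (v0.4b[0] through v0.4b[3]) walks 16 bytes of the K dimension per iteration, which matches the "#0x100" b-pointer advance (16 K-steps x 16 output columns x 1 byte).]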
                 default:
                 case 4:
                     __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "c_ptr1 .req X3\n"
-                        "c_ptr2 .req X4\n"
-                        "c_ptr3 .req X5\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "cbz %[beta0], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q3, [a_ptr3]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v24.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v25.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "movi v26.4s, #0\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "movi v27.4s, #0\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "movi v28.4s, #0\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "movi v29.4s, #0\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "movi v30.4s, #0\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "movi v31.4s, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1r {v15.4s}, [%[betaptr]]\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "mul v16.4s, v16.4s, v15.4s\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "mul v17.4s, v17.4s, v15.4s\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "mul v18.4s, v18.4s, v15.4s\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "mul v19.4s, v19.4s, v15.4s\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "mul v20.4s, v20.4s, v15.4s\n"
-                        "ldr q24, [c_ptr2]\n"
-                        "mul v21.4s, v21.4s, v15.4s\n"
-                        "ldr q25, [c_ptr2, #0x10]\n"
-                        "mul v22.4s, v22.4s, v15.4s\n"
-                        "ldr q26, [c_ptr2, #0x20]\n"
-                        "mul v23.4s, v23.4s, v15.4s\n"
-                        "ldr q27, [c_ptr2, #0x30]\n"
-                        "mul v24.4s, v24.4s, v15.4s\n"
-                        "ldr q28, [c_ptr3]\n"
-                        "mul v25.4s, v25.4s, v15.4s\n"
-                        "ldr q29, [c_ptr3, #0x10]\n"
-                        "mul v26.4s, v26.4s, v15.4s\n"
-                        "ldr q30, [c_ptr3, #0x20]\n"
-                        "mul v27.4s, v27.4s, v15.4s\n"
-                        "ldr q31, [c_ptr3, #0x30]\n"
-                        "mul v28.4s, v28.4s, v15.4s\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mul v29.4s, v29.4s, v15.4s\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mul v30.4s, v30.4s, v15.4s\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "mul v31.4s, v31.4s, v15.4s\n"
-                        "ldr q3, [a_ptr3]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q7, [a_ptr3]\n"
-                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
-                        ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
-                        ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
-                        ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
-                        ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
-                        ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
-                        ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
-                        ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
-                        ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
-                        ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
-                        ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
-                        ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
-                        ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr q1, [a_ptr1, #-0x10]\n"
-                        ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr q2, [a_ptr2, #-0x10]\n"
-                        ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q3, [a_ptr3, #-0x10]\n"
-                        ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
-                        ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
-                        ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
-                        ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
-                        ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
-                        ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
-                        ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
-                        ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
-                        ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
-                        ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
-                        ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
-                        ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
-                        ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
-                        ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
-                        ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
-                        ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
-                        ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
-                        ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
-                        ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
-                        ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
-                        ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
-                        ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
-                        ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
-                        ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
-                        ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
-                        ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
-                        ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
-                        ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
-                        ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "prfm PSTL1KEEP, [c_ptr3]\n"
-                        "cbz %[regs], 4f\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
-                        "ldr q7, [a_ptr3]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
-                        ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
-                        ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
-                        ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
-                        ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
-                        ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
-                        ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
-                        ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
-                        ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
-                        ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
-                        ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
-                        ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q0, [%[a_ptr0], #0x10]\n"
-                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr q1, [a_ptr1, #0x10]\n"
-                        ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr q2, [a_ptr2, #0x10]\n"
-                        ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q3, [a_ptr3, #0x10]\n"
-                        ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
-                        ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
-                        ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
-                        ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
-                        ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
-                        ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
-                        ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
-                        ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
-                        ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
-                        ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
-                        ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
-                        ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
-                        ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
-                        ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
-                        ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
-                        ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
-                        ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
-                        ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
-                        ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
-                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
-                        ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
-                        ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
-                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
-                        ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
-                        ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
-                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
-                        ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
-                        ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
-                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
-                        ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
-                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
-                        ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
-                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
-                        ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
-                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
-                        ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
-                        "ldr q7, [a_ptr3]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
-                        ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
-                        ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
-                        ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
-                        ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
-                        ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
-                        ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
-                        ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
-                        ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
-                        ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
-                        ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
-                        ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
-                        ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
-                        "5:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        "str q24, [c_ptr2]\n"
-                        "str q25, [c_ptr2, #0x10]\n"
-                        "str q26, [c_ptr2, #0x20]\n"
-                        "str q27, [c_ptr2, #0x30]\n"
-                        "str q28, [c_ptr3]\n"
-                        "str q29, [c_ptr3, #0x10]\n"
-                        "str q30, [c_ptr3, #0x20]\n"
-                        "str q31, [c_ptr3, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
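+                    // GEMM-style inner tile: rows 0-3 of A accumulate into v16-v19,
+                    // v20-v23, v24-v27 and v28-v31 (four 4-lane s32 vectors per row,
+                    // i.e. a 4x16 u8->u32 output tile). The udot instructions are
+                    // written as raw .word encodings with the mnemonic kept in the
+                    // trailing comment, presumably so the file still assembles on
+                    // toolchains without dot-product support.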
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
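+                    // Row pointers for rows 1-3 of A and C are now derived from
+                    // lda/ldc. Dispatch on beta: "cbz %[beta0], 1f" takes the path
+                    // that loads the existing C tile and multiplies it by beta; the
+                    // fall-through path instead starts the accumulators at zero while
+                    // preloading the first A and B vectors.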
+                    "cbz %[beta0], 1f\n"
+                    "movi v16.4s, #0\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "movi v17.4s, #0\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "movi v18.4s, #0\n"
+                    "ldr q2, [a_ptr2]\n"
+                    "movi v19.4s, #0\n"
+                    "ldr q3, [a_ptr3]\n"
+                    "movi v20.4s, #0\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "movi v21.4s, #0\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "movi v22.4s, #0\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "movi v23.4s, #0\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "movi v24.4s, #0\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "movi v25.4s, #0\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "movi v26.4s, #0\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    "movi v27.4s, #0\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "movi v28.4s, #0\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "movi v29.4s, #0\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    "movi v30.4s, #0\n"
+                    "add a_ptr3, a_ptr3, #0x10\n"
+                    "movi v31.4s, #0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "b 3f\n"
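+                    // 1: beta path - broadcast beta with ld1r, load the current 4x16
+                    // C tile, and pre-scale it with integer multiplies before any
+                    // accumulation happens.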
+                    "1:\n"
+                    "ld1r {v15.4s}, [%[betaptr]]\n"
+                    "ldr q16, [%[c_ptr0]]\n"
+                    "ldr q17, [%[c_ptr0], #0x10]\n"
+                    "ldr q18, [%[c_ptr0], #0x20]\n"
+                    "ldr q19, [%[c_ptr0], #0x30]\n"
+                    "mul v16.4s, v16.4s, v15.4s\n"
+                    "ldr q20, [c_ptr1]\n"
+                    "mul v17.4s, v17.4s, v15.4s\n"
+                    "ldr q21, [c_ptr1, #0x10]\n"
+                    "mul v18.4s, v18.4s, v15.4s\n"
+                    "ldr q22, [c_ptr1, #0x20]\n"
+                    "mul v19.4s, v19.4s, v15.4s\n"
+                    "ldr q23, [c_ptr1, #0x30]\n"
+                    "mul v20.4s, v20.4s, v15.4s\n"
+                    "ldr q24, [c_ptr2]\n"
+                    "mul v21.4s, v21.4s, v15.4s\n"
+                    "ldr q25, [c_ptr2, #0x10]\n"
+                    "mul v22.4s, v22.4s, v15.4s\n"
+                    "ldr q26, [c_ptr2, #0x20]\n"
+                    "mul v23.4s, v23.4s, v15.4s\n"
+                    "ldr q27, [c_ptr2, #0x30]\n"
+                    "mul v24.4s, v24.4s, v15.4s\n"
+                    "ldr q28, [c_ptr3]\n"
+                    "mul v25.4s, v25.4s, v15.4s\n"
+                    "ldr q29, [c_ptr3, #0x10]\n"
+                    "mul v26.4s, v26.4s, v15.4s\n"
+                    "ldr q30, [c_ptr3, #0x20]\n"
+                    "mul v27.4s, v27.4s, v15.4s\n"
+                    "ldr q31, [c_ptr3, #0x30]\n"
+                    "mul v28.4s, v28.4s, v15.4s\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "mul v29.4s, v29.4s, v15.4s\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "mul v30.4s, v30.4s, v15.4s\n"
+                    "ldr q2, [a_ptr2]\n"
+                    "mul v31.4s, v31.4s, v15.4s\n"
+                    "ldr q3, [a_ptr3]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add a_ptr3, a_ptr3, #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
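+                    // 3: main K loop, unrolled x2: each pass consumes two 16-byte A
+                    // vectors per row (0x20 bytes) and 0x200 bytes of B, issuing one
+                    // udot per accumulator per byte lane while the next B (and then A)
+                    // vectors are reloaded in flight.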
+                    "3:\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr q4, [%[a_ptr0]]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr q5, [a_ptr1]\n"
+                    ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+                    "ldr q6, [a_ptr2]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr q7, [a_ptr3]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x20\n"
+                    ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "add a_ptr2, a_ptr2, #0x20\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    "add a_ptr3, a_ptr3, #0x20\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                    ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+                    ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                    "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+                    ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+                    ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+                    ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+                    ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+                    ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                    ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+                    ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                    ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+                    ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                    ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+                    ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                    ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+                    ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+                    ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+                    ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+                    ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr q0, [%[a_ptr0], #-0x10]\n"
+                    ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                    "ldr q1, [a_ptr1, #-0x10]\n"
+                    ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+                    "ldr q2, [a_ptr2, #-0x10]\n"
+                    ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr q3, [a_ptr3, #-0x10]\n"
+                    ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+                    ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+                    ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                    ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+                    ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+                    ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                    ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+                    ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+                    ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                    ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+                    ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+                    ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+                    ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+                    ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+                    ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+                    ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+                    ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+                    ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                    ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+                    ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+                    ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                    ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+                    ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+                    ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                    ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+                    ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+                    ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+                    ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+                    ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+                    ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+                    ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
+                    "b.ne 3b\n"
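+                    // 2: loop done - prefetch the output tile for the stores that
+                    // follow, then branch on %[regs] to pick the remainder path.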
+                    "2:\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                    "prfm PSTL1KEEP, [c_ptr1]\n"
+                    "prfm PSTL1KEEP, [c_ptr2]\n"
+                    "prfm PSTL1KEEP, [c_ptr3]\n"
+                    "cbz %[regs], 4f\n"
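+                    // regs != 0: a trailing 32-byte K block remains - the already
+                    // preloaded q0-q3 plus a fresh q4-q7 per row - processed as in the
+                    // main loop but without further prefetching, then skip to 5:.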
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr q4, [%[a_ptr0]]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr q5, [a_ptr1]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr q6, [a_ptr2]\n"
+                    ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+                    "ldr q7, [a_ptr3]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    "add a_ptr3, a_ptr3, #0x10\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+                    ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+                    ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+                    ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+                    ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                    ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+                    ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                    ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+                    ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                    ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+                    ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                    ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+                    ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+                    ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+                    ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+                    ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+                    ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                    ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+                    ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+                    ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                    ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+                    ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+                    ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                    ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+                    ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+                    ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                    ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+                    ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+                    ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+                    ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+                    ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+                    ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+                    ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+                    ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+                    ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
+                    ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                    ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+                    ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+                    ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
+                    ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                    ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+                    ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+                    ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
+                    ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                    ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+                    ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+                    ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
+                    ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+                    ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
+                    ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+                    ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
+                    ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+                    ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
+                    ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+                    ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
+                    "b 5f\n"
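+                    // 4: short tail - only the preloaded q0-q3 remain, so finish the
+                    // accumulation without loading any further A data.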
+                    "4:\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+                    ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+                    ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+                    ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+                    ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                    ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+                    ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                    ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+                    ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                    ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+                    ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                    ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+                    ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+                    ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+                    ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+                    ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+                    ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+                    "5:\n"
+                    "cbz %[blocks], 6f\n"
+                    "7:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "subs %[blocks], %[blocks], #0x1\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr s0, [%[a_ptr0]]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr s1, [a_ptr1]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x4\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr s2, [a_ptr2]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "add a_ptr2, a_ptr2, #0x4\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr s3, [a_ptr3]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    "add a_ptr3, a_ptr3, #0x4\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+                    ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+                    "b.ne 7b\n"
+                    "6:\n"
+                    "cbz %[odds], 8f\n"
+                    "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[0], [a_ptr1], #1\n"
+                    "ld1 {v2.b}[0], [a_ptr2], #1\n"
+                    "ld1 {v3.b}[0], [a_ptr3], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[1], [a_ptr1], #1\n"
+                    "ld1 {v2.b}[1], [a_ptr2], #1\n"
+                    "ld1 {v3.b}[1], [a_ptr3], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+                    "ld1 {v1.b}[2], [a_ptr1]\n"
+                    "ld1 {v2.b}[2], [a_ptr2]\n"
+                    "ld1 {v3.b}[2], [a_ptr3]\n"
+                    "9:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+                    "8:\n"
+                    "str q16, [%[c_ptr0]]\n"
+                    "str q17, [%[c_ptr0], #0x10]\n"
+                    "str q18, [%[c_ptr0], #0x20]\n"
+                    "str q19, [%[c_ptr0], #0x30]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                    "str q20, [c_ptr1]\n"
+                    "str q21, [c_ptr1, #0x10]\n"
+                    "str q22, [c_ptr1, #0x20]\n"
+                    "str q23, [c_ptr1, #0x30]\n"
+                    "str q24, [c_ptr2]\n"
+                    "str q25, [c_ptr2, #0x10]\n"
+                    "str q26, [c_ptr2, #0x20]\n"
+                    "str q27, [c_ptr2, #0x30]\n"
+                    "str q28, [c_ptr3]\n"
+                    "str q29, [c_ptr3, #0x10]\n"
+                    "str q30, [c_ptr3, #0x20]\n"
+                    "str q31, [c_ptr3, #0x30]\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
                     );
                     break;
             }
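+            // If this tile was too narrow to write directly, the kernel
+            // accumulated into result_buffer; copy the valid rows and columns
+            // back out to C.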
+            if (use_result_buffer) {
+                for(int cy=0; cy<std::min(M-y, 4); cy++) {
+                    for(unsigned int cx=0; cx<width; cx++) {
+                        c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
+                    }
+                }
+            }
         }
     }
 }
 
 } // namespace arm_gemm
 
-#endif // __aarch64__
+#endif // __aarch64__
\ No newline at end of file
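
A note on the kernels above: the ".word 0x..." lines emit the AArch64 dot-product
instructions as raw encodings, with the intended mnemonic in the trailing comment,
so the file assembles even when the assembler does not accept the "udot" mnemonic.
As a minimal scalar sketch of the semantics (illustrative only, not library code),
each "udot vd.4s, vn.16b, vm.4b[idx]" accumulates four independent 4-way byte dot
products:

    #include <cstdint>

    // Scalar model of "udot vd.4s, vn.16b, vm.4b[idx]": every 32-bit lane of vd
    // gains the dot product of its own four bytes of vn with the four bytes of
    // vm selected by idx.
    static void udot_by_element(uint32_t vd[4], const uint8_t vn[16],
                                const uint8_t vm[16], int idx)
    {
        for (int lane = 0; lane < 4; lane++)
        {
            uint32_t acc = 0;
            for (int b = 0; b < 4; b++)
            {
                acc += uint32_t(vn[4 * lane + b]) * uint32_t(vm[4 * idx + b]);
            }
            vd[lane] += acc;
        }
    }

This is why each accumulator register holds four output columns of one row, and
why the A-side data is consumed four bytes at a time through the 4b[0..3] lane
selects.
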
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp
new file mode 100644
index 0000000..c6895a6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+#include "../std_transforms_sve.hpp"
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void sve_hybrid_fp16_mla_4VLx4(const __fp16 *, int, const __fp16 *, __fp16 *, int, __fp16, int, int, int);
+
+class hybrid_fp16_mla_4VLx4
+{
+public:
+    typedef __fp16 operand_type;
+    typedef __fp16 result_type;
+
+    typedef void (*kern_type)(const __fp16 *, int, const __fp16 *, __fp16 *, int, __fp16, int, int, int);
+
+    /* Kernel blocking parameters */
+    static unsigned int out_height()
+    {
+        return 4;
+    }
+
+    static unsigned int out_width()
+    {
+        return get_vector_length<__fp16>() * 4;
+    }
+
+    static unsigned int k_unroll()
+    {
+        return 1;
+    }
+
+    StdTransformsSVE<operand_type, result_type, 4, 4, 1> transforms = {};
+
+    // Default to the generic kernel
+    kern_type kernel=sve_hybrid_fp16_mla_4VLx4;
+
+    hybrid_fp16_mla_4VLx4(const CPUInfo *ci)
+    {
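+        // 'ci' is unused here; the parameter matches the constructor shape
+        // shared by the other arm_gemm kernel classes.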
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
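
The "4VLx4" blocking in this header is vector-length agnostic: out_width() is four
SVE vectors of halfwords, so the same kernel covers 32 fp16 columns per row at a
128-bit vector length and 64 at 256 bits. A rough sketch of that arithmetic using
the ACLE intrinsic (assuming an SVE-enabled toolchain; get_vector_length<__fp16>()
plays the equivalent role inside the library):

    #include <arm_sve.h>

    // svcnth() returns the number of 16-bit elements in one SVE vector, so a
    // 4VLx4 tile spans 4 * svcnth() columns by 4 rows.
    static unsigned int fp16_tile_columns()
    {
        return 4u * static_cast<unsigned int>(svcnth()); // e.g. 64 at a 256-bit VL
    }
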
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp
new file mode 100644
index 0000000..ab41fb3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp
@@ -0,0 +1,3681 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_hybrid_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, __fp16 *C, int ldc, __fp16 beta, int M, int N, int K) {
+    const long beta0 = (beta == 0.0f);
+    const int K_stride = K;
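+    // Decompose K for the assembly below: each main-loop iteration ("loops")
+    // consumes 16 fp16 elements (two 128-bit ld1rqh blocks), "regs" flags one
+    // further 8-element block, and the final 0-7 "leftovers" are loaded under
+    // predicate p6 and counted down through "blocks". The counts are biased so
+    // the epilogue always has at least eight elements to process.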
+    const long loops_count = ((K + 8) / 16) - 1;
+    K -= loops_count * 16;
+    const long regs_count = (K / 8) - 1;
+    K -= (regs_count + 1) * 8;
+    const long leftovers = K;
+
+    for (int y=0; y<M; y+=4) {
+        const __fp16 * const a_ptr0_base = A + (y * lda);
+        const unsigned long ldab = lda * sizeof(__fp16);
+
+        __fp16 *c_ptr0 = C + (y * ldc);
+        const unsigned long ldcb = ldc * sizeof(__fp16);
+
+        for (int x0=0; x0<N; x0+=(4 * get_vector_length<__fp16>())) {
+            const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<__fp16>()));
+            const __fp16 *betaptr = &beta;
+            long loops = loops_count;
+            long regs = regs_count;
+            long temp = 0;
+            long blocks = leftovers;
+            const __fp16 *a_ptr0 = a_ptr0_base;
+            const __fp16 *b_ptr0 = B + (K_stride * x0);
+
+            switch(M-y) {
+                case 1:
+                    __asm __volatile (
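+                        // p0-p3: column predicates for the four vectors of the
+                        // tile (whilelt against 'width'); p6 masks the K
+                        // 'leftovers'; p7 is all-true for full-width loads.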
+                        "whilelt p6.h, %[temp], %[leftovers]\n"
+                        "whilelt p0.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "ptrue p7.h\n"
+                        "whilelt p1.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "whilelt p2.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "whilelt p3.h, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.h, #0\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.h, #0\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "mov z18.h, #0\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z19.h, #0\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1rh z15.h, p7/z, [%[betaptr]]\n"
+                        "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
+                        "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "fmul z16.h, p7/m, z16.h, z15.h\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+                        "fmul z17.h, p7/m, z17.h, z15.h\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmul z18.h, p7/m, z18.h, z15.h\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmul z19.h, p7/m, z19.h, z15.h\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[7]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z17.h, z13.h, z4.h[7]\n"
+                        "fmla z18.h, z14.h, z4.h[7]\n"
+                        "fmla z19.h, z15.h, z4.h[7]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[7]\n"
+                        "fmla z17.h, z13.h, z4.h[7]\n"
+                        "fmla z18.h, z14.h, z4.h[7]\n"
+                        "fmla z19.h, z15.h, z4.h[7]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "b.eq 5f\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "b.eq 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "b.eq 5f\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "b.eq 5f\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "b.eq 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "b.eq 5f\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "5:\n"
+                        "st1h z16.h, p0, [%[c_ptr0]]\n"
+                        "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                    );
+                    break;
+                case 2:
+                    __asm __volatile (
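+                        // Rows beyond the first get .req aliases (a_ptr1/c_ptr1
+                        // on X0/X1) for their A and C pointers; matching .unreq
+                        // directives release the names at the end of the block.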
+                        "a_ptr1 .req X0\n"
+                        "c_ptr1 .req X1\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "whilelt p6.h, %[temp], %[leftovers]\n"
+                        "whilelt p0.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "ptrue p7.h\n"
+                        "whilelt p1.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "whilelt p2.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "whilelt p3.h, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.h, #0\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.h, #0\n"
+                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+                        "mov z18.h, #0\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "mov z19.h, #0\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z20.h, #0\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "mov z21.h, #0\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z22.h, #0\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "mov z23.h, #0\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1rh z15.h, p7/z, [%[betaptr]]\n"
+                        "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
+                        "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "fmul z16.h, p7/m, z16.h, z15.h\n"
+                        "ld1h z20.h, p0/z, [c_ptr1]\n"
+                        "fmul z17.h, p7/m, z17.h, z15.h\n"
+                        "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "fmul z18.h, p7/m, z18.h, z15.h\n"
+                        "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "fmul z19.h, p7/m, z19.h, z15.h\n"
+                        "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "fmul z20.h, p7/m, z20.h, z15.h\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+                        "fmul z21.h, p7/m, z21.h, z15.h\n"
+                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+                        "fmul z22.h, p7/m, z22.h, z15.h\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmul z23.h, p7/m, z23.h, z15.h\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "fmla z20.h, z12.h, z1.h[7]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z21.h, z13.h, z1.h[7]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z22.h, z14.h, z1.h[7]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "fmla z23.h, z15.h, z1.h[7]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
+                        "fmla z20.h, z8.h, z5.h[0]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "fmla z21.h, z9.h, z5.h[0]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z22.h, z10.h, z5.h[0]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "fmla z23.h, z11.h, z5.h[0]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "fmla z20.h, z12.h, z5.h[1]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z21.h, z13.h, z5.h[1]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z22.h, z14.h, z5.h[1]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "fmla z23.h, z15.h, z5.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z5.h[2]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "fmla z21.h, z9.h, z5.h[2]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z22.h, z10.h, z5.h[2]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "fmla z23.h, z11.h, z5.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "fmla z20.h, z12.h, z5.h[3]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z21.h, z13.h, z5.h[3]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z22.h, z14.h, z5.h[3]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "fmla z23.h, z15.h, z5.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "fmla z20.h, z8.h, z5.h[4]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z21.h, z9.h, z5.h[4]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z22.h, z10.h, z5.h[4]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "fmla z23.h, z11.h, z5.h[4]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "fmla z20.h, z12.h, z5.h[5]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z21.h, z13.h, z5.h[5]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z22.h, z14.h, z5.h[5]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "fmla z23.h, z15.h, z5.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z5.h[6]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z21.h, z9.h, z5.h[6]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z22.h, z10.h, z5.h[6]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "fmla z23.h, z11.h, z5.h[6]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[7]\n"
+                        "fmla z20.h, z12.h, z5.h[7]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[7]\n"
+                        "fmla z21.h, z13.h, z5.h[7]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[7]\n"
+                        "fmla z22.h, z14.h, z5.h[7]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[7]\n"
+                        "fmla z23.h, z15.h, z5.h[7]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "fmla z20.h, z12.h, z1.h[7]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z21.h, z13.h, z1.h[7]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z22.h, z14.h, z1.h[7]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
+                        "fmla z23.h, z15.h, z1.h[7]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
+                        "fmla z20.h, z8.h, z5.h[0]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "fmla z21.h, z9.h, z5.h[0]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z22.h, z10.h, z5.h[0]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "fmla z23.h, z11.h, z5.h[0]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "fmla z20.h, z12.h, z5.h[1]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z21.h, z13.h, z5.h[1]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z22.h, z14.h, z5.h[1]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "fmla z23.h, z15.h, z5.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z5.h[2]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "fmla z21.h, z9.h, z5.h[2]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z22.h, z10.h, z5.h[2]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "fmla z23.h, z11.h, z5.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "fmla z20.h, z12.h, z5.h[3]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z21.h, z13.h, z5.h[3]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z22.h, z14.h, z5.h[3]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "fmla z23.h, z15.h, z5.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "fmla z20.h, z8.h, z5.h[4]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z21.h, z9.h, z5.h[4]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z22.h, z10.h, z5.h[4]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "fmla z23.h, z11.h, z5.h[4]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "fmla z20.h, z12.h, z5.h[5]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z21.h, z13.h, z5.h[5]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z22.h, z14.h, z5.h[5]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "fmla z23.h, z15.h, z5.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "fmla z20.h, z8.h, z5.h[6]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z21.h, z9.h, z5.h[6]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z22.h, z10.h, z5.h[6]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "fmla z23.h, z11.h, z5.h[6]\n"
+                        "fmla z16.h, z12.h, z4.h[7]\n"
+                        "fmla z20.h, z12.h, z5.h[7]\n"
+                        "fmla z17.h, z13.h, z4.h[7]\n"
+                        "fmla z21.h, z13.h, z5.h[7]\n"
+                        "fmla z18.h, z14.h, z4.h[7]\n"
+                        "fmla z22.h, z14.h, z5.h[7]\n"
+                        "fmla z19.h, z15.h, z4.h[7]\n"
+                        "fmla z23.h, z15.h, z5.h[7]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "b.eq 5f\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "b.eq 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "b.eq 5f\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z5.h, p6/z, [a_ptr1]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "fmla z20.h, z12.h, z1.h[7]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z21.h, z13.h, z1.h[7]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z22.h, z14.h, z1.h[7]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "fmla z23.h, z15.h, z1.h[7]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "fmla z20.h, z8.h, z5.h[0]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "fmla z21.h, z9.h, z5.h[0]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z22.h, z10.h, z5.h[0]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "fmla z23.h, z11.h, z5.h[0]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z12.h, z5.h[1]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z21.h, z13.h, z5.h[1]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z22.h, z14.h, z5.h[1]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "fmla z23.h, z15.h, z5.h[1]\n"
+                        "b.eq 5f\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.h, z8.h, z5.h[2]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "fmla z21.h, z9.h, z5.h[2]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "fmla z22.h, z10.h, z5.h[2]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "fmla z23.h, z11.h, z5.h[2]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z20.h, z12.h, z5.h[3]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z21.h, z13.h, z5.h[3]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z22.h, z14.h, z5.h[3]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "fmla z23.h, z15.h, z5.h[3]\n"
+                        "b.eq 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "fmla z20.h, z8.h, z5.h[4]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z21.h, z9.h, z5.h[4]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z22.h, z10.h, z5.h[4]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "fmla z23.h, z11.h, z5.h[4]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z12.h, z5.h[5]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z21.h, z13.h, z5.h[5]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z22.h, z14.h, z5.h[5]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "fmla z23.h, z15.h, z5.h[5]\n"
+                        "b.eq 5f\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "fmla z20.h, z8.h, z5.h[6]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z21.h, z9.h, z5.h[6]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z22.h, z10.h, z5.h[6]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "fmla z23.h, z11.h, z5.h[6]\n"
+                        "5:\n"
+                        "st1h z16.h, p0, [%[c_ptr0]]\n"
+                        "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        "st1h z20.h, p0, [c_ptr1]\n"
+                        "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n"
+                        "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n"
+                        "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq c_ptr1\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
+                    );
+                    break;
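+                // Three-row variant: adds a_ptr2/c_ptr2 and a third accumulator row (z24-z27),
+                // otherwise mirroring the two-row kernel above.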
+                case 3:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "c_ptr1 .req X2\n"
+                        "c_ptr2 .req X3\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "whilelt p6.h, %[temp], %[leftovers]\n"
+                        "whilelt p0.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "ptrue p7.h\n"
+                        "whilelt p1.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "whilelt p2.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "whilelt p3.h, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.h, #0\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.h, #0\n"
+                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+                        "mov z18.h, #0\n"
+                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
+                        "mov z19.h, #0\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "mov z20.h, #0\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z21.h, #0\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "mov z22.h, #0\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z23.h, #0\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "mov z24.h, #0\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "mov z25.h, #0\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "mov z26.h, #0\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "mov z27.h, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1rh z15.h, p7/z, [%[betaptr]]\n"
+                        "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
+                        "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "fmul z16.h, p7/m, z16.h, z15.h\n"
+                        "ld1h z20.h, p0/z, [c_ptr1]\n"
+                        "fmul z17.h, p7/m, z17.h, z15.h\n"
+                        "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "fmul z18.h, p7/m, z18.h, z15.h\n"
+                        "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "fmul z19.h, p7/m, z19.h, z15.h\n"
+                        "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "fmul z20.h, p7/m, z20.h, z15.h\n"
+                        "ld1h z24.h, p0/z, [c_ptr2]\n"
+                        "fmul z21.h, p7/m, z21.h, z15.h\n"
+                        "ld1h z25.h, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "fmul z22.h, p7/m, z22.h, z15.h\n"
+                        "ld1h z26.h, p2/z, [c_ptr2, #2, MUL VL]\n"
+                        "fmul z23.h, p7/m, z23.h, z15.h\n"
+                        "ld1h z27.h, p3/z, [c_ptr2, #3, MUL VL]\n"
+                        "fmul z24.h, p7/m, z24.h, z15.h\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+                        "fmul z25.h, p7/m, z25.h, z15.h\n"
+                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+                        "fmul z26.h, p7/m, z26.h, z15.h\n"
+                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
+                        "fmul z27.h, p7/m, z27.h, z15.h\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+                        "fmla z24.h, z8.h, z2.h[0]\n"
+                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z25.h, z9.h, z2.h[0]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla z26.h, z10.h, z2.h[0]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "fmla z27.h, z11.h, z2.h[0]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "fmla z24.h, z12.h, z2.h[1]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "fmla z25.h, z13.h, z2.h[1]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "fmla z26.h, z14.h, z2.h[1]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "fmla z27.h, z15.h, z2.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z24.h, z8.h, z2.h[2]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "fmla z25.h, z9.h, z2.h[2]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "fmla z26.h, z10.h, z2.h[2]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "fmla z27.h, z11.h, z2.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "fmla z24.h, z12.h, z2.h[3]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "fmla z25.h, z13.h, z2.h[3]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "fmla z26.h, z14.h, z2.h[3]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "fmla z27.h, z15.h, z2.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "fmla z24.h, z8.h, z2.h[4]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "fmla z25.h, z9.h, z2.h[4]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "fmla z26.h, z10.h, z2.h[4]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "fmla z27.h, z11.h, z2.h[4]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "fmla z24.h, z12.h, z2.h[5]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "fmla z25.h, z13.h, z2.h[5]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "fmla z26.h, z14.h, z2.h[5]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "fmla z27.h, z15.h, z2.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z24.h, z8.h, z2.h[6]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z25.h, z9.h, z2.h[6]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z26.h, z10.h, z2.h[6]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "fmla z27.h, z11.h, z2.h[6]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "fmla z20.h, z12.h, z1.h[7]\n"
+                        "fmla z24.h, z12.h, z2.h[7]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z21.h, z13.h, z1.h[7]\n"
+                        "fmla z25.h, z13.h, z2.h[7]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z22.h, z14.h, z1.h[7]\n"
+                        "fmla z26.h, z14.h, z2.h[7]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "fmla z23.h, z15.h, z1.h[7]\n"
+                        "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
+                        "fmla z27.h, z15.h, z2.h[7]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
+                        "fmla z20.h, z8.h, z5.h[0]\n"
+                        "fmla z24.h, z8.h, z6.h[0]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "fmla z21.h, z9.h, z5.h[0]\n"
+                        "fmla z25.h, z9.h, z6.h[0]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z22.h, z10.h, z5.h[0]\n"
+                        "fmla z26.h, z10.h, z6.h[0]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "fmla z23.h, z11.h, z5.h[0]\n"
+                        "fmla z27.h, z11.h, z6.h[0]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "fmla z20.h, z12.h, z5.h[1]\n"
+                        "fmla z24.h, z12.h, z6.h[1]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z21.h, z13.h, z5.h[1]\n"
+                        "fmla z25.h, z13.h, z6.h[1]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z22.h, z14.h, z5.h[1]\n"
+                        "fmla z26.h, z14.h, z6.h[1]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "fmla z23.h, z15.h, z5.h[1]\n"
+                        "fmla z27.h, z15.h, z6.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z5.h[2]\n"
+                        "fmla z24.h, z8.h, z6.h[2]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z21.h, z9.h, z5.h[2]\n"
+                        "fmla z25.h, z9.h, z6.h[2]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "fmla z22.h, z10.h, z5.h[2]\n"
+                        "fmla z26.h, z10.h, z6.h[2]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "fmla z23.h, z11.h, z5.h[2]\n"
+                        "fmla z27.h, z11.h, z6.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "fmla z20.h, z12.h, z5.h[3]\n"
+                        "fmla z24.h, z12.h, z6.h[3]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z21.h, z13.h, z5.h[3]\n"
+                        "fmla z25.h, z13.h, z6.h[3]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z22.h, z14.h, z5.h[3]\n"
+                        "fmla z26.h, z14.h, z6.h[3]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "fmla z23.h, z15.h, z5.h[3]\n"
+                        "fmla z27.h, z15.h, z6.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "fmla z20.h, z8.h, z5.h[4]\n"
+                        "fmla z24.h, z8.h, z6.h[4]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z21.h, z9.h, z5.h[4]\n"
+                        "fmla z25.h, z9.h, z6.h[4]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z22.h, z10.h, z5.h[4]\n"
+                        "fmla z26.h, z10.h, z6.h[4]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "fmla z23.h, z11.h, z5.h[4]\n"
+                        "fmla z27.h, z11.h, z6.h[4]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "fmla z20.h, z12.h, z5.h[5]\n"
+                        "fmla z24.h, z12.h, z6.h[5]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z21.h, z13.h, z5.h[5]\n"
+                        "fmla z25.h, z13.h, z6.h[5]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z22.h, z14.h, z5.h[5]\n"
+                        "fmla z26.h, z14.h, z6.h[5]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "fmla z23.h, z15.h, z5.h[5]\n"
+                        "fmla z27.h, z15.h, z6.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z5.h[6]\n"
+                        "fmla z24.h, z8.h, z6.h[6]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z21.h, z9.h, z5.h[6]\n"
+                        "fmla z25.h, z9.h, z6.h[6]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z22.h, z10.h, z5.h[6]\n"
+                        "fmla z26.h, z10.h, z6.h[6]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "fmla z23.h, z11.h, z5.h[6]\n"
+                        "fmla z27.h, z11.h, z6.h[6]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[7]\n"
+                        "fmla z20.h, z12.h, z5.h[7]\n"
+                        "fmla z24.h, z12.h, z6.h[7]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[7]\n"
+                        "fmla z21.h, z13.h, z5.h[7]\n"
+                        "fmla z25.h, z13.h, z6.h[7]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[7]\n"
+                        "fmla z22.h, z14.h, z5.h[7]\n"
+                        "fmla z26.h, z14.h, z6.h[7]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[7]\n"
+                        "fmla z23.h, z15.h, z5.h[7]\n"
+                        "fmla z27.h, z15.h, z6.h[7]\n"
+                        "b.ne 3b\n"
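+                        // 2: K tail. If %[regs] is non-zero, two full 8-value blocks remain
+                        // (z0-z2, then z4-z6 loaded under p7), with the final partial block
+                        // preloaded into z0-z2 under p6; if %[regs] is zero, branch to 4:.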
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+                        "fmla z24.h, z8.h, z2.h[0]\n"
+                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z25.h, z9.h, z2.h[0]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "fmla z26.h, z10.h, z2.h[0]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "fmla z27.h, z11.h, z2.h[0]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "fmla z24.h, z12.h, z2.h[1]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "fmla z25.h, z13.h, z2.h[1]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "fmla z26.h, z14.h, z2.h[1]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "fmla z27.h, z15.h, z2.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z24.h, z8.h, z2.h[2]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "fmla z25.h, z9.h, z2.h[2]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "fmla z26.h, z10.h, z2.h[2]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "fmla z27.h, z11.h, z2.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "fmla z24.h, z12.h, z2.h[3]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "fmla z25.h, z13.h, z2.h[3]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "fmla z26.h, z14.h, z2.h[3]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "fmla z27.h, z15.h, z2.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "fmla z24.h, z8.h, z2.h[4]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "fmla z25.h, z9.h, z2.h[4]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "fmla z26.h, z10.h, z2.h[4]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "fmla z27.h, z11.h, z2.h[4]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "fmla z24.h, z12.h, z2.h[5]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "fmla z25.h, z13.h, z2.h[5]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "fmla z26.h, z14.h, z2.h[5]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "fmla z27.h, z15.h, z2.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z24.h, z8.h, z2.h[6]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z25.h, z9.h, z2.h[6]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z26.h, z10.h, z2.h[6]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "fmla z27.h, z11.h, z2.h[6]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "fmla z20.h, z12.h, z1.h[7]\n"
+                        "fmla z24.h, z12.h, z2.h[7]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z21.h, z13.h, z1.h[7]\n"
+                        "fmla z25.h, z13.h, z2.h[7]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z22.h, z14.h, z1.h[7]\n"
+                        "fmla z26.h, z14.h, z2.h[7]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
+                        "fmla z23.h, z15.h, z1.h[7]\n"
+                        "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
+                        "fmla z27.h, z15.h, z2.h[7]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
+                        "fmla z20.h, z8.h, z5.h[0]\n"
+                        "fmla z24.h, z8.h, z6.h[0]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "fmla z21.h, z9.h, z5.h[0]\n"
+                        "fmla z25.h, z9.h, z6.h[0]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z22.h, z10.h, z5.h[0]\n"
+                        "fmla z26.h, z10.h, z6.h[0]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "fmla z23.h, z11.h, z5.h[0]\n"
+                        "fmla z27.h, z11.h, z6.h[0]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "fmla z20.h, z12.h, z5.h[1]\n"
+                        "fmla z24.h, z12.h, z6.h[1]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z21.h, z13.h, z5.h[1]\n"
+                        "fmla z25.h, z13.h, z6.h[1]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z22.h, z14.h, z5.h[1]\n"
+                        "fmla z26.h, z14.h, z6.h[1]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "fmla z23.h, z15.h, z5.h[1]\n"
+                        "fmla z27.h, z15.h, z6.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z5.h[2]\n"
+                        "fmla z24.h, z8.h, z6.h[2]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z21.h, z9.h, z5.h[2]\n"
+                        "fmla z25.h, z9.h, z6.h[2]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "fmla z22.h, z10.h, z5.h[2]\n"
+                        "fmla z26.h, z10.h, z6.h[2]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "fmla z23.h, z11.h, z5.h[2]\n"
+                        "fmla z27.h, z11.h, z6.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "fmla z20.h, z12.h, z5.h[3]\n"
+                        "fmla z24.h, z12.h, z6.h[3]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z21.h, z13.h, z5.h[3]\n"
+                        "fmla z25.h, z13.h, z6.h[3]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z22.h, z14.h, z5.h[3]\n"
+                        "fmla z26.h, z14.h, z6.h[3]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "fmla z23.h, z15.h, z5.h[3]\n"
+                        "fmla z27.h, z15.h, z6.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "fmla z20.h, z8.h, z5.h[4]\n"
+                        "fmla z24.h, z8.h, z6.h[4]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z21.h, z9.h, z5.h[4]\n"
+                        "fmla z25.h, z9.h, z6.h[4]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z22.h, z10.h, z5.h[4]\n"
+                        "fmla z26.h, z10.h, z6.h[4]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "fmla z23.h, z11.h, z5.h[4]\n"
+                        "fmla z27.h, z11.h, z6.h[4]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "fmla z20.h, z12.h, z5.h[5]\n"
+                        "fmla z24.h, z12.h, z6.h[5]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z21.h, z13.h, z5.h[5]\n"
+                        "fmla z25.h, z13.h, z6.h[5]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z22.h, z14.h, z5.h[5]\n"
+                        "fmla z26.h, z14.h, z6.h[5]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "fmla z23.h, z15.h, z5.h[5]\n"
+                        "fmla z27.h, z15.h, z6.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "fmla z20.h, z8.h, z5.h[6]\n"
+                        "fmla z24.h, z8.h, z6.h[6]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z21.h, z9.h, z5.h[6]\n"
+                        "fmla z25.h, z9.h, z6.h[6]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z22.h, z10.h, z5.h[6]\n"
+                        "fmla z26.h, z10.h, z6.h[6]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "fmla z23.h, z11.h, z5.h[6]\n"
+                        "fmla z27.h, z11.h, z6.h[6]\n"
+                        "fmla z16.h, z12.h, z4.h[7]\n"
+                        "fmla z20.h, z12.h, z5.h[7]\n"
+                        "fmla z24.h, z12.h, z6.h[7]\n"
+                        "fmla z17.h, z13.h, z4.h[7]\n"
+                        "fmla z21.h, z13.h, z5.h[7]\n"
+                        "fmla z25.h, z13.h, z6.h[7]\n"
+                        "fmla z18.h, z14.h, z4.h[7]\n"
+                        "fmla z22.h, z14.h, z5.h[7]\n"
+                        "fmla z26.h, z14.h, z6.h[7]\n"
+                        "fmla z19.h, z15.h, z4.h[7]\n"
+                        "fmla z23.h, z15.h, z5.h[7]\n"
+                        "fmla z27.h, z15.h, z6.h[7]\n"
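+                        // Drain the leftover K values: each step loads four more B vectors,
+                        // multiplies them by one lane of z0-z2 and decrements %[blocks].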
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "fmla z24.h, z8.h, z2.h[0]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "fmla z25.h, z9.h, z2.h[0]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "fmla z26.h, z10.h, z2.h[0]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "fmla z27.h, z11.h, z2.h[0]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "fmla z24.h, z12.h, z2.h[1]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "fmla z25.h, z13.h, z2.h[1]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "fmla z26.h, z14.h, z2.h[1]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "fmla z27.h, z15.h, z2.h[1]\n"
+                        "b.eq 5f\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z24.h, z8.h, z2.h[2]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "fmla z25.h, z9.h, z2.h[2]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "fmla z26.h, z10.h, z2.h[2]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "fmla z27.h, z11.h, z2.h[2]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "fmla z24.h, z12.h, z2.h[3]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "fmla z25.h, z13.h, z2.h[3]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "fmla z26.h, z14.h, z2.h[3]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "fmla z27.h, z15.h, z2.h[3]\n"
+                        "b.eq 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "fmla z24.h, z8.h, z2.h[4]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "fmla z25.h, z9.h, z2.h[4]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "fmla z26.h, z10.h, z2.h[4]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "fmla z27.h, z11.h, z2.h[4]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "fmla z24.h, z12.h, z2.h[5]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "fmla z25.h, z13.h, z2.h[5]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "fmla z26.h, z14.h, z2.h[5]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "fmla z27.h, z15.h, z2.h[5]\n"
+                        "b.eq 5f\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z24.h, z8.h, z2.h[6]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z25.h, z9.h, z2.h[6]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z26.h, z10.h, z2.h[6]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "fmla z27.h, z11.h, z2.h[6]\n"
+                        "b 5f\n"
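+                        // 4: no extra tail block: finish the last full 8-value block from
+                        // z0-z2, preloading the partial block into z4-z6 under p6.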
+                        "4:\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+                        "fmla z24.h, z8.h, z2.h[0]\n"
+                        "ld1rqh z5.h, p6/z, [a_ptr1]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z6.h, p6/z, [a_ptr2]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z25.h, z9.h, z2.h[0]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "fmla z26.h, z10.h, z2.h[0]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "fmla z27.h, z11.h, z2.h[0]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "fmla z24.h, z12.h, z2.h[1]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "fmla z25.h, z13.h, z2.h[1]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "fmla z26.h, z14.h, z2.h[1]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "fmla z27.h, z15.h, z2.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z24.h, z8.h, z2.h[2]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "fmla z25.h, z9.h, z2.h[2]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "fmla z26.h, z10.h, z2.h[2]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "fmla z27.h, z11.h, z2.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "fmla z24.h, z12.h, z2.h[3]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "fmla z25.h, z13.h, z2.h[3]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "fmla z26.h, z14.h, z2.h[3]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "fmla z27.h, z15.h, z2.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "fmla z24.h, z8.h, z2.h[4]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "fmla z25.h, z9.h, z2.h[4]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "fmla z26.h, z10.h, z2.h[4]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "fmla z27.h, z11.h, z2.h[4]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "fmla z24.h, z12.h, z2.h[5]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "fmla z25.h, z13.h, z2.h[5]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "fmla z26.h, z14.h, z2.h[5]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "fmla z27.h, z15.h, z2.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z24.h, z8.h, z2.h[6]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z25.h, z9.h, z2.h[6]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z26.h, z10.h, z2.h[6]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "fmla z27.h, z11.h, z2.h[6]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "fmla z20.h, z12.h, z1.h[7]\n"
+                        "fmla z24.h, z12.h, z2.h[7]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z21.h, z13.h, z1.h[7]\n"
+                        "fmla z25.h, z13.h, z2.h[7]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z22.h, z14.h, z1.h[7]\n"
+                        "fmla z26.h, z14.h, z2.h[7]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "fmla z23.h, z15.h, z1.h[7]\n"
+                        "fmla z27.h, z15.h, z2.h[7]\n"
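+                        // Drain the leftover K values from the predicated block, one lane
+                        // of z4-z6 per step.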
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "fmla z20.h, z8.h, z5.h[0]\n"
+                        "fmla z24.h, z8.h, z6.h[0]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "fmla z21.h, z9.h, z5.h[0]\n"
+                        "fmla z25.h, z9.h, z6.h[0]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z22.h, z10.h, z5.h[0]\n"
+                        "fmla z26.h, z10.h, z6.h[0]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "fmla z23.h, z11.h, z5.h[0]\n"
+                        "fmla z27.h, z11.h, z6.h[0]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z12.h, z5.h[1]\n"
+                        "fmla z24.h, z12.h, z6.h[1]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z21.h, z13.h, z5.h[1]\n"
+                        "fmla z25.h, z13.h, z6.h[1]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z22.h, z14.h, z5.h[1]\n"
+                        "fmla z26.h, z14.h, z6.h[1]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "fmla z23.h, z15.h, z5.h[1]\n"
+                        "fmla z27.h, z15.h, z6.h[1]\n"
+                        "b.eq 5f\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.h, z8.h, z5.h[2]\n"
+                        "fmla z24.h, z8.h, z6.h[2]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "fmla z21.h, z9.h, z5.h[2]\n"
+                        "fmla z25.h, z9.h, z6.h[2]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "fmla z22.h, z10.h, z5.h[2]\n"
+                        "fmla z26.h, z10.h, z6.h[2]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "fmla z23.h, z11.h, z5.h[2]\n"
+                        "fmla z27.h, z11.h, z6.h[2]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z20.h, z12.h, z5.h[3]\n"
+                        "fmla z24.h, z12.h, z6.h[3]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z21.h, z13.h, z5.h[3]\n"
+                        "fmla z25.h, z13.h, z6.h[3]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z22.h, z14.h, z5.h[3]\n"
+                        "fmla z26.h, z14.h, z6.h[3]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "fmla z23.h, z15.h, z5.h[3]\n"
+                        "fmla z27.h, z15.h, z6.h[3]\n"
+                        "b.eq 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "fmla z20.h, z8.h, z5.h[4]\n"
+                        "fmla z24.h, z8.h, z6.h[4]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z21.h, z9.h, z5.h[4]\n"
+                        "fmla z25.h, z9.h, z6.h[4]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z22.h, z10.h, z5.h[4]\n"
+                        "fmla z26.h, z10.h, z6.h[4]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "fmla z23.h, z11.h, z5.h[4]\n"
+                        "fmla z27.h, z11.h, z6.h[4]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z12.h, z5.h[5]\n"
+                        "fmla z24.h, z12.h, z6.h[5]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z21.h, z13.h, z5.h[5]\n"
+                        "fmla z25.h, z13.h, z6.h[5]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z22.h, z14.h, z5.h[5]\n"
+                        "fmla z26.h, z14.h, z6.h[5]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "fmla z23.h, z15.h, z5.h[5]\n"
+                        "fmla z27.h, z15.h, z6.h[5]\n"
+                        "b.eq 5f\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "fmla z20.h, z8.h, z5.h[6]\n"
+                        "fmla z24.h, z8.h, z6.h[6]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z21.h, z9.h, z5.h[6]\n"
+                        "fmla z25.h, z9.h, z6.h[6]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z22.h, z10.h, z5.h[6]\n"
+                        "fmla z26.h, z10.h, z6.h[6]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "fmla z23.h, z11.h, z5.h[6]\n"
+                        "fmla z27.h, z11.h, z6.h[6]\n"
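+                        // 5: store the accumulators back to C, four vectors per row under
+                        // the column predicates p0-p3.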
+                        "5:\n"
+                        "st1h z16.h, p0, [%[c_ptr0]]\n"
+                        "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        "st1h z20.h, p0, [c_ptr1]\n"
+                        "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n"
+                        "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n"
+                        "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n"
+                        "st1h z24.h, p0, [c_ptr2]\n"
+                        "st1h z25.h, p1, [c_ptr2, #1, MUL VL]\n"
+                        "st1h z26.h, p2, [c_ptr2, #2, MUL VL]\n"
+                        "st1h z27.h, p3, [c_ptr2, #3, MUL VL]\n"
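+                        // Release the temporary register aliases.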
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
+                    );
+                    break;
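+                // Default: process four rows of A and C per pass.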
+                default:
+                case 4:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "a_ptr3 .req X2\n"
+                        "c_ptr1 .req X3\n"
+                        "c_ptr2 .req X4\n"
+                        "c_ptr3 .req X5\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "add a_ptr3, a_ptr2, %[lda]\n"
+                        "add c_ptr3, c_ptr2, %[ldc]\n"
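+                        // Predicates: p6 masks the K leftovers, p0-p3 mask the four output
+                        // column vectors against %[width], and p7 is all-true.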
+                        "whilelt p6.h, %[temp], %[leftovers]\n"
+                        "whilelt p0.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "ptrue p7.h\n"
+                        "whilelt p1.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "whilelt p2.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "whilelt p3.h, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
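+                        // %[beta0] non-zero means beta == 0: zero the accumulators while
+                        // preloading the first A block of each row and the leading B vectors.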
+                        "mov z16.h, #0\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.h, #0\n"
+                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+                        "mov z18.h, #0\n"
+                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
+                        "mov z19.h, #0\n"
+                        "ld1rqh z3.h, p7/z, [a_ptr3]\n"
+                        "mov z20.h, #0\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "mov z21.h, #0\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z22.h, #0\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "mov z23.h, #0\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z24.h, #0\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "mov z25.h, #0\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "mov z26.h, #0\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "mov z27.h, #0\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "mov z28.h, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "mov z29.h, #0\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "mov z30.h, #0\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "mov z31.h, #0\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
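+                        // 1: beta != 0: load the current C tile, scale it by beta (broadcast
+                        // into z15), then preload A and B as above.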
+                        "1:\n"
+                        "ld1rh z15.h, p7/z, [%[betaptr]]\n"
+                        "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
+                        "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "fmul z16.h, p7/m, z16.h, z15.h\n"
+                        "ld1h z20.h, p0/z, [c_ptr1]\n"
+                        "fmul z17.h, p7/m, z17.h, z15.h\n"
+                        "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "fmul z18.h, p7/m, z18.h, z15.h\n"
+                        "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "fmul z19.h, p7/m, z19.h, z15.h\n"
+                        "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "fmul z20.h, p7/m, z20.h, z15.h\n"
+                        "ld1h z24.h, p0/z, [c_ptr2]\n"
+                        "fmul z21.h, p7/m, z21.h, z15.h\n"
+                        "ld1h z25.h, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "fmul z22.h, p7/m, z22.h, z15.h\n"
+                        "ld1h z26.h, p2/z, [c_ptr2, #2, MUL VL]\n"
+                        "fmul z23.h, p7/m, z23.h, z15.h\n"
+                        "ld1h z27.h, p3/z, [c_ptr2, #3, MUL VL]\n"
+                        "fmul z24.h, p7/m, z24.h, z15.h\n"
+                        "ld1h z28.h, p0/z, [c_ptr3]\n"
+                        "fmul z25.h, p7/m, z25.h, z15.h\n"
+                        "ld1h z29.h, p1/z, [c_ptr3, #1, MUL VL]\n"
+                        "fmul z26.h, p7/m, z26.h, z15.h\n"
+                        "ld1h z30.h, p2/z, [c_ptr3, #2, MUL VL]\n"
+                        "fmul z27.h, p7/m, z27.h, z15.h\n"
+                        "ld1h z31.h, p3/z, [c_ptr3, #3, MUL VL]\n"
+                        "fmul z28.h, p7/m, z28.h, z15.h\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+                        "fmul z29.h, p7/m, z29.h, z15.h\n"
+                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+                        "fmul z30.h, p7/m, z30.h, z15.h\n"
+                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
+                        "fmul z31.h, p7/m, z31.h, z15.h\n"
+                        "ld1rqh z3.h, p7/z, [a_ptr3]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
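+                        // 3: main K loop: two unrolled 8-value sub-blocks per iteration,
+                        // alternating A registers z0-z3 and z4-z7 while streaming B through
+                        // z8-z15.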
+                        "3:\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+                        "fmla z24.h, z8.h, z2.h[0]\n"
+                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+                        "fmla z28.h, z8.h, z3.h[0]\n"
+                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z7.h, p7/z, [a_ptr3]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z25.h, z9.h, z2.h[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla z29.h, z9.h, z3.h[0]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "fmla z26.h, z10.h, z2.h[0]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "fmla z30.h, z10.h, z3.h[0]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "add a_ptr3, a_ptr3, #0x20\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "fmla z27.h, z11.h, z2.h[0]\n"
+                        "fmla z31.h, z11.h, z3.h[0]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "fmla z24.h, z12.h, z2.h[1]\n"
+                        "fmla z28.h, z12.h, z3.h[1]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "fmla z25.h, z13.h, z2.h[1]\n"
+                        "fmla z29.h, z13.h, z3.h[1]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "fmla z26.h, z14.h, z2.h[1]\n"
+                        "fmla z30.h, z14.h, z3.h[1]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "fmla z27.h, z15.h, z2.h[1]\n"
+                        "fmla z31.h, z15.h, z3.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z24.h, z8.h, z2.h[2]\n"
+                        "fmla z28.h, z8.h, z3.h[2]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "fmla z25.h, z9.h, z2.h[2]\n"
+                        "fmla z29.h, z9.h, z3.h[2]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "fmla z26.h, z10.h, z2.h[2]\n"
+                        "fmla z30.h, z10.h, z3.h[2]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "fmla z27.h, z11.h, z2.h[2]\n"
+                        "fmla z31.h, z11.h, z3.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "fmla z24.h, z12.h, z2.h[3]\n"
+                        "fmla z28.h, z12.h, z3.h[3]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "fmla z25.h, z13.h, z2.h[3]\n"
+                        "fmla z29.h, z13.h, z3.h[3]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "fmla z26.h, z14.h, z2.h[3]\n"
+                        "fmla z30.h, z14.h, z3.h[3]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "fmla z27.h, z15.h, z2.h[3]\n"
+                        "fmla z31.h, z15.h, z3.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "fmla z24.h, z8.h, z2.h[4]\n"
+                        "fmla z28.h, z8.h, z3.h[4]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "fmla z25.h, z9.h, z2.h[4]\n"
+                        "fmla z29.h, z9.h, z3.h[4]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "fmla z26.h, z10.h, z2.h[4]\n"
+                        "fmla z30.h, z10.h, z3.h[4]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "fmla z27.h, z11.h, z2.h[4]\n"
+                        "fmla z31.h, z11.h, z3.h[4]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "fmla z24.h, z12.h, z2.h[5]\n"
+                        "fmla z28.h, z12.h, z3.h[5]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "fmla z25.h, z13.h, z2.h[5]\n"
+                        "fmla z29.h, z13.h, z3.h[5]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "fmla z26.h, z14.h, z2.h[5]\n"
+                        "fmla z30.h, z14.h, z3.h[5]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "fmla z27.h, z15.h, z2.h[5]\n"
+                        "fmla z31.h, z15.h, z3.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z24.h, z8.h, z2.h[6]\n"
+                        "fmla z28.h, z8.h, z3.h[6]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z25.h, z9.h, z2.h[6]\n"
+                        "fmla z29.h, z9.h, z3.h[6]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z26.h, z10.h, z2.h[6]\n"
+                        "fmla z30.h, z10.h, z3.h[6]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "fmla z27.h, z11.h, z2.h[6]\n"
+                        "fmla z31.h, z11.h, z3.h[6]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "fmla z20.h, z12.h, z1.h[7]\n"
+                        "fmla z24.h, z12.h, z2.h[7]\n"
+                        "fmla z28.h, z12.h, z3.h[7]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z21.h, z13.h, z1.h[7]\n"
+                        "fmla z25.h, z13.h, z2.h[7]\n"
+                        "fmla z29.h, z13.h, z3.h[7]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z22.h, z14.h, z1.h[7]\n"
+                        "fmla z26.h, z14.h, z2.h[7]\n"
+                        "fmla z30.h, z14.h, z3.h[7]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "fmla z23.h, z15.h, z1.h[7]\n"
+                        "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
+                        "fmla z27.h, z15.h, z2.h[7]\n"
+                        "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
+                        "fmla z31.h, z15.h, z3.h[7]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n"
+                        "fmla z20.h, z8.h, z5.h[0]\n"
+                        "fmla z24.h, z8.h, z6.h[0]\n"
+                        "fmla z28.h, z8.h, z7.h[0]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "fmla z21.h, z9.h, z5.h[0]\n"
+                        "fmla z25.h, z9.h, z6.h[0]\n"
+                        "fmla z29.h, z9.h, z7.h[0]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z22.h, z10.h, z5.h[0]\n"
+                        "fmla z26.h, z10.h, z6.h[0]\n"
+                        "fmla z30.h, z10.h, z7.h[0]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "fmla z23.h, z11.h, z5.h[0]\n"
+                        "fmla z27.h, z11.h, z6.h[0]\n"
+                        "fmla z31.h, z11.h, z7.h[0]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "fmla z20.h, z12.h, z5.h[1]\n"
+                        "fmla z24.h, z12.h, z6.h[1]\n"
+                        "fmla z28.h, z12.h, z7.h[1]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z21.h, z13.h, z5.h[1]\n"
+                        "fmla z25.h, z13.h, z6.h[1]\n"
+                        "fmla z29.h, z13.h, z7.h[1]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z22.h, z14.h, z5.h[1]\n"
+                        "fmla z26.h, z14.h, z6.h[1]\n"
+                        "fmla z30.h, z14.h, z7.h[1]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "fmla z23.h, z15.h, z5.h[1]\n"
+                        "fmla z27.h, z15.h, z6.h[1]\n"
+                        "fmla z31.h, z15.h, z7.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z5.h[2]\n"
+                        "fmla z24.h, z8.h, z6.h[2]\n"
+                        "fmla z28.h, z8.h, z7.h[2]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "fmla z21.h, z9.h, z5.h[2]\n"
+                        "fmla z25.h, z9.h, z6.h[2]\n"
+                        "fmla z29.h, z9.h, z7.h[2]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "fmla z22.h, z10.h, z5.h[2]\n"
+                        "fmla z26.h, z10.h, z6.h[2]\n"
+                        "fmla z30.h, z10.h, z7.h[2]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "fmla z23.h, z11.h, z5.h[2]\n"
+                        "fmla z27.h, z11.h, z6.h[2]\n"
+                        "fmla z31.h, z11.h, z7.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "fmla z20.h, z12.h, z5.h[3]\n"
+                        "fmla z24.h, z12.h, z6.h[3]\n"
+                        "fmla z28.h, z12.h, z7.h[3]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z21.h, z13.h, z5.h[3]\n"
+                        "fmla z25.h, z13.h, z6.h[3]\n"
+                        "fmla z29.h, z13.h, z7.h[3]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z22.h, z14.h, z5.h[3]\n"
+                        "fmla z26.h, z14.h, z6.h[3]\n"
+                        "fmla z30.h, z14.h, z7.h[3]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "fmla z23.h, z15.h, z5.h[3]\n"
+                        "fmla z27.h, z15.h, z6.h[3]\n"
+                        "fmla z31.h, z15.h, z7.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "fmla z20.h, z8.h, z5.h[4]\n"
+                        "fmla z24.h, z8.h, z6.h[4]\n"
+                        "fmla z28.h, z8.h, z7.h[4]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z21.h, z9.h, z5.h[4]\n"
+                        "fmla z25.h, z9.h, z6.h[4]\n"
+                        "fmla z29.h, z9.h, z7.h[4]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z22.h, z10.h, z5.h[4]\n"
+                        "fmla z26.h, z10.h, z6.h[4]\n"
+                        "fmla z30.h, z10.h, z7.h[4]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "fmla z23.h, z11.h, z5.h[4]\n"
+                        "fmla z27.h, z11.h, z6.h[4]\n"
+                        "fmla z31.h, z11.h, z7.h[4]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "fmla z20.h, z12.h, z5.h[5]\n"
+                        "fmla z24.h, z12.h, z6.h[5]\n"
+                        "fmla z28.h, z12.h, z7.h[5]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z21.h, z13.h, z5.h[5]\n"
+                        "fmla z25.h, z13.h, z6.h[5]\n"
+                        "fmla z29.h, z13.h, z7.h[5]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z22.h, z14.h, z5.h[5]\n"
+                        "fmla z26.h, z14.h, z6.h[5]\n"
+                        "fmla z30.h, z14.h, z7.h[5]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "fmla z23.h, z15.h, z5.h[5]\n"
+                        "fmla z27.h, z15.h, z6.h[5]\n"
+                        "fmla z31.h, z15.h, z7.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z5.h[6]\n"
+                        "fmla z24.h, z8.h, z6.h[6]\n"
+                        "fmla z28.h, z8.h, z7.h[6]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z21.h, z9.h, z5.h[6]\n"
+                        "fmla z25.h, z9.h, z6.h[6]\n"
+                        "fmla z29.h, z9.h, z7.h[6]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z22.h, z10.h, z5.h[6]\n"
+                        "fmla z26.h, z10.h, z6.h[6]\n"
+                        "fmla z30.h, z10.h, z7.h[6]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "fmla z23.h, z11.h, z5.h[6]\n"
+                        "fmla z27.h, z11.h, z6.h[6]\n"
+                        "fmla z31.h, z11.h, z7.h[6]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[7]\n"
+                        "fmla z20.h, z12.h, z5.h[7]\n"
+                        "fmla z24.h, z12.h, z6.h[7]\n"
+                        "fmla z28.h, z12.h, z7.h[7]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[7]\n"
+                        "fmla z21.h, z13.h, z5.h[7]\n"
+                        "fmla z25.h, z13.h, z6.h[7]\n"
+                        "fmla z29.h, z13.h, z7.h[7]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[7]\n"
+                        "fmla z22.h, z14.h, z5.h[7]\n"
+                        "fmla z26.h, z14.h, z6.h[7]\n"
+                        "fmla z30.h, z14.h, z7.h[7]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[7]\n"
+                        "fmla z23.h, z15.h, z5.h[7]\n"
+                        "fmla z27.h, z15.h, z6.h[7]\n"
+                        "fmla z31.h, z15.h, z7.h[7]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+                        "fmla z24.h, z8.h, z2.h[0]\n"
+                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+                        "fmla z28.h, z8.h, z3.h[0]\n"
+                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z7.h, p7/z, [a_ptr3]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z25.h, z9.h, z2.h[0]\n"
+                        "fmla z29.h, z9.h, z3.h[0]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "fmla z26.h, z10.h, z2.h[0]\n"
+                        "fmla z30.h, z10.h, z3.h[0]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "fmla z27.h, z11.h, z2.h[0]\n"
+                        "fmla z31.h, z11.h, z3.h[0]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "fmla z24.h, z12.h, z2.h[1]\n"
+                        "fmla z28.h, z12.h, z3.h[1]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "fmla z25.h, z13.h, z2.h[1]\n"
+                        "fmla z29.h, z13.h, z3.h[1]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "fmla z26.h, z14.h, z2.h[1]\n"
+                        "fmla z30.h, z14.h, z3.h[1]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "fmla z27.h, z15.h, z2.h[1]\n"
+                        "fmla z31.h, z15.h, z3.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z24.h, z8.h, z2.h[2]\n"
+                        "fmla z28.h, z8.h, z3.h[2]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "fmla z25.h, z9.h, z2.h[2]\n"
+                        "fmla z29.h, z9.h, z3.h[2]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "fmla z26.h, z10.h, z2.h[2]\n"
+                        "fmla z30.h, z10.h, z3.h[2]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "fmla z27.h, z11.h, z2.h[2]\n"
+                        "fmla z31.h, z11.h, z3.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "fmla z24.h, z12.h, z2.h[3]\n"
+                        "fmla z28.h, z12.h, z3.h[3]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "fmla z25.h, z13.h, z2.h[3]\n"
+                        "fmla z29.h, z13.h, z3.h[3]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "fmla z26.h, z14.h, z2.h[3]\n"
+                        "fmla z30.h, z14.h, z3.h[3]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "fmla z27.h, z15.h, z2.h[3]\n"
+                        "fmla z31.h, z15.h, z3.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "fmla z24.h, z8.h, z2.h[4]\n"
+                        "fmla z28.h, z8.h, z3.h[4]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "fmla z25.h, z9.h, z2.h[4]\n"
+                        "fmla z29.h, z9.h, z3.h[4]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "fmla z26.h, z10.h, z2.h[4]\n"
+                        "fmla z30.h, z10.h, z3.h[4]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "fmla z27.h, z11.h, z2.h[4]\n"
+                        "fmla z31.h, z11.h, z3.h[4]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "fmla z24.h, z12.h, z2.h[5]\n"
+                        "fmla z28.h, z12.h, z3.h[5]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "fmla z25.h, z13.h, z2.h[5]\n"
+                        "fmla z29.h, z13.h, z3.h[5]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "fmla z26.h, z14.h, z2.h[5]\n"
+                        "fmla z30.h, z14.h, z3.h[5]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "fmla z27.h, z15.h, z2.h[5]\n"
+                        "fmla z31.h, z15.h, z3.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z24.h, z8.h, z2.h[6]\n"
+                        "fmla z28.h, z8.h, z3.h[6]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z25.h, z9.h, z2.h[6]\n"
+                        "fmla z29.h, z9.h, z3.h[6]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z26.h, z10.h, z2.h[6]\n"
+                        "fmla z30.h, z10.h, z3.h[6]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "fmla z27.h, z11.h, z2.h[6]\n"
+                        "fmla z31.h, z11.h, z3.h[6]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "fmla z20.h, z12.h, z1.h[7]\n"
+                        "fmla z24.h, z12.h, z2.h[7]\n"
+                        "fmla z28.h, z12.h, z3.h[7]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z21.h, z13.h, z1.h[7]\n"
+                        "fmla z25.h, z13.h, z2.h[7]\n"
+                        "fmla z29.h, z13.h, z3.h[7]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z22.h, z14.h, z1.h[7]\n"
+                        "fmla z26.h, z14.h, z2.h[7]\n"
+                        "fmla z30.h, z14.h, z3.h[7]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
+                        "fmla z23.h, z15.h, z1.h[7]\n"
+                        "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
+                        "fmla z27.h, z15.h, z2.h[7]\n"
+                        "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
+                        "fmla z31.h, z15.h, z3.h[7]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n"
+                        "fmla z20.h, z8.h, z5.h[0]\n"
+                        "fmla z24.h, z8.h, z6.h[0]\n"
+                        "fmla z28.h, z8.h, z7.h[0]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "fmla z21.h, z9.h, z5.h[0]\n"
+                        "fmla z25.h, z9.h, z6.h[0]\n"
+                        "fmla z29.h, z9.h, z7.h[0]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z22.h, z10.h, z5.h[0]\n"
+                        "fmla z26.h, z10.h, z6.h[0]\n"
+                        "fmla z30.h, z10.h, z7.h[0]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "fmla z23.h, z11.h, z5.h[0]\n"
+                        "fmla z27.h, z11.h, z6.h[0]\n"
+                        "fmla z31.h, z11.h, z7.h[0]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "fmla z20.h, z12.h, z5.h[1]\n"
+                        "fmla z24.h, z12.h, z6.h[1]\n"
+                        "fmla z28.h, z12.h, z7.h[1]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z21.h, z13.h, z5.h[1]\n"
+                        "fmla z25.h, z13.h, z6.h[1]\n"
+                        "fmla z29.h, z13.h, z7.h[1]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z22.h, z14.h, z5.h[1]\n"
+                        "fmla z26.h, z14.h, z6.h[1]\n"
+                        "fmla z30.h, z14.h, z7.h[1]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "fmla z23.h, z15.h, z5.h[1]\n"
+                        "fmla z27.h, z15.h, z6.h[1]\n"
+                        "fmla z31.h, z15.h, z7.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z5.h[2]\n"
+                        "fmla z24.h, z8.h, z6.h[2]\n"
+                        "fmla z28.h, z8.h, z7.h[2]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "fmla z21.h, z9.h, z5.h[2]\n"
+                        "fmla z25.h, z9.h, z6.h[2]\n"
+                        "fmla z29.h, z9.h, z7.h[2]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "fmla z22.h, z10.h, z5.h[2]\n"
+                        "fmla z26.h, z10.h, z6.h[2]\n"
+                        "fmla z30.h, z10.h, z7.h[2]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "fmla z23.h, z11.h, z5.h[2]\n"
+                        "fmla z27.h, z11.h, z6.h[2]\n"
+                        "fmla z31.h, z11.h, z7.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "fmla z20.h, z12.h, z5.h[3]\n"
+                        "fmla z24.h, z12.h, z6.h[3]\n"
+                        "fmla z28.h, z12.h, z7.h[3]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z21.h, z13.h, z5.h[3]\n"
+                        "fmla z25.h, z13.h, z6.h[3]\n"
+                        "fmla z29.h, z13.h, z7.h[3]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z22.h, z14.h, z5.h[3]\n"
+                        "fmla z26.h, z14.h, z6.h[3]\n"
+                        "fmla z30.h, z14.h, z7.h[3]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "fmla z23.h, z15.h, z5.h[3]\n"
+                        "fmla z27.h, z15.h, z6.h[3]\n"
+                        "fmla z31.h, z15.h, z7.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "fmla z20.h, z8.h, z5.h[4]\n"
+                        "fmla z24.h, z8.h, z6.h[4]\n"
+                        "fmla z28.h, z8.h, z7.h[4]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z21.h, z9.h, z5.h[4]\n"
+                        "fmla z25.h, z9.h, z6.h[4]\n"
+                        "fmla z29.h, z9.h, z7.h[4]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z22.h, z10.h, z5.h[4]\n"
+                        "fmla z26.h, z10.h, z6.h[4]\n"
+                        "fmla z30.h, z10.h, z7.h[4]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "fmla z23.h, z11.h, z5.h[4]\n"
+                        "fmla z27.h, z11.h, z6.h[4]\n"
+                        "fmla z31.h, z11.h, z7.h[4]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "fmla z20.h, z12.h, z5.h[5]\n"
+                        "fmla z24.h, z12.h, z6.h[5]\n"
+                        "fmla z28.h, z12.h, z7.h[5]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z21.h, z13.h, z5.h[5]\n"
+                        "fmla z25.h, z13.h, z6.h[5]\n"
+                        "fmla z29.h, z13.h, z7.h[5]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z22.h, z14.h, z5.h[5]\n"
+                        "fmla z26.h, z14.h, z6.h[5]\n"
+                        "fmla z30.h, z14.h, z7.h[5]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "fmla z23.h, z15.h, z5.h[5]\n"
+                        "fmla z27.h, z15.h, z6.h[5]\n"
+                        "fmla z31.h, z15.h, z7.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "fmla z20.h, z8.h, z5.h[6]\n"
+                        "fmla z24.h, z8.h, z6.h[6]\n"
+                        "fmla z28.h, z8.h, z7.h[6]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z21.h, z9.h, z5.h[6]\n"
+                        "fmla z25.h, z9.h, z6.h[6]\n"
+                        "fmla z29.h, z9.h, z7.h[6]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z22.h, z10.h, z5.h[6]\n"
+                        "fmla z26.h, z10.h, z6.h[6]\n"
+                        "fmla z30.h, z10.h, z7.h[6]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "fmla z23.h, z11.h, z5.h[6]\n"
+                        "fmla z27.h, z11.h, z6.h[6]\n"
+                        "fmla z31.h, z11.h, z7.h[6]\n"
+                        "fmla z16.h, z12.h, z4.h[7]\n"
+                        "fmla z20.h, z12.h, z5.h[7]\n"
+                        "fmla z24.h, z12.h, z6.h[7]\n"
+                        "fmla z28.h, z12.h, z7.h[7]\n"
+                        "fmla z17.h, z13.h, z4.h[7]\n"
+                        "fmla z21.h, z13.h, z5.h[7]\n"
+                        "fmla z25.h, z13.h, z6.h[7]\n"
+                        "fmla z29.h, z13.h, z7.h[7]\n"
+                        "fmla z18.h, z14.h, z4.h[7]\n"
+                        "fmla z22.h, z14.h, z5.h[7]\n"
+                        "fmla z26.h, z14.h, z6.h[7]\n"
+                        "fmla z30.h, z14.h, z7.h[7]\n"
+                        "fmla z19.h, z15.h, z4.h[7]\n"
+                        "fmla z23.h, z15.h, z5.h[7]\n"
+                        "fmla z27.h, z15.h, z6.h[7]\n"
+                        "fmla z31.h, z15.h, z7.h[7]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "fmla z24.h, z8.h, z2.h[0]\n"
+                        "fmla z28.h, z8.h, z3.h[0]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "fmla z25.h, z9.h, z2.h[0]\n"
+                        "fmla z29.h, z9.h, z3.h[0]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "fmla z26.h, z10.h, z2.h[0]\n"
+                        "fmla z30.h, z10.h, z3.h[0]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "fmla z27.h, z11.h, z2.h[0]\n"
+                        "fmla z31.h, z11.h, z3.h[0]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "fmla z24.h, z12.h, z2.h[1]\n"
+                        "fmla z28.h, z12.h, z3.h[1]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "fmla z25.h, z13.h, z2.h[1]\n"
+                        "fmla z29.h, z13.h, z3.h[1]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "fmla z26.h, z14.h, z2.h[1]\n"
+                        "fmla z30.h, z14.h, z3.h[1]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "fmla z27.h, z15.h, z2.h[1]\n"
+                        "fmla z31.h, z15.h, z3.h[1]\n"
+                        "b.eq 5f\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z24.h, z8.h, z2.h[2]\n"
+                        "fmla z28.h, z8.h, z3.h[2]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "fmla z25.h, z9.h, z2.h[2]\n"
+                        "fmla z29.h, z9.h, z3.h[2]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "fmla z26.h, z10.h, z2.h[2]\n"
+                        "fmla z30.h, z10.h, z3.h[2]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "fmla z27.h, z11.h, z2.h[2]\n"
+                        "fmla z31.h, z11.h, z3.h[2]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "fmla z24.h, z12.h, z2.h[3]\n"
+                        "fmla z28.h, z12.h, z3.h[3]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "fmla z25.h, z13.h, z2.h[3]\n"
+                        "fmla z29.h, z13.h, z3.h[3]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "fmla z26.h, z14.h, z2.h[3]\n"
+                        "fmla z30.h, z14.h, z3.h[3]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "fmla z27.h, z15.h, z2.h[3]\n"
+                        "fmla z31.h, z15.h, z3.h[3]\n"
+                        "b.eq 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "fmla z24.h, z8.h, z2.h[4]\n"
+                        "fmla z28.h, z8.h, z3.h[4]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "fmla z25.h, z9.h, z2.h[4]\n"
+                        "fmla z29.h, z9.h, z3.h[4]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "fmla z26.h, z10.h, z2.h[4]\n"
+                        "fmla z30.h, z10.h, z3.h[4]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "fmla z27.h, z11.h, z2.h[4]\n"
+                        "fmla z31.h, z11.h, z3.h[4]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "fmla z24.h, z12.h, z2.h[5]\n"
+                        "fmla z28.h, z12.h, z3.h[5]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "fmla z25.h, z13.h, z2.h[5]\n"
+                        "fmla z29.h, z13.h, z3.h[5]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "fmla z26.h, z14.h, z2.h[5]\n"
+                        "fmla z30.h, z14.h, z3.h[5]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "fmla z27.h, z15.h, z2.h[5]\n"
+                        "fmla z31.h, z15.h, z3.h[5]\n"
+                        "b.eq 5f\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z24.h, z8.h, z2.h[6]\n"
+                        "fmla z28.h, z8.h, z3.h[6]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z25.h, z9.h, z2.h[6]\n"
+                        "fmla z29.h, z9.h, z3.h[6]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z26.h, z10.h, z2.h[6]\n"
+                        "fmla z30.h, z10.h, z3.h[6]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "fmla z27.h, z11.h, z2.h[6]\n"
+                        "fmla z31.h, z11.h, z3.h[6]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+                        "fmla z24.h, z8.h, z2.h[0]\n"
+                        "ld1rqh z5.h, p6/z, [a_ptr1]\n"
+                        "fmla z28.h, z8.h, z3.h[0]\n"
+                        "ld1rqh z6.h, p6/z, [a_ptr2]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z7.h, p6/z, [a_ptr3]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z25.h, z9.h, z2.h[0]\n"
+                        "fmla z29.h, z9.h, z3.h[0]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "fmla z26.h, z10.h, z2.h[0]\n"
+                        "fmla z30.h, z10.h, z3.h[0]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "fmla z27.h, z11.h, z2.h[0]\n"
+                        "fmla z31.h, z11.h, z3.h[0]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "fmla z24.h, z12.h, z2.h[1]\n"
+                        "fmla z28.h, z12.h, z3.h[1]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "fmla z25.h, z13.h, z2.h[1]\n"
+                        "fmla z29.h, z13.h, z3.h[1]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "fmla z26.h, z14.h, z2.h[1]\n"
+                        "fmla z30.h, z14.h, z3.h[1]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "fmla z27.h, z15.h, z2.h[1]\n"
+                        "fmla z31.h, z15.h, z3.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z24.h, z8.h, z2.h[2]\n"
+                        "fmla z28.h, z8.h, z3.h[2]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "fmla z25.h, z9.h, z2.h[2]\n"
+                        "fmla z29.h, z9.h, z3.h[2]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "fmla z26.h, z10.h, z2.h[2]\n"
+                        "fmla z30.h, z10.h, z3.h[2]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "fmla z27.h, z11.h, z2.h[2]\n"
+                        "fmla z31.h, z11.h, z3.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "fmla z24.h, z12.h, z2.h[3]\n"
+                        "fmla z28.h, z12.h, z3.h[3]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "fmla z25.h, z13.h, z2.h[3]\n"
+                        "fmla z29.h, z13.h, z3.h[3]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "fmla z26.h, z14.h, z2.h[3]\n"
+                        "fmla z30.h, z14.h, z3.h[3]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "fmla z27.h, z15.h, z2.h[3]\n"
+                        "fmla z31.h, z15.h, z3.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "fmla z24.h, z8.h, z2.h[4]\n"
+                        "fmla z28.h, z8.h, z3.h[4]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "fmla z25.h, z9.h, z2.h[4]\n"
+                        "fmla z29.h, z9.h, z3.h[4]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "fmla z26.h, z10.h, z2.h[4]\n"
+                        "fmla z30.h, z10.h, z3.h[4]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "fmla z27.h, z11.h, z2.h[4]\n"
+                        "fmla z31.h, z11.h, z3.h[4]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "fmla z24.h, z12.h, z2.h[5]\n"
+                        "fmla z28.h, z12.h, z3.h[5]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "fmla z25.h, z13.h, z2.h[5]\n"
+                        "fmla z29.h, z13.h, z3.h[5]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "fmla z26.h, z14.h, z2.h[5]\n"
+                        "fmla z30.h, z14.h, z3.h[5]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "fmla z27.h, z15.h, z2.h[5]\n"
+                        "fmla z31.h, z15.h, z3.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z24.h, z8.h, z2.h[6]\n"
+                        "fmla z28.h, z8.h, z3.h[6]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z25.h, z9.h, z2.h[6]\n"
+                        "fmla z29.h, z9.h, z3.h[6]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z26.h, z10.h, z2.h[6]\n"
+                        "fmla z30.h, z10.h, z3.h[6]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "fmla z27.h, z11.h, z2.h[6]\n"
+                        "fmla z31.h, z11.h, z3.h[6]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "fmla z20.h, z12.h, z1.h[7]\n"
+                        "fmla z24.h, z12.h, z2.h[7]\n"
+                        "fmla z28.h, z12.h, z3.h[7]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z21.h, z13.h, z1.h[7]\n"
+                        "fmla z25.h, z13.h, z2.h[7]\n"
+                        "fmla z29.h, z13.h, z3.h[7]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z22.h, z14.h, z1.h[7]\n"
+                        "fmla z26.h, z14.h, z2.h[7]\n"
+                        "fmla z30.h, z14.h, z3.h[7]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "fmla z23.h, z15.h, z1.h[7]\n"
+                        "fmla z27.h, z15.h, z2.h[7]\n"
+                        "fmla z31.h, z15.h, z3.h[7]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "fmla z20.h, z8.h, z5.h[0]\n"
+                        "fmla z24.h, z8.h, z6.h[0]\n"
+                        "fmla z28.h, z8.h, z7.h[0]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "fmla z21.h, z9.h, z5.h[0]\n"
+                        "fmla z25.h, z9.h, z6.h[0]\n"
+                        "fmla z29.h, z9.h, z7.h[0]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z22.h, z10.h, z5.h[0]\n"
+                        "fmla z26.h, z10.h, z6.h[0]\n"
+                        "fmla z30.h, z10.h, z7.h[0]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "fmla z23.h, z11.h, z5.h[0]\n"
+                        "fmla z27.h, z11.h, z6.h[0]\n"
+                        "fmla z31.h, z11.h, z7.h[0]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z12.h, z5.h[1]\n"
+                        "fmla z24.h, z12.h, z6.h[1]\n"
+                        "fmla z28.h, z12.h, z7.h[1]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z21.h, z13.h, z5.h[1]\n"
+                        "fmla z25.h, z13.h, z6.h[1]\n"
+                        "fmla z29.h, z13.h, z7.h[1]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z22.h, z14.h, z5.h[1]\n"
+                        "fmla z26.h, z14.h, z6.h[1]\n"
+                        "fmla z30.h, z14.h, z7.h[1]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "fmla z23.h, z15.h, z5.h[1]\n"
+                        "fmla z27.h, z15.h, z6.h[1]\n"
+                        "fmla z31.h, z15.h, z7.h[1]\n"
+                        "b.eq 5f\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.h, z8.h, z5.h[2]\n"
+                        "fmla z24.h, z8.h, z6.h[2]\n"
+                        "fmla z28.h, z8.h, z7.h[2]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "fmla z21.h, z9.h, z5.h[2]\n"
+                        "fmla z25.h, z9.h, z6.h[2]\n"
+                        "fmla z29.h, z9.h, z7.h[2]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "fmla z22.h, z10.h, z5.h[2]\n"
+                        "fmla z26.h, z10.h, z6.h[2]\n"
+                        "fmla z30.h, z10.h, z7.h[2]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "fmla z23.h, z11.h, z5.h[2]\n"
+                        "fmla z27.h, z11.h, z6.h[2]\n"
+                        "fmla z31.h, z11.h, z7.h[2]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z20.h, z12.h, z5.h[3]\n"
+                        "fmla z24.h, z12.h, z6.h[3]\n"
+                        "fmla z28.h, z12.h, z7.h[3]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z21.h, z13.h, z5.h[3]\n"
+                        "fmla z25.h, z13.h, z6.h[3]\n"
+                        "fmla z29.h, z13.h, z7.h[3]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z22.h, z14.h, z5.h[3]\n"
+                        "fmla z26.h, z14.h, z6.h[3]\n"
+                        "fmla z30.h, z14.h, z7.h[3]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "fmla z23.h, z15.h, z5.h[3]\n"
+                        "fmla z27.h, z15.h, z6.h[3]\n"
+                        "fmla z31.h, z15.h, z7.h[3]\n"
+                        "b.eq 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "fmla z20.h, z8.h, z5.h[4]\n"
+                        "fmla z24.h, z8.h, z6.h[4]\n"
+                        "fmla z28.h, z8.h, z7.h[4]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z21.h, z9.h, z5.h[4]\n"
+                        "fmla z25.h, z9.h, z6.h[4]\n"
+                        "fmla z29.h, z9.h, z7.h[4]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z22.h, z10.h, z5.h[4]\n"
+                        "fmla z26.h, z10.h, z6.h[4]\n"
+                        "fmla z30.h, z10.h, z7.h[4]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "fmla z23.h, z11.h, z5.h[4]\n"
+                        "fmla z27.h, z11.h, z6.h[4]\n"
+                        "fmla z31.h, z11.h, z7.h[4]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z12.h, z5.h[5]\n"
+                        "fmla z24.h, z12.h, z6.h[5]\n"
+                        "fmla z28.h, z12.h, z7.h[5]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z21.h, z13.h, z5.h[5]\n"
+                        "fmla z25.h, z13.h, z6.h[5]\n"
+                        "fmla z29.h, z13.h, z7.h[5]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z22.h, z14.h, z5.h[5]\n"
+                        "fmla z26.h, z14.h, z6.h[5]\n"
+                        "fmla z30.h, z14.h, z7.h[5]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "fmla z23.h, z15.h, z5.h[5]\n"
+                        "fmla z27.h, z15.h, z6.h[5]\n"
+                        "fmla z31.h, z15.h, z7.h[5]\n"
+                        "b.eq 5f\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "fmla z20.h, z8.h, z5.h[6]\n"
+                        "fmla z24.h, z8.h, z6.h[6]\n"
+                        "fmla z28.h, z8.h, z7.h[6]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z21.h, z9.h, z5.h[6]\n"
+                        "fmla z25.h, z9.h, z6.h[6]\n"
+                        "fmla z29.h, z9.h, z7.h[6]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z22.h, z10.h, z5.h[6]\n"
+                        "fmla z26.h, z10.h, z6.h[6]\n"
+                        "fmla z30.h, z10.h, z7.h[6]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "fmla z23.h, z11.h, z5.h[6]\n"
+                        "fmla z27.h, z11.h, z6.h[6]\n"
+                        "fmla z31.h, z11.h, z7.h[6]\n"
+                        "5:\n"
+                        "st1h z16.h, p0, [%[c_ptr0]]\n"
+                        "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        "st1h z20.h, p0, [c_ptr1]\n"
+                        "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n"
+                        "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n"
+                        "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n"
+                        "st1h z24.h, p0, [c_ptr2]\n"
+                        "st1h z25.h, p1, [c_ptr2, #1, MUL VL]\n"
+                        "st1h z26.h, p2, [c_ptr2, #2, MUL VL]\n"
+                        "st1h z27.h, p3, [c_ptr2, #3, MUL VL]\n"
+                        "st1h z28.h, p0, [c_ptr3]\n"
+                        "st1h z29.h, p1, [c_ptr3, #1, MUL VL]\n"
+                        "st1h z30.h, p2, [c_ptr3, #2, MUL VL]\n"
+                        "st1h z31.h, p3, [c_ptr3, #3, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq a_ptr3\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        ".unreq c_ptr3\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+                    );
+                    break;
+            }
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp
new file mode 100644
index 0000000..ffd7918
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+#include <cstdint>
+#include "../std_transforms_sve.hpp"
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
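+// Argument order (from generic.cpp): A (leading dimension lda), packed B, C (leading dimension ldc), beta, M, N, K.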
+void sve_hybrid_s8s32_dot_4VLx4(const int8_t *, int, const int8_t *, int32_t *, int, int32_t, int, int, int);
+
+class hybrid_s8s32_dot_4VLx4
+{
+public:
+    typedef int8_t operand_type;
+    typedef int32_t result_type;
+
+    typedef void (*kern_type)(const int8_t *, int, const int8_t *, int32_t *, int, int32_t, int, int, int);
+
+    /* Kernel blocking parameters */
+    static unsigned int out_height()
+    {
+        return 4;
+    }
+
+    static unsigned int out_width()
+    {
+        return get_vector_length<int32_t>() * 4;
+    }
+
+    static unsigned int k_unroll()
+    {
+        return 4;
+    }
+
+    StdTransformsSVE<operand_type, result_type, 4, 4, 4> transforms = {};
+
+    // Default to the generic kernel
+    kern_type kernel = sve_hybrid_s8s32_dot_4VLx4;
+
+    hybrid_s8s32_dot_4VLx4(const CPUInfo *ci)
+    {
+        (void)ci; // only the generic kernel exists, so the CPU info is not consulted
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
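
For orientation, a minimal sketch of how a strategy class like this one can be driven. This is illustrative only: the library normally wires the strategy up through the arm_gemm framework rather than calling it directly, `run_tile` and `B_packed` are hypothetical names, and it assumes B has already been packed so that K is padded to a multiple of k_unroll(), as the K_stride computation in generic.cpp expects. The kernel signature itself is taken from the declaration above.

#ifdef __ARM_FEATURE_SVE
#include <cstdint>

// Hypothetical driver, assuming CPUInfo as used by the strategy constructors.
void run_tile(const CPUInfo *ci,
              const int8_t *A, int lda,
              const int8_t *B_packed, // padded so K is a multiple of strat.k_unroll()
              int32_t *C, int ldc,
              int M, int N, int K)
{
    arm_gemm::hybrid_s8s32_dot_4VLx4 strat(ci);
    // beta == 0 overwrites C; any other beta scales the existing C tile first.
    strat.kernel(A, lda, B_packed, C, ldc, /*beta=*/0, M, N, K);
}
#endif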
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp
new file mode 100644
index 0000000..673f186
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp
@@ -0,0 +1,2150 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_hybrid_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int32_t beta, int M, int N, int K) {
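+    // K_stride rounds K up to the kernel's k_unroll of 4, matching the packed-B layout.
+    // The main loop consumes 32 bytes of each A row per iteration; "regs" selects a
+    // 16- or 32-byte tail pass, and "blocks"/"leftovers" predicate the final partial
+    // 16-byte chunk in 4-byte sdot steps.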
+    const long beta0 = (beta == 0);
+    const int K_stride = ((K + 3) / 4) * 4;
+    const long loops_count = ((K + 16) / 32) - 1;
+    K -= loops_count * 32;
+    const long regs_count = (K / 16) - 1;
+    K -= (regs_count + 1) * 16;
+    const long leftovers = K;
+    const long blocks_count = (K + 3) / 4;
+
+    for (int y=0; y<M; y+=4) {
+        const int8_t * const a_ptr0_base = A + (y * lda);
+        const unsigned long ldab = lda * sizeof(int8_t);
+
+        int32_t *c_ptr0 = C + (y * ldc);
+        const unsigned long ldcb = ldc * sizeof(int32_t);
+
+        for (int x0=0; x0<N; x0+=(4 * get_vector_length<int32_t>())) {
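+            // width: int32 outputs produced this pass, at most four SVE vectors; the whilelt predicates below mask the tail.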
+            const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<int32_t>()));
+            const int32_t *betaptr = &beta;
+            long loops = loops_count;
+            long regs = regs_count;
+            long temp = 0;
+            long blocks = blocks_count;
+            const int8_t *a_ptr0 = a_ptr0_base;
+            const int8_t *b_ptr0 = B + (K_stride * x0);
+
+            switch(M-y) {
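+                // Dispatch on the rows remaining in this 4-row block; each count gets its own specialised inline-assembly body.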
+                case 1:
+                    __asm __volatile (
+                        "whilelt p6.b, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "ptrue p7.b\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p2.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p3.s, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.s, #0\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.s, #0\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "mov z18.s, #0\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z19.s, #0\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "mul z16.s, p7/m, z16.s, z15.s\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mul z17.s, p7/m, z17.s, z15.s\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "mul z18.s, p7/m, z18.s, z15.s\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mul z19.s, p7/m, z19.s, z15.s\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "b.eq 5f\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "b.eq 5f\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
+                        "5:\n"
+                        "st1w z16.s, p0, [%[c_ptr0]]\n"
+                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                    );
+                    break;
+                case 2:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "c_ptr1 .req X1\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "whilelt p6.b, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "ptrue p7.b\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p2.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p3.s, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.s, #0\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.s, #0\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "mov z18.s, #0\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "mov z19.s, #0\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z20.s, #0\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "mov z21.s, #0\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z22.s, #0\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "mov z23.s, #0\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "mul z16.s, p7/m, z16.s, z15.s\n"
+                        "ld1w z20.s, p0/z, [c_ptr1]\n"
+                        "mul z17.s, p7/m, z17.s, z15.s\n"
+                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "mul z18.s, p7/m, z18.s, z15.s\n"
+                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "mul z19.s, p7/m, z19.s, z15.s\n"
+                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "mul z20.s, p7/m, z20.s, z15.s\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mul z21.s, p7/m, z21.s, z15.s\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "mul z22.s, p7/m, z22.s, z15.s\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "mul z23.s, p7/m, z23.s, z15.s\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z21.s, z13.b, z1.b[3]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "sdot z23.s, z15.b, z1.b[3]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+                        "sdot z20.s, z8.b, z5.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "sdot z21.s, z9.b, z5.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "sdot z23.s, z11.b, z5.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "sdot z20.s, z12.b, z5.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "sdot z21.s, z13.b, z5.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "sdot z22.s, z14.b, z5.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "sdot z23.s, z15.b, z5.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "sdot z20.s, z8.b, z5.b[2]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "sdot z21.s, z9.b, z5.b[2]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "sdot z22.s, z10.b, z5.b[2]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "sdot z23.s, z11.b, z5.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "sdot z20.s, z12.b, z5.b[3]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "sdot z21.s, z13.b, z5.b[3]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "sdot z22.s, z14.b, z5.b[3]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
+                        "sdot z23.s, z15.b, z5.b[3]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z21.s, z13.b, z1.b[3]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+                        "sdot z23.s, z15.b, z1.b[3]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+                        "sdot z20.s, z8.b, z5.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "sdot z21.s, z9.b, z5.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "sdot z23.s, z11.b, z5.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "sdot z20.s, z12.b, z5.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "sdot z21.s, z13.b, z5.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "sdot z22.s, z14.b, z5.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "sdot z23.s, z15.b, z5.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "sdot z20.s, z8.b, z5.b[2]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "sdot z21.s, z9.b, z5.b[2]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "sdot z22.s, z10.b, z5.b[2]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "sdot z23.s, z11.b, z5.b[2]\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "sdot z20.s, z12.b, z5.b[3]\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "sdot z21.s, z13.b, z5.b[3]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "sdot z22.s, z14.b, z5.b[3]\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
+                        "sdot z23.s, z15.b, z5.b[3]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "b.eq 5f\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z21.s, z13.b, z1.b[3]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "sdot z23.s, z15.b, z1.b[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z21.s, z13.b, z1.b[3]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "sdot z23.s, z15.b, z1.b[3]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "sdot z20.s, z8.b, z5.b[0]\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "sdot z21.s, z9.b, z5.b[0]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "sdot z23.s, z11.b, z5.b[0]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z20.s, z12.b, z5.b[1]\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "sdot z21.s, z13.b, z5.b[1]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "sdot z22.s, z14.b, z5.b[1]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "sdot z23.s, z15.b, z5.b[1]\n"
+                        "b.eq 5f\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z20.s, z8.b, z5.b[2]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "sdot z21.s, z9.b, z5.b[2]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "sdot z22.s, z10.b, z5.b[2]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "sdot z23.s, z11.b, z5.b[2]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "sdot z20.s, z12.b, z5.b[3]\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "sdot z21.s, z13.b, z5.b[3]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "sdot z22.s, z14.b, z5.b[3]\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
+                        "sdot z23.s, z15.b, z5.b[3]\n"
+                        "5:\n"
+                        "st1w z16.s, p0, [%[c_ptr0]]\n"
+                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        "st1w z20.s, p0, [c_ptr1]\n"
+                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq c_ptr1\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
+                    );
+                    break;
+                case 3:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "c_ptr1 .req X2\n"
+                        "c_ptr2 .req X3\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "whilelt p6.b, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "ptrue p7.b\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p2.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p3.s, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.s, #0\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.s, #0\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "mov z18.s, #0\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                        "mov z19.s, #0\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "mov z20.s, #0\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z21.s, #0\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "mov z22.s, #0\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z23.s, #0\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "mov z24.s, #0\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "mov z25.s, #0\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "mov z26.s, #0\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "mov z27.s, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
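+                        // Generic-beta path: broadcast beta into z15, load the
+                        // 3x4-vector C tile under the column predicates and
+                        // multiply it by beta.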
+                        "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "mul z16.s, p7/m, z16.s, z15.s\n"
+                        "ld1w z20.s, p0/z, [c_ptr1]\n"
+                        "mul z17.s, p7/m, z17.s, z15.s\n"
+                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "mul z18.s, p7/m, z18.s, z15.s\n"
+                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "mul z19.s, p7/m, z19.s, z15.s\n"
+                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "mul z20.s, p7/m, z20.s, z15.s\n"
+                        "ld1w z24.s, p0/z, [c_ptr2]\n"
+                        "mul z21.s, p7/m, z21.s, z15.s\n"
+                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "mul z22.s, p7/m, z22.s, z15.s\n"
+                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+                        "mul z23.s, p7/m, z23.s, z15.s\n"
+                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+                        "mul z24.s, p7/m, z24.s, z15.s\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mul z25.s, p7/m, z25.s, z15.s\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "mul z26.s, p7/m, z26.s, z15.s\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                        "mul z27.s, p7/m, z27.s, z15.s\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
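+                        // Main K loop (unrolled x2): each iteration consumes two
+                        // 16-byte quadwords of A per row; each sdot accumulates
+                        // one 4-byte group of A against one B vector.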
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "sdot z24.s, z8.b, z2.b[0]\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "sdot z26.s, z10.b, z2.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "sdot z27.s, z11.b, z2.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "sdot z24.s, z12.b, z2.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "sdot z25.s, z13.b, z2.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "sdot z26.s, z14.b, z2.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "sdot z27.s, z15.b, z2.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z24.s, z8.b, z2.b[2]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "sdot z25.s, z9.b, z2.b[2]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "sdot z26.s, z10.b, z2.b[2]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "sdot z27.s, z11.b, z2.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "sdot z24.s, z12.b, z2.b[3]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z21.s, z13.b, z1.b[3]\n"
+                        "sdot z25.s, z13.b, z2.b[3]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "sdot z26.s, z14.b, z2.b[3]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "sdot z23.s, z15.b, z1.b[3]\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+                        "sdot z27.s, z15.b, z2.b[3]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
+                        "sdot z20.s, z8.b, z5.b[0]\n"
+                        "sdot z24.s, z8.b, z6.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "sdot z21.s, z9.b, z5.b[0]\n"
+                        "sdot z25.s, z9.b, z6.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "sdot z26.s, z10.b, z6.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "sdot z23.s, z11.b, z5.b[0]\n"
+                        "sdot z27.s, z11.b, z6.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "sdot z20.s, z12.b, z5.b[1]\n"
+                        "sdot z24.s, z12.b, z6.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "sdot z21.s, z13.b, z5.b[1]\n"
+                        "sdot z25.s, z13.b, z6.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "sdot z22.s, z14.b, z5.b[1]\n"
+                        "sdot z26.s, z14.b, z6.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "sdot z23.s, z15.b, z5.b[1]\n"
+                        "sdot z27.s, z15.b, z6.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "sdot z20.s, z8.b, z5.b[2]\n"
+                        "sdot z24.s, z8.b, z6.b[2]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "sdot z21.s, z9.b, z5.b[2]\n"
+                        "sdot z25.s, z9.b, z6.b[2]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "sdot z22.s, z10.b, z5.b[2]\n"
+                        "sdot z26.s, z10.b, z6.b[2]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "sdot z23.s, z11.b, z5.b[2]\n"
+                        "sdot z27.s, z11.b, z6.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "sdot z20.s, z12.b, z5.b[3]\n"
+                        "sdot z24.s, z12.b, z6.b[3]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "sdot z21.s, z13.b, z5.b[3]\n"
+                        "sdot z25.s, z13.b, z6.b[3]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "sdot z22.s, z14.b, z5.b[3]\n"
+                        "sdot z26.s, z14.b, z6.b[3]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
+                        "sdot z23.s, z15.b, z5.b[3]\n"
+                        "sdot z27.s, z15.b, z6.b[3]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
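+                        // K tail: if another full A quadword remains (%[regs]),
+                        // consume it here together with the predicated leftover
+                        // bytes; otherwise take the short path at 4:.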
+                        "cbz %[regs], 4f\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "sdot z24.s, z8.b, z2.b[0]\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "sdot z26.s, z10.b, z2.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "sdot z27.s, z11.b, z2.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "sdot z24.s, z12.b, z2.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "sdot z25.s, z13.b, z2.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "sdot z26.s, z14.b, z2.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "sdot z27.s, z15.b, z2.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z24.s, z8.b, z2.b[2]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "sdot z25.s, z9.b, z2.b[2]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "sdot z26.s, z10.b, z2.b[2]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "sdot z27.s, z11.b, z2.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "sdot z24.s, z12.b, z2.b[3]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z21.s, z13.b, z1.b[3]\n"
+                        "sdot z25.s, z13.b, z2.b[3]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "sdot z26.s, z14.b, z2.b[3]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+                        "sdot z23.s, z15.b, z1.b[3]\n"
+                        "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+                        "sdot z27.s, z15.b, z2.b[3]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+                        "sdot z20.s, z8.b, z5.b[0]\n"
+                        "sdot z24.s, z8.b, z6.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "sdot z21.s, z9.b, z5.b[0]\n"
+                        "sdot z25.s, z9.b, z6.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "sdot z26.s, z10.b, z6.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "sdot z23.s, z11.b, z5.b[0]\n"
+                        "sdot z27.s, z11.b, z6.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "sdot z20.s, z12.b, z5.b[1]\n"
+                        "sdot z24.s, z12.b, z6.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "sdot z21.s, z13.b, z5.b[1]\n"
+                        "sdot z25.s, z13.b, z6.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "sdot z22.s, z14.b, z5.b[1]\n"
+                        "sdot z26.s, z14.b, z6.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "sdot z23.s, z15.b, z5.b[1]\n"
+                        "sdot z27.s, z15.b, z6.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "sdot z20.s, z8.b, z5.b[2]\n"
+                        "sdot z24.s, z8.b, z6.b[2]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "sdot z21.s, z9.b, z5.b[2]\n"
+                        "sdot z25.s, z9.b, z6.b[2]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "sdot z22.s, z10.b, z5.b[2]\n"
+                        "sdot z26.s, z10.b, z6.b[2]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "sdot z23.s, z11.b, z5.b[2]\n"
+                        "sdot z27.s, z11.b, z6.b[2]\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "sdot z20.s, z12.b, z5.b[3]\n"
+                        "sdot z24.s, z12.b, z6.b[3]\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "sdot z21.s, z13.b, z5.b[3]\n"
+                        "sdot z25.s, z13.b, z6.b[3]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "sdot z22.s, z14.b, z5.b[3]\n"
+                        "sdot z26.s, z14.b, z6.b[3]\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
+                        "sdot z23.s, z15.b, z5.b[3]\n"
+                        "sdot z27.s, z15.b, z6.b[3]\n"
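+                        // Remaining 4-byte K groups: one sdot pass per group,
+                        // exiting to the store code at 5: when %[blocks] runs out.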
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "sdot z24.s, z8.b, z2.b[0]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "sdot z26.s, z10.b, z2.b[0]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "sdot z27.s, z11.b, z2.b[0]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "sdot z24.s, z12.b, z2.b[1]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "sdot z25.s, z13.b, z2.b[1]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "sdot z26.s, z14.b, z2.b[1]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "sdot z27.s, z15.b, z2.b[1]\n"
+                        "b.eq 5f\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z24.s, z8.b, z2.b[2]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "sdot z25.s, z9.b, z2.b[2]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "sdot z26.s, z10.b, z2.b[2]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "sdot z27.s, z11.b, z2.b[2]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "sdot z24.s, z12.b, z2.b[3]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z21.s, z13.b, z1.b[3]\n"
+                        "sdot z25.s, z13.b, z2.b[3]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "sdot z26.s, z14.b, z2.b[3]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "sdot z23.s, z15.b, z1.b[3]\n"
+                        "sdot z27.s, z15.b, z2.b[3]\n"
+                        "b 5f\n"
+                        "4:\n"
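+                        // Short epilogue: finish the already-loaded quadword and
+                        // pick up the K leftovers under predicate p6.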
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+                        "sdot z24.s, z8.b, z2.b[0]\n"
+                        "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z6.b, p6/z, [a_ptr2]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "sdot z26.s, z10.b, z2.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "sdot z27.s, z11.b, z2.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "sdot z24.s, z12.b, z2.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "sdot z25.s, z13.b, z2.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "sdot z26.s, z14.b, z2.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "sdot z27.s, z15.b, z2.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z24.s, z8.b, z2.b[2]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "sdot z25.s, z9.b, z2.b[2]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "sdot z26.s, z10.b, z2.b[2]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "sdot z27.s, z11.b, z2.b[2]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "sdot z24.s, z12.b, z2.b[3]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z21.s, z13.b, z1.b[3]\n"
+                        "sdot z25.s, z13.b, z2.b[3]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "sdot z26.s, z14.b, z2.b[3]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "sdot z23.s, z15.b, z1.b[3]\n"
+                        "sdot z27.s, z15.b, z2.b[3]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "sdot z20.s, z8.b, z5.b[0]\n"
+                        "sdot z24.s, z8.b, z6.b[0]\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "sdot z21.s, z9.b, z5.b[0]\n"
+                        "sdot z25.s, z9.b, z6.b[0]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "sdot z26.s, z10.b, z6.b[0]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "sdot z23.s, z11.b, z5.b[0]\n"
+                        "sdot z27.s, z11.b, z6.b[0]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z20.s, z12.b, z5.b[1]\n"
+                        "sdot z24.s, z12.b, z6.b[1]\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "sdot z21.s, z13.b, z5.b[1]\n"
+                        "sdot z25.s, z13.b, z6.b[1]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "sdot z22.s, z14.b, z5.b[1]\n"
+                        "sdot z26.s, z14.b, z6.b[1]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "sdot z23.s, z15.b, z5.b[1]\n"
+                        "sdot z27.s, z15.b, z6.b[1]\n"
+                        "b.eq 5f\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z20.s, z8.b, z5.b[2]\n"
+                        "sdot z24.s, z8.b, z6.b[2]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "sdot z21.s, z9.b, z5.b[2]\n"
+                        "sdot z25.s, z9.b, z6.b[2]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "sdot z22.s, z10.b, z5.b[2]\n"
+                        "sdot z26.s, z10.b, z6.b[2]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "sdot z23.s, z11.b, z5.b[2]\n"
+                        "sdot z27.s, z11.b, z6.b[2]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "sdot z20.s, z12.b, z5.b[3]\n"
+                        "sdot z24.s, z12.b, z6.b[3]\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "sdot z21.s, z13.b, z5.b[3]\n"
+                        "sdot z25.s, z13.b, z6.b[3]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "sdot z22.s, z14.b, z5.b[3]\n"
+                        "sdot z26.s, z14.b, z6.b[3]\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
+                        "sdot z23.s, z15.b, z5.b[3]\n"
+                        "sdot z27.s, z15.b, z6.b[3]\n"
+                        "5:\n"
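+                        // Store the 3x4-vector accumulator tile to C under the
+                        // column predicates p0-p3.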
+                        "st1w z16.s, p0, [%[c_ptr0]]\n"
+                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        "st1w z20.s, p0, [c_ptr1]\n"
+                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+                        "st1w z24.s, p0, [c_ptr2]\n"
+                        "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+                        "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+                        "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
+                    );
+                    break;
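+                // Full 4-row case (also the default): three extra A/C row-pointer
+                // pairs in x0-x5 and the full accumulator set z16-z31.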
+                default:
+                case 4:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "a_ptr3 .req X2\n"
+                        "c_ptr1 .req X3\n"
+                        "c_ptr2 .req X4\n"
+                        "c_ptr3 .req X5\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "add a_ptr3, a_ptr2, %[lda]\n"
+                        "add c_ptr3, c_ptr2, %[ldc]\n"
+                        "whilelt p6.b, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "ptrue p7.b\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p2.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p3.s, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.s, #0\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.s, #0\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "mov z18.s, #0\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                        "mov z19.s, #0\n"
+                        "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                        "mov z20.s, #0\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "mov z21.s, #0\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z22.s, #0\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "mov z23.s, #0\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z24.s, #0\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "mov z25.s, #0\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "mov z26.s, #0\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "mov z27.s, #0\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "mov z28.s, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "mov z29.s, #0\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "mov z30.s, #0\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "mov z31.s, #0\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
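+                        // Generic-beta path, 4-row variant: scale the full
+                        // 4x4-vector C tile by beta.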
+                        "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "mul z16.s, p7/m, z16.s, z15.s\n"
+                        "ld1w z20.s, p0/z, [c_ptr1]\n"
+                        "mul z17.s, p7/m, z17.s, z15.s\n"
+                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "mul z18.s, p7/m, z18.s, z15.s\n"
+                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "mul z19.s, p7/m, z19.s, z15.s\n"
+                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "mul z20.s, p7/m, z20.s, z15.s\n"
+                        "ld1w z24.s, p0/z, [c_ptr2]\n"
+                        "mul z21.s, p7/m, z21.s, z15.s\n"
+                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "mul z22.s, p7/m, z22.s, z15.s\n"
+                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+                        "mul z23.s, p7/m, z23.s, z15.s\n"
+                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+                        "mul z24.s, p7/m, z24.s, z15.s\n"
+                        "ld1w z28.s, p0/z, [c_ptr3]\n"
+                        "mul z25.s, p7/m, z25.s, z15.s\n"
+                        "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+                        "mul z26.s, p7/m, z26.s, z15.s\n"
+                        "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
+                        "mul z27.s, p7/m, z27.s, z15.s\n"
+                        "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
+                        "mul z28.s, p7/m, z28.s, z15.s\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mul z29.s, p7/m, z29.s, z15.s\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "mul z30.s, p7/m, z30.s, z15.s\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                        "mul z31.s, p7/m, z31.s, z15.s\n"
+                        "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
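+                        // Main K loop, 4-row variant: as in case 3 but with a
+                        // fourth A row (z3/z7) feeding accumulators z28-z31.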
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "sdot z24.s, z8.b, z2.b[0]\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "sdot z28.s, z8.b, z3.b[0]\n"
+                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z7.b, p7/z, [a_ptr3]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "sdot z29.s, z9.b, z3.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "sdot z26.s, z10.b, z2.b[0]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "sdot z30.s, z10.b, z3.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "add a_ptr3, a_ptr3, #0x20\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "sdot z27.s, z11.b, z2.b[0]\n"
+                        "sdot z31.s, z11.b, z3.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "sdot z24.s, z12.b, z2.b[1]\n"
+                        "sdot z28.s, z12.b, z3.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "sdot z25.s, z13.b, z2.b[1]\n"
+                        "sdot z29.s, z13.b, z3.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "sdot z26.s, z14.b, z2.b[1]\n"
+                        "sdot z30.s, z14.b, z3.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "sdot z27.s, z15.b, z2.b[1]\n"
+                        "sdot z31.s, z15.b, z3.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z24.s, z8.b, z2.b[2]\n"
+                        "sdot z28.s, z8.b, z3.b[2]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "sdot z25.s, z9.b, z2.b[2]\n"
+                        "sdot z29.s, z9.b, z3.b[2]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "sdot z26.s, z10.b, z2.b[2]\n"
+                        "sdot z30.s, z10.b, z3.b[2]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "sdot z27.s, z11.b, z2.b[2]\n"
+                        "sdot z31.s, z11.b, z3.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "sdot z24.s, z12.b, z2.b[3]\n"
+                        "sdot z28.s, z12.b, z3.b[3]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z21.s, z13.b, z1.b[3]\n"
+                        "sdot z25.s, z13.b, z2.b[3]\n"
+                        "sdot z29.s, z13.b, z3.b[3]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "sdot z26.s, z14.b, z2.b[3]\n"
+                        "sdot z30.s, z14.b, z3.b[3]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "sdot z23.s, z15.b, z1.b[3]\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+                        "sdot z27.s, z15.b, z2.b[3]\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
+                        "sdot z31.s, z15.b, z3.b[3]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "ld1rqb z3.b, p7/z, [a_ptr3, #-0x10]\n"
+                        "sdot z20.s, z8.b, z5.b[0]\n"
+                        "sdot z24.s, z8.b, z6.b[0]\n"
+                        "sdot z28.s, z8.b, z7.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "sdot z21.s, z9.b, z5.b[0]\n"
+                        "sdot z25.s, z9.b, z6.b[0]\n"
+                        "sdot z29.s, z9.b, z7.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "sdot z26.s, z10.b, z6.b[0]\n"
+                        "sdot z30.s, z10.b, z7.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "sdot z23.s, z11.b, z5.b[0]\n"
+                        "sdot z27.s, z11.b, z6.b[0]\n"
+                        "sdot z31.s, z11.b, z7.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "sdot z20.s, z12.b, z5.b[1]\n"
+                        "sdot z24.s, z12.b, z6.b[1]\n"
+                        "sdot z28.s, z12.b, z7.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "sdot z21.s, z13.b, z5.b[1]\n"
+                        "sdot z25.s, z13.b, z6.b[1]\n"
+                        "sdot z29.s, z13.b, z7.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "sdot z22.s, z14.b, z5.b[1]\n"
+                        "sdot z26.s, z14.b, z6.b[1]\n"
+                        "sdot z30.s, z14.b, z7.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "sdot z23.s, z15.b, z5.b[1]\n"
+                        "sdot z27.s, z15.b, z6.b[1]\n"
+                        "sdot z31.s, z15.b, z7.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "sdot z20.s, z8.b, z5.b[2]\n"
+                        "sdot z24.s, z8.b, z6.b[2]\n"
+                        "sdot z28.s, z8.b, z7.b[2]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "sdot z21.s, z9.b, z5.b[2]\n"
+                        "sdot z25.s, z9.b, z6.b[2]\n"
+                        "sdot z29.s, z9.b, z7.b[2]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "sdot z22.s, z10.b, z5.b[2]\n"
+                        "sdot z26.s, z10.b, z6.b[2]\n"
+                        "sdot z30.s, z10.b, z7.b[2]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "sdot z23.s, z11.b, z5.b[2]\n"
+                        "sdot z27.s, z11.b, z6.b[2]\n"
+                        "sdot z31.s, z11.b, z7.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "sdot z20.s, z12.b, z5.b[3]\n"
+                        "sdot z24.s, z12.b, z6.b[3]\n"
+                        "sdot z28.s, z12.b, z7.b[3]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "sdot z21.s, z13.b, z5.b[3]\n"
+                        "sdot z25.s, z13.b, z6.b[3]\n"
+                        "sdot z29.s, z13.b, z7.b[3]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "sdot z22.s, z14.b, z5.b[3]\n"
+                        "sdot z26.s, z14.b, z6.b[3]\n"
+                        "sdot z30.s, z14.b, z7.b[3]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
+                        "sdot z23.s, z15.b, z5.b[3]\n"
+                        "sdot z27.s, z15.b, z6.b[3]\n"
+                        "sdot z31.s, z15.b, z7.b[3]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
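+                        // K tail, 4-row variant: same %[regs]/%[blocks] structure
+                        // as case 3.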
+                        "cbz %[regs], 4f\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "sdot z24.s, z8.b, z2.b[0]\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "sdot z28.s, z8.b, z3.b[0]\n"
+                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z7.b, p7/z, [a_ptr3]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
+                        "sdot z29.s, z9.b, z3.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "sdot z26.s, z10.b, z2.b[0]\n"
+                        "sdot z30.s, z10.b, z3.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "sdot z27.s, z11.b, z2.b[0]\n"
+                        "sdot z31.s, z11.b, z3.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "sdot z24.s, z12.b, z2.b[1]\n"
+                        "sdot z28.s, z12.b, z3.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "sdot z25.s, z13.b, z2.b[1]\n"
+                        "sdot z29.s, z13.b, z3.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "sdot z26.s, z14.b, z2.b[1]\n"
+                        "sdot z30.s, z14.b, z3.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "sdot z27.s, z15.b, z2.b[1]\n"
+                        "sdot z31.s, z15.b, z3.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z24.s, z8.b, z2.b[2]\n"
+                        "sdot z28.s, z8.b, z3.b[2]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "sdot z25.s, z9.b, z2.b[2]\n"
+                        "sdot z29.s, z9.b, z3.b[2]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "sdot z26.s, z10.b, z2.b[2]\n"
+                        "sdot z30.s, z10.b, z3.b[2]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "sdot z27.s, z11.b, z2.b[2]\n"
+                        "sdot z31.s, z11.b, z3.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "sdot z24.s, z12.b, z2.b[3]\n"
+                        "sdot z28.s, z12.b, z3.b[3]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z21.s, z13.b, z1.b[3]\n"
+                        "sdot z25.s, z13.b, z2.b[3]\n"
+                        "sdot z29.s, z13.b, z3.b[3]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "sdot z26.s, z14.b, z2.b[3]\n"
+                        "sdot z30.s, z14.b, z3.b[3]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+                        "sdot z23.s, z15.b, z1.b[3]\n"
+                        "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+                        "sdot z27.s, z15.b, z2.b[3]\n"
+                        "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+                        "sdot z31.s, z15.b, z3.b[3]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+                        "sdot z20.s, z8.b, z5.b[0]\n"
+                        "sdot z24.s, z8.b, z6.b[0]\n"
+                        "sdot z28.s, z8.b, z7.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "sdot z21.s, z9.b, z5.b[0]\n"
+                        "sdot z25.s, z9.b, z6.b[0]\n"
+                        "sdot z29.s, z9.b, z7.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "sdot z26.s, z10.b, z6.b[0]\n"
+                        "sdot z30.s, z10.b, z7.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "sdot z23.s, z11.b, z5.b[0]\n"
+                        "sdot z27.s, z11.b, z6.b[0]\n"
+                        "sdot z31.s, z11.b, z7.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "sdot z20.s, z12.b, z5.b[1]\n"
+                        "sdot z24.s, z12.b, z6.b[1]\n"
+                        "sdot z28.s, z12.b, z7.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "sdot z21.s, z13.b, z5.b[1]\n"
+                        "sdot z25.s, z13.b, z6.b[1]\n"
+                        "sdot z29.s, z13.b, z7.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "sdot z22.s, z14.b, z5.b[1]\n"
+                        "sdot z26.s, z14.b, z6.b[1]\n"
+                        "sdot z30.s, z14.b, z7.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "sdot z23.s, z15.b, z5.b[1]\n"
+                        "sdot z27.s, z15.b, z6.b[1]\n"
+                        "sdot z31.s, z15.b, z7.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "sdot z20.s, z8.b, z5.b[2]\n"
+                        "sdot z24.s, z8.b, z6.b[2]\n"
+                        "sdot z28.s, z8.b, z7.b[2]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "sdot z21.s, z9.b, z5.b[2]\n"
+                        "sdot z25.s, z9.b, z6.b[2]\n"
+                        "sdot z29.s, z9.b, z7.b[2]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "sdot z22.s, z10.b, z5.b[2]\n"
+                        "sdot z26.s, z10.b, z6.b[2]\n"
+                        "sdot z30.s, z10.b, z7.b[2]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "sdot z23.s, z11.b, z5.b[2]\n"
+                        "sdot z27.s, z11.b, z6.b[2]\n"
+                        "sdot z31.s, z11.b, z7.b[2]\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "sdot z20.s, z12.b, z5.b[3]\n"
+                        "sdot z24.s, z12.b, z6.b[3]\n"
+                        "sdot z28.s, z12.b, z7.b[3]\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "sdot z21.s, z13.b, z5.b[3]\n"
+                        "sdot z25.s, z13.b, z6.b[3]\n"
+                        "sdot z29.s, z13.b, z7.b[3]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "sdot z22.s, z14.b, z5.b[3]\n"
+                        "sdot z26.s, z14.b, z6.b[3]\n"
+                        "sdot z30.s, z14.b, z7.b[3]\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
+                        "sdot z23.s, z15.b, z5.b[3]\n"
+                        "sdot z27.s, z15.b, z6.b[3]\n"
+                        "sdot z31.s, z15.b, z7.b[3]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "sdot z24.s, z8.b, z2.b[0]\n"
+                        "sdot z28.s, z8.b, z3.b[0]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
+                        "sdot z29.s, z9.b, z3.b[0]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "sdot z26.s, z10.b, z2.b[0]\n"
+                        "sdot z30.s, z10.b, z3.b[0]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "sdot z27.s, z11.b, z2.b[0]\n"
+                        "sdot z31.s, z11.b, z3.b[0]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "sdot z24.s, z12.b, z2.b[1]\n"
+                        "sdot z28.s, z12.b, z3.b[1]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "sdot z25.s, z13.b, z2.b[1]\n"
+                        "sdot z29.s, z13.b, z3.b[1]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "sdot z26.s, z14.b, z2.b[1]\n"
+                        "sdot z30.s, z14.b, z3.b[1]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "sdot z27.s, z15.b, z2.b[1]\n"
+                        "sdot z31.s, z15.b, z3.b[1]\n"
+                        "b.eq 5f\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z24.s, z8.b, z2.b[2]\n"
+                        "sdot z28.s, z8.b, z3.b[2]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "sdot z25.s, z9.b, z2.b[2]\n"
+                        "sdot z29.s, z9.b, z3.b[2]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "sdot z26.s, z10.b, z2.b[2]\n"
+                        "sdot z30.s, z10.b, z3.b[2]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "sdot z27.s, z11.b, z2.b[2]\n"
+                        "sdot z31.s, z11.b, z3.b[2]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "sdot z24.s, z12.b, z2.b[3]\n"
+                        "sdot z28.s, z12.b, z3.b[3]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z21.s, z13.b, z1.b[3]\n"
+                        "sdot z25.s, z13.b, z2.b[3]\n"
+                        "sdot z29.s, z13.b, z3.b[3]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "sdot z26.s, z14.b, z2.b[3]\n"
+                        "sdot z30.s, z14.b, z3.b[3]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "sdot z23.s, z15.b, z1.b[3]\n"
+                        "sdot z27.s, z15.b, z2.b[3]\n"
+                        "sdot z31.s, z15.b, z3.b[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+                        "sdot z24.s, z8.b, z2.b[0]\n"
+                        "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+                        "sdot z28.s, z8.b, z3.b[0]\n"
+                        "ld1rqb z6.b, p6/z, [a_ptr2]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z7.b, p6/z, [a_ptr3]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
+                        "sdot z29.s, z9.b, z3.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "sdot z26.s, z10.b, z2.b[0]\n"
+                        "sdot z30.s, z10.b, z3.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "sdot z27.s, z11.b, z2.b[0]\n"
+                        "sdot z31.s, z11.b, z3.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "sdot z24.s, z12.b, z2.b[1]\n"
+                        "sdot z28.s, z12.b, z3.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "sdot z25.s, z13.b, z2.b[1]\n"
+                        "sdot z29.s, z13.b, z3.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "sdot z26.s, z14.b, z2.b[1]\n"
+                        "sdot z30.s, z14.b, z3.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "sdot z27.s, z15.b, z2.b[1]\n"
+                        "sdot z31.s, z15.b, z3.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z24.s, z8.b, z2.b[2]\n"
+                        "sdot z28.s, z8.b, z3.b[2]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "sdot z25.s, z9.b, z2.b[2]\n"
+                        "sdot z29.s, z9.b, z3.b[2]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "sdot z26.s, z10.b, z2.b[2]\n"
+                        "sdot z30.s, z10.b, z3.b[2]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "sdot z27.s, z11.b, z2.b[2]\n"
+                        "sdot z31.s, z11.b, z3.b[2]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "sdot z24.s, z12.b, z2.b[3]\n"
+                        "sdot z28.s, z12.b, z3.b[3]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z21.s, z13.b, z1.b[3]\n"
+                        "sdot z25.s, z13.b, z2.b[3]\n"
+                        "sdot z29.s, z13.b, z3.b[3]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "sdot z26.s, z14.b, z2.b[3]\n"
+                        "sdot z30.s, z14.b, z3.b[3]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "sdot z23.s, z15.b, z1.b[3]\n"
+                        "sdot z27.s, z15.b, z2.b[3]\n"
+                        "sdot z31.s, z15.b, z3.b[3]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "sdot z20.s, z8.b, z5.b[0]\n"
+                        "sdot z24.s, z8.b, z6.b[0]\n"
+                        "sdot z28.s, z8.b, z7.b[0]\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "sdot z21.s, z9.b, z5.b[0]\n"
+                        "sdot z25.s, z9.b, z6.b[0]\n"
+                        "sdot z29.s, z9.b, z7.b[0]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "sdot z26.s, z10.b, z6.b[0]\n"
+                        "sdot z30.s, z10.b, z7.b[0]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "sdot z23.s, z11.b, z5.b[0]\n"
+                        "sdot z27.s, z11.b, z6.b[0]\n"
+                        "sdot z31.s, z11.b, z7.b[0]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z20.s, z12.b, z5.b[1]\n"
+                        "sdot z24.s, z12.b, z6.b[1]\n"
+                        "sdot z28.s, z12.b, z7.b[1]\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "sdot z21.s, z13.b, z5.b[1]\n"
+                        "sdot z25.s, z13.b, z6.b[1]\n"
+                        "sdot z29.s, z13.b, z7.b[1]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "sdot z22.s, z14.b, z5.b[1]\n"
+                        "sdot z26.s, z14.b, z6.b[1]\n"
+                        "sdot z30.s, z14.b, z7.b[1]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "sdot z23.s, z15.b, z5.b[1]\n"
+                        "sdot z27.s, z15.b, z6.b[1]\n"
+                        "sdot z31.s, z15.b, z7.b[1]\n"
+                        "b.eq 5f\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z20.s, z8.b, z5.b[2]\n"
+                        "sdot z24.s, z8.b, z6.b[2]\n"
+                        "sdot z28.s, z8.b, z7.b[2]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "sdot z21.s, z9.b, z5.b[2]\n"
+                        "sdot z25.s, z9.b, z6.b[2]\n"
+                        "sdot z29.s, z9.b, z7.b[2]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "sdot z22.s, z10.b, z5.b[2]\n"
+                        "sdot z26.s, z10.b, z6.b[2]\n"
+                        "sdot z30.s, z10.b, z7.b[2]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "sdot z23.s, z11.b, z5.b[2]\n"
+                        "sdot z27.s, z11.b, z6.b[2]\n"
+                        "sdot z31.s, z11.b, z7.b[2]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "sdot z20.s, z12.b, z5.b[3]\n"
+                        "sdot z24.s, z12.b, z6.b[3]\n"
+                        "sdot z28.s, z12.b, z7.b[3]\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "sdot z21.s, z13.b, z5.b[3]\n"
+                        "sdot z25.s, z13.b, z6.b[3]\n"
+                        "sdot z29.s, z13.b, z7.b[3]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "sdot z22.s, z14.b, z5.b[3]\n"
+                        "sdot z26.s, z14.b, z6.b[3]\n"
+                        "sdot z30.s, z14.b, z7.b[3]\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
+                        "sdot z23.s, z15.b, z5.b[3]\n"
+                        "sdot z27.s, z15.b, z6.b[3]\n"
+                        "sdot z31.s, z15.b, z7.b[3]\n"
+                        "5:\n"
+                        "st1w z16.s, p0, [%[c_ptr0]]\n"
+                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        "st1w z20.s, p0, [c_ptr1]\n"
+                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+                        "st1w z24.s, p0, [c_ptr2]\n"
+                        "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+                        "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+                        "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+                        "st1w z28.s, p0, [c_ptr3]\n"
+                        "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
+                        "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
+                        "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq a_ptr3\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        ".unreq c_ptr3\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+                    );
+                    break;
+            }
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp
new file mode 100644
index 0000000..2701a9e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+#include <cstdint>
+#include "../std_transforms_sve.hpp"
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
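+// Arguments: A (M x K, row stride lda), the pre-arranged B operand, C (M x N,
+// row stride ldc), beta (scales any existing contents of C), then M, N and K.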
+void sve_hybrid_u8u32_dot_4VLx4(const uint8_t *, int, const uint8_t *, uint32_t *, int, uint32_t, int, int, int);
+
+class hybrid_u8u32_dot_4VLx4
+{
+public:
+    typedef uint8_t operand_type;
+    typedef uint32_t result_type;
+
+    typedef void (*kern_type)(const uint8_t *, int, const uint8_t *, uint32_t *, int, uint32_t, int, int, int);
+
+    /* Kernel blocking parameters */
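+    // "4VLx4" = an output block 4 rows tall and 4 vector-lengths of uint32_t
+    // wide, with K consumed in groups of 4 to match the udot granularity.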
+    static unsigned int out_height()
+    {
+        return 4;
+    }
+
+    static unsigned int out_width()
+    {
+        return get_vector_length<uint32_t>() * 4;
+    }
+
+    static unsigned int k_unroll()
+    {
+        return 4;
+    }
+
+    StdTransformsSVE<operand_type, result_type, 4, 4, 4> transforms = {};
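+    // The standard SVE transforms supply the operand rearrangement matching this blocking.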
+
+    // Default to the generic kernel
+    kern_type kernel = sve_hybrid_u8u32_dot_4VLx4;
+
+    hybrid_u8u32_dot_4VLx4(const CPUInfo *ci)
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp
new file mode 100644
index 0000000..d34d0e5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp
@@ -0,0 +1,2150 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_hybrid_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, uint32_t beta, int M, int N, int K) {
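+    // beta == 0 takes the fast path that zero-initialises the accumulators
+    // instead of loading and scaling the existing contents of C.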
+    const long beta0 = (beta == 0u);
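+    // K rounded up to the udot granularity of 4: each output column of the pre-blocked B occupies K_stride bytes.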
+    const int K_stride = ((K + 3) / 4) * 4;
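+    // The main loop consumes 32 values of K per iteration (two 16-byte A loads);
+    // the decomposition assumes K is at least 16, keeping loops_count non-negative.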
+    const long loops_count = ((K + 16) / 32) - 1;
+    K -= loops_count * 32;
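+    // One buffered 16-value block always remains; regs_count flags a second full block before the predicated tail.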
+    const long regs_count = (K / 16) - 1;
+    K -= (regs_count + 1) * 16;
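+    // Fewer than 16 values remain; they are processed 4 at a time under a byte predicate.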
+    const long leftovers = K;
+    const long blocks_count = (K + 3) / 4;
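+    // e.g. K = 50: loops_count = 1 (32 values), regs_count = 0, one buffered
+    // 16-value pass, and leftovers = 2 covered by blocks_count = 1.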
+
+    for (int y=0; y<M; y+=4) {
+        const uint8_t * const a_ptr0_base = A + (y * lda);
+        const unsigned long ldab = lda * sizeof(uint8_t);
+
+        uint32_t *c_ptr0 = C + (y * ldc);
+        const unsigned long ldcb = ldc * sizeof(uint32_t);
+
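+        // Step across the output columns four vector-lengths at a time;
+        // 'width' predicates the final partial block.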
+        for (int x0=0; x0<N; x0+=(4 * get_vector_length<uint32_t>())) {
+            const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<uint32_t>()));
+            const uint32_t *betaptr = &beta;
+            long loops = loops_count;
+            long regs = regs_count;
+            long temp = 0;
+            long blocks = blocks_count;
+            const uint8_t *a_ptr0 = a_ptr0_base;
+            const uint8_t *b_ptr0 = B + (K_stride * x0);
+
+            switch(M-y) {
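+                // Dedicated code paths for the number of rows left in this block.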
+                case 1:
+                    __asm __volatile (
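+                        // p6 predicates the K leftovers; p0-p3 predicate the four output vectors against 'width'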
+                        "whilelt p6.b, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "ptrue p7.b\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p2.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p3.s, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.s, #0\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.s, #0\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "mov z18.s, #0\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z19.s, #0\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "mul z16.s, p7/m, z16.s, z15.s\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mul z17.s, p7/m, z17.s, z15.s\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "mul z18.s, p7/m, z18.s, z15.s\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mul z19.s, p7/m, z19.s, z15.s\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "b.eq 5f\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "b.eq 5f\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "5:\n"
+                        "st1w z16.s, p0, [%[c_ptr0]]\n"
+                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                    );
+                    break;
+                case 2:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "c_ptr1 .req X1\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "whilelt p6.b, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "ptrue p7.b\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p2.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p3.s, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.s, #0\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.s, #0\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "mov z18.s, #0\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "mov z19.s, #0\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z20.s, #0\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "mov z21.s, #0\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z22.s, #0\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "mov z23.s, #0\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "mul z16.s, p7/m, z16.s, z15.s\n"
+                        "ld1w z20.s, p0/z, [c_ptr1]\n"
+                        "mul z17.s, p7/m, z17.s, z15.s\n"
+                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "mul z18.s, p7/m, z18.s, z15.s\n"
+                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "mul z19.s, p7/m, z19.s, z15.s\n"
+                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "mul z20.s, p7/m, z20.s, z15.s\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mul z21.s, p7/m, z21.s, z15.s\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "mul z22.s, p7/m, z22.s, z15.s\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "mul z23.s, p7/m, z23.s, z15.s\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z21.s, z13.b, z1.b[3]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "udot z23.s, z15.b, z1.b[3]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+                        "udot z20.s, z8.b, z5.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "udot z21.s, z9.b, z5.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "udot z23.s, z11.b, z5.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "udot z20.s, z12.b, z5.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "udot z21.s, z13.b, z5.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "udot z22.s, z14.b, z5.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "udot z23.s, z15.b, z5.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "udot z20.s, z8.b, z5.b[2]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "udot z21.s, z9.b, z5.b[2]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "udot z22.s, z10.b, z5.b[2]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "udot z23.s, z11.b, z5.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "udot z20.s, z12.b, z5.b[3]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "udot z21.s, z13.b, z5.b[3]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "udot z22.s, z14.b, z5.b[3]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "udot z23.s, z15.b, z5.b[3]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z21.s, z13.b, z1.b[3]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+                        "udot z23.s, z15.b, z1.b[3]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+                        "udot z20.s, z8.b, z5.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "udot z21.s, z9.b, z5.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "udot z23.s, z11.b, z5.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "udot z20.s, z12.b, z5.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "udot z21.s, z13.b, z5.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "udot z22.s, z14.b, z5.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "udot z23.s, z15.b, z5.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "udot z20.s, z8.b, z5.b[2]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "udot z21.s, z9.b, z5.b[2]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "udot z22.s, z10.b, z5.b[2]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "udot z23.s, z11.b, z5.b[2]\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "udot z20.s, z12.b, z5.b[3]\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "udot z21.s, z13.b, z5.b[3]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "udot z22.s, z14.b, z5.b[3]\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "udot z23.s, z15.b, z5.b[3]\n"
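+                        // Any remaining 4-byte groups of K (counted by %[blocks]) are
+                        // handled by the unrolled chain below; each b.eq 5f exits as
+                        // soon as the count is exhausted.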
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "b.eq 5f\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z21.s, z13.b, z1.b[3]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "udot z23.s, z15.b, z1.b[3]\n"
+                        "b 5f\n"
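+                        // Label 4: no full quad pair left.  The quads loaded in the
+                        // prologue are consumed here and the final partial quads come
+                        // in via the p6 leftover predicate.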
+                        "4:\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z21.s, z13.b, z1.b[3]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "udot z23.s, z15.b, z1.b[3]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "udot z20.s, z8.b, z5.b[0]\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "udot z21.s, z9.b, z5.b[0]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "udot z23.s, z11.b, z5.b[0]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z20.s, z12.b, z5.b[1]\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "udot z21.s, z13.b, z5.b[1]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "udot z22.s, z14.b, z5.b[1]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "udot z23.s, z15.b, z5.b[1]\n"
+                        "b.eq 5f\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z20.s, z8.b, z5.b[2]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "udot z21.s, z9.b, z5.b[2]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "udot z22.s, z10.b, z5.b[2]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "udot z23.s, z11.b, z5.b[2]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "udot z20.s, z12.b, z5.b[3]\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "udot z21.s, z13.b, z5.b[3]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "udot z22.s, z14.b, z5.b[3]\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "udot z23.s, z15.b, z5.b[3]\n"
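+                        // Label 5: write the 2x4VL accumulator tile back to C, one
+                        // row per pointer, masked by the width predicates p0-p3.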
+                        "5:\n"
+                        "st1w z16.s, p0, [%[c_ptr0]]\n"
+                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        "st1w z20.s, p0, [c_ptr1]\n"
+                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq c_ptr1\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
+                    );
+                    break;
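+                // The 3-row block below repeats the 2-row structure with one more
+                // A stream (a_ptr2, z2/z6) and one more accumulator row (z24-z27).
+                // Roughly, each "udot zD.s, zB.b, zA.b[i]" accumulates a four-way
+                // u8*u8 dot product into every 32-bit lane of zD; because A is
+                // loaded with ld1rqb (one 16-byte quad replicated into each 128-bit
+                // segment), this is, per 32-bit lane l:
+                //     D[l] += B[4*l+0]*A[4*i+0] + ... + B[4*l+3]*A[4*i+3];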
+                case 3:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "c_ptr1 .req X2\n"
+                        "c_ptr2 .req X3\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
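+                        // p6 masks the leftover K bytes; p0-p3 mask four consecutive
+                        // vector-lengths of the output width (the 4VL blocking), with
+                        // %[temp] stepped by incw between the whilelt instructions.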
+                        "whilelt p6.b, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "ptrue p7.b\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p2.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p3.s, %[temp], %[width]\n"
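+                        // %[beta0] is non-zero when beta == 0: the accumulators are
+                        // zeroed directly and C is never read.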
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.s, #0\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.s, #0\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "mov z18.s, #0\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                        "mov z19.s, #0\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "mov z20.s, #0\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z21.s, #0\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "mov z22.s, #0\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z23.s, #0\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "mov z24.s, #0\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "mov z25.s, #0\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "mov z26.s, #0\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "mov z27.s, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
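+                        // Label 1: beta != 0.  Broadcast beta with ld1rw, then scale
+                        // the existing C tile into the accumulators before the K loop.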
+                        "1:\n"
+                        "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "mul z16.s, p7/m, z16.s, z15.s\n"
+                        "ld1w z20.s, p0/z, [c_ptr1]\n"
+                        "mul z17.s, p7/m, z17.s, z15.s\n"
+                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "mul z18.s, p7/m, z18.s, z15.s\n"
+                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "mul z19.s, p7/m, z19.s, z15.s\n"
+                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "mul z20.s, p7/m, z20.s, z15.s\n"
+                        "ld1w z24.s, p0/z, [c_ptr2]\n"
+                        "mul z21.s, p7/m, z21.s, z15.s\n"
+                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "mul z22.s, p7/m, z22.s, z15.s\n"
+                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+                        "mul z23.s, p7/m, z23.s, z15.s\n"
+                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+                        "mul z24.s, p7/m, z24.s, z15.s\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mul z25.s, p7/m, z25.s, z15.s\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "mul z26.s, p7/m, z26.s, z15.s\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                        "mul z27.s, p7/m, z27.s, z15.s\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
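+                        // Label 3: main K loop.  Each iteration consumes 32 bytes of
+                        // every A row (two ld1rqb quads) and 32 vectors of packed B,
+                        // loading the next B block while the current one is in use.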
+                        "3:\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "udot z24.s, z8.b, z2.b[0]\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "udot z26.s, z10.b, z2.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "udot z27.s, z11.b, z2.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "udot z24.s, z12.b, z2.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "udot z25.s, z13.b, z2.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "udot z26.s, z14.b, z2.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "udot z27.s, z15.b, z2.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z24.s, z8.b, z2.b[2]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "udot z25.s, z9.b, z2.b[2]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "udot z26.s, z10.b, z2.b[2]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "udot z27.s, z11.b, z2.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "udot z24.s, z12.b, z2.b[3]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z21.s, z13.b, z1.b[3]\n"
+                        "udot z25.s, z13.b, z2.b[3]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "udot z26.s, z14.b, z2.b[3]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "udot z23.s, z15.b, z1.b[3]\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+                        "udot z27.s, z15.b, z2.b[3]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
+                        "udot z20.s, z8.b, z5.b[0]\n"
+                        "udot z24.s, z8.b, z6.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "udot z21.s, z9.b, z5.b[0]\n"
+                        "udot z25.s, z9.b, z6.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "udot z26.s, z10.b, z6.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "udot z23.s, z11.b, z5.b[0]\n"
+                        "udot z27.s, z11.b, z6.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "udot z20.s, z12.b, z5.b[1]\n"
+                        "udot z24.s, z12.b, z6.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "udot z21.s, z13.b, z5.b[1]\n"
+                        "udot z25.s, z13.b, z6.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "udot z22.s, z14.b, z5.b[1]\n"
+                        "udot z26.s, z14.b, z6.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "udot z23.s, z15.b, z5.b[1]\n"
+                        "udot z27.s, z15.b, z6.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "udot z20.s, z8.b, z5.b[2]\n"
+                        "udot z24.s, z8.b, z6.b[2]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "udot z21.s, z9.b, z5.b[2]\n"
+                        "udot z25.s, z9.b, z6.b[2]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "udot z22.s, z10.b, z5.b[2]\n"
+                        "udot z26.s, z10.b, z6.b[2]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "udot z23.s, z11.b, z5.b[2]\n"
+                        "udot z27.s, z11.b, z6.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "udot z20.s, z12.b, z5.b[3]\n"
+                        "udot z24.s, z12.b, z6.b[3]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "udot z21.s, z13.b, z5.b[3]\n"
+                        "udot z25.s, z13.b, z6.b[3]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "udot z22.s, z14.b, z5.b[3]\n"
+                        "udot z26.s, z14.b, z6.b[3]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "udot z23.s, z15.b, z5.b[3]\n"
+                        "udot z27.s, z15.b, z6.b[3]\n"
+                        "b.ne 3b\n"
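+                        // The regs/blocks/leftovers tail below mirrors the 2-row
+                        // block above, extended to three accumulator rows.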
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "udot z24.s, z8.b, z2.b[0]\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "udot z26.s, z10.b, z2.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "udot z27.s, z11.b, z2.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "udot z24.s, z12.b, z2.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "udot z25.s, z13.b, z2.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "udot z26.s, z14.b, z2.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "udot z27.s, z15.b, z2.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z24.s, z8.b, z2.b[2]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "udot z25.s, z9.b, z2.b[2]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "udot z26.s, z10.b, z2.b[2]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "udot z27.s, z11.b, z2.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "udot z24.s, z12.b, z2.b[3]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z21.s, z13.b, z1.b[3]\n"
+                        "udot z25.s, z13.b, z2.b[3]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "udot z26.s, z14.b, z2.b[3]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+                        "udot z23.s, z15.b, z1.b[3]\n"
+                        "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+                        "udot z27.s, z15.b, z2.b[3]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+                        "udot z20.s, z8.b, z5.b[0]\n"
+                        "udot z24.s, z8.b, z6.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "udot z21.s, z9.b, z5.b[0]\n"
+                        "udot z25.s, z9.b, z6.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "udot z26.s, z10.b, z6.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "udot z23.s, z11.b, z5.b[0]\n"
+                        "udot z27.s, z11.b, z6.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "udot z20.s, z12.b, z5.b[1]\n"
+                        "udot z24.s, z12.b, z6.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "udot z21.s, z13.b, z5.b[1]\n"
+                        "udot z25.s, z13.b, z6.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "udot z22.s, z14.b, z5.b[1]\n"
+                        "udot z26.s, z14.b, z6.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "udot z23.s, z15.b, z5.b[1]\n"
+                        "udot z27.s, z15.b, z6.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "udot z20.s, z8.b, z5.b[2]\n"
+                        "udot z24.s, z8.b, z6.b[2]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "udot z21.s, z9.b, z5.b[2]\n"
+                        "udot z25.s, z9.b, z6.b[2]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "udot z22.s, z10.b, z5.b[2]\n"
+                        "udot z26.s, z10.b, z6.b[2]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "udot z23.s, z11.b, z5.b[2]\n"
+                        "udot z27.s, z11.b, z6.b[2]\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "udot z20.s, z12.b, z5.b[3]\n"
+                        "udot z24.s, z12.b, z6.b[3]\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "udot z21.s, z13.b, z5.b[3]\n"
+                        "udot z25.s, z13.b, z6.b[3]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "udot z22.s, z14.b, z5.b[3]\n"
+                        "udot z26.s, z14.b, z6.b[3]\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "udot z23.s, z15.b, z5.b[3]\n"
+                        "udot z27.s, z15.b, z6.b[3]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "udot z24.s, z8.b, z2.b[0]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "udot z26.s, z10.b, z2.b[0]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "udot z27.s, z11.b, z2.b[0]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "udot z24.s, z12.b, z2.b[1]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "udot z25.s, z13.b, z2.b[1]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "udot z26.s, z14.b, z2.b[1]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "udot z27.s, z15.b, z2.b[1]\n"
+                        "b.eq 5f\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z24.s, z8.b, z2.b[2]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "udot z25.s, z9.b, z2.b[2]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "udot z26.s, z10.b, z2.b[2]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "udot z27.s, z11.b, z2.b[2]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "udot z24.s, z12.b, z2.b[3]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z21.s, z13.b, z1.b[3]\n"
+                        "udot z25.s, z13.b, z2.b[3]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "udot z26.s, z14.b, z2.b[3]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "udot z23.s, z15.b, z1.b[3]\n"
+                        "udot z27.s, z15.b, z2.b[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+                        "udot z24.s, z8.b, z2.b[0]\n"
+                        "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z6.b, p6/z, [a_ptr2]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "udot z26.s, z10.b, z2.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "udot z27.s, z11.b, z2.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "udot z24.s, z12.b, z2.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "udot z25.s, z13.b, z2.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "udot z26.s, z14.b, z2.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "udot z27.s, z15.b, z2.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z24.s, z8.b, z2.b[2]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "udot z25.s, z9.b, z2.b[2]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "udot z26.s, z10.b, z2.b[2]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "udot z27.s, z11.b, z2.b[2]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "udot z24.s, z12.b, z2.b[3]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z21.s, z13.b, z1.b[3]\n"
+                        "udot z25.s, z13.b, z2.b[3]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "udot z26.s, z14.b, z2.b[3]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "udot z23.s, z15.b, z1.b[3]\n"
+                        "udot z27.s, z15.b, z2.b[3]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "udot z20.s, z8.b, z5.b[0]\n"
+                        "udot z24.s, z8.b, z6.b[0]\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "udot z21.s, z9.b, z5.b[0]\n"
+                        "udot z25.s, z9.b, z6.b[0]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "udot z26.s, z10.b, z6.b[0]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "udot z23.s, z11.b, z5.b[0]\n"
+                        "udot z27.s, z11.b, z6.b[0]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z20.s, z12.b, z5.b[1]\n"
+                        "udot z24.s, z12.b, z6.b[1]\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "udot z21.s, z13.b, z5.b[1]\n"
+                        "udot z25.s, z13.b, z6.b[1]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "udot z22.s, z14.b, z5.b[1]\n"
+                        "udot z26.s, z14.b, z6.b[1]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "udot z23.s, z15.b, z5.b[1]\n"
+                        "udot z27.s, z15.b, z6.b[1]\n"
+                        "b.eq 5f\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z20.s, z8.b, z5.b[2]\n"
+                        "udot z24.s, z8.b, z6.b[2]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "udot z21.s, z9.b, z5.b[2]\n"
+                        "udot z25.s, z9.b, z6.b[2]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "udot z22.s, z10.b, z5.b[2]\n"
+                        "udot z26.s, z10.b, z6.b[2]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "udot z23.s, z11.b, z5.b[2]\n"
+                        "udot z27.s, z11.b, z6.b[2]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "udot z20.s, z12.b, z5.b[3]\n"
+                        "udot z24.s, z12.b, z6.b[3]\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "udot z21.s, z13.b, z5.b[3]\n"
+                        "udot z25.s, z13.b, z6.b[3]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "udot z22.s, z14.b, z5.b[3]\n"
+                        "udot z26.s, z14.b, z6.b[3]\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "udot z23.s, z15.b, z5.b[3]\n"
+                        "udot z27.s, z15.b, z6.b[3]\n"
+                        "5:\n"
+                        "st1w z16.s, p0, [%[c_ptr0]]\n"
+                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        "st1w z20.s, p0, [c_ptr1]\n"
+                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+                        "st1w z24.s, p0, [c_ptr2]\n"
+                        "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+                        "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+                        "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
+                    );
+                    break;
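+                // The 4-row block uses the full SVE register file: four A streams
+                // (z0-z3 / z4-z7), eight B vectors (z8-z15) and sixteen accumulators
+                // (z16-z31); "default" shares this path, so anything other than 1-3
+                // rows is handled as a block of four.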
+                default:
+                case 4:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "a_ptr3 .req X2\n"
+                        "c_ptr1 .req X3\n"
+                        "c_ptr2 .req X4\n"
+                        "c_ptr3 .req X5\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "add a_ptr3, a_ptr2, %[lda]\n"
+                        "add c_ptr3, c_ptr2, %[ldc]\n"
+                        "whilelt p6.b, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "ptrue p7.b\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p2.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p3.s, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.s, #0\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.s, #0\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "mov z18.s, #0\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                        "mov z19.s, #0\n"
+                        "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                        "mov z20.s, #0\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "mov z21.s, #0\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z22.s, #0\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "mov z23.s, #0\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z24.s, #0\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "mov z25.s, #0\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "mov z26.s, #0\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "mov z27.s, #0\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "mov z28.s, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "mov z29.s, #0\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "mov z30.s, #0\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "mov z31.s, #0\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "mul z16.s, p7/m, z16.s, z15.s\n"
+                        "ld1w z20.s, p0/z, [c_ptr1]\n"
+                        "mul z17.s, p7/m, z17.s, z15.s\n"
+                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "mul z18.s, p7/m, z18.s, z15.s\n"
+                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "mul z19.s, p7/m, z19.s, z15.s\n"
+                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "mul z20.s, p7/m, z20.s, z15.s\n"
+                        "ld1w z24.s, p0/z, [c_ptr2]\n"
+                        "mul z21.s, p7/m, z21.s, z15.s\n"
+                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "mul z22.s, p7/m, z22.s, z15.s\n"
+                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+                        "mul z23.s, p7/m, z23.s, z15.s\n"
+                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+                        "mul z24.s, p7/m, z24.s, z15.s\n"
+                        "ld1w z28.s, p0/z, [c_ptr3]\n"
+                        "mul z25.s, p7/m, z25.s, z15.s\n"
+                        "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+                        "mul z26.s, p7/m, z26.s, z15.s\n"
+                        "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
+                        "mul z27.s, p7/m, z27.s, z15.s\n"
+                        "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
+                        "mul z28.s, p7/m, z28.s, z15.s\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mul z29.s, p7/m, z29.s, z15.s\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "mul z30.s, p7/m, z30.s, z15.s\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                        "mul z31.s, p7/m, z31.s, z15.s\n"
+                        "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
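+                        // Main loop for the 4-row case: same schedule as above, with
+                        // a_ptr3/z3/z7 feeding the fourth accumulator row z28-z31.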
+                        "3:\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "udot z24.s, z8.b, z2.b[0]\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "udot z28.s, z8.b, z3.b[0]\n"
+                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z7.b, p7/z, [a_ptr3]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "udot z29.s, z9.b, z3.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "udot z26.s, z10.b, z2.b[0]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "udot z30.s, z10.b, z3.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "add a_ptr3, a_ptr3, #0x20\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "udot z27.s, z11.b, z2.b[0]\n"
+                        "udot z31.s, z11.b, z3.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "udot z24.s, z12.b, z2.b[1]\n"
+                        "udot z28.s, z12.b, z3.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "udot z25.s, z13.b, z2.b[1]\n"
+                        "udot z29.s, z13.b, z3.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "udot z26.s, z14.b, z2.b[1]\n"
+                        "udot z30.s, z14.b, z3.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "udot z27.s, z15.b, z2.b[1]\n"
+                        "udot z31.s, z15.b, z3.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z24.s, z8.b, z2.b[2]\n"
+                        "udot z28.s, z8.b, z3.b[2]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "udot z25.s, z9.b, z2.b[2]\n"
+                        "udot z29.s, z9.b, z3.b[2]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "udot z26.s, z10.b, z2.b[2]\n"
+                        "udot z30.s, z10.b, z3.b[2]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "udot z27.s, z11.b, z2.b[2]\n"
+                        "udot z31.s, z11.b, z3.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "udot z24.s, z12.b, z2.b[3]\n"
+                        "udot z28.s, z12.b, z3.b[3]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z21.s, z13.b, z1.b[3]\n"
+                        "udot z25.s, z13.b, z2.b[3]\n"
+                        "udot z29.s, z13.b, z3.b[3]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "udot z26.s, z14.b, z2.b[3]\n"
+                        "udot z30.s, z14.b, z3.b[3]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "udot z23.s, z15.b, z1.b[3]\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+                        "udot z27.s, z15.b, z2.b[3]\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
+                        "udot z31.s, z15.b, z3.b[3]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "ld1rqb z3.b, p7/z, [a_ptr3, #-0x10]\n"
+                        "udot z20.s, z8.b, z5.b[0]\n"
+                        "udot z24.s, z8.b, z6.b[0]\n"
+                        "udot z28.s, z8.b, z7.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "udot z21.s, z9.b, z5.b[0]\n"
+                        "udot z25.s, z9.b, z6.b[0]\n"
+                        "udot z29.s, z9.b, z7.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "udot z26.s, z10.b, z6.b[0]\n"
+                        "udot z30.s, z10.b, z7.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "udot z23.s, z11.b, z5.b[0]\n"
+                        "udot z27.s, z11.b, z6.b[0]\n"
+                        "udot z31.s, z11.b, z7.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "udot z20.s, z12.b, z5.b[1]\n"
+                        "udot z24.s, z12.b, z6.b[1]\n"
+                        "udot z28.s, z12.b, z7.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "udot z21.s, z13.b, z5.b[1]\n"
+                        "udot z25.s, z13.b, z6.b[1]\n"
+                        "udot z29.s, z13.b, z7.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "udot z22.s, z14.b, z5.b[1]\n"
+                        "udot z26.s, z14.b, z6.b[1]\n"
+                        "udot z30.s, z14.b, z7.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "udot z23.s, z15.b, z5.b[1]\n"
+                        "udot z27.s, z15.b, z6.b[1]\n"
+                        "udot z31.s, z15.b, z7.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "udot z20.s, z8.b, z5.b[2]\n"
+                        "udot z24.s, z8.b, z6.b[2]\n"
+                        "udot z28.s, z8.b, z7.b[2]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "udot z21.s, z9.b, z5.b[2]\n"
+                        "udot z25.s, z9.b, z6.b[2]\n"
+                        "udot z29.s, z9.b, z7.b[2]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "udot z22.s, z10.b, z5.b[2]\n"
+                        "udot z26.s, z10.b, z6.b[2]\n"
+                        "udot z30.s, z10.b, z7.b[2]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "udot z23.s, z11.b, z5.b[2]\n"
+                        "udot z27.s, z11.b, z6.b[2]\n"
+                        "udot z31.s, z11.b, z7.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "udot z20.s, z12.b, z5.b[3]\n"
+                        "udot z24.s, z12.b, z6.b[3]\n"
+                        "udot z28.s, z12.b, z7.b[3]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "udot z21.s, z13.b, z5.b[3]\n"
+                        "udot z25.s, z13.b, z6.b[3]\n"
+                        "udot z29.s, z13.b, z7.b[3]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "udot z22.s, z14.b, z5.b[3]\n"
+                        "udot z26.s, z14.b, z6.b[3]\n"
+                        "udot z30.s, z14.b, z7.b[3]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "udot z23.s, z15.b, z5.b[3]\n"
+                        "udot z27.s, z15.b, z6.b[3]\n"
+                        "udot z31.s, z15.b, z7.b[3]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
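+                        // Editorial note: the %[regs] path below finishes one more full
+                        // pair of quadwords per row, then preloads the final partial
+                        // quadword under the p6 leftovers predicate for the %[blocks]
+                        // tail.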
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "udot z24.s, z8.b, z2.b[0]\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "udot z28.s, z8.b, z3.b[0]\n"
+                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z7.b, p7/z, [a_ptr3]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
+                        "udot z29.s, z9.b, z3.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "udot z26.s, z10.b, z2.b[0]\n"
+                        "udot z30.s, z10.b, z3.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "udot z27.s, z11.b, z2.b[0]\n"
+                        "udot z31.s, z11.b, z3.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "udot z24.s, z12.b, z2.b[1]\n"
+                        "udot z28.s, z12.b, z3.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "udot z25.s, z13.b, z2.b[1]\n"
+                        "udot z29.s, z13.b, z3.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "udot z26.s, z14.b, z2.b[1]\n"
+                        "udot z30.s, z14.b, z3.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "udot z27.s, z15.b, z2.b[1]\n"
+                        "udot z31.s, z15.b, z3.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z24.s, z8.b, z2.b[2]\n"
+                        "udot z28.s, z8.b, z3.b[2]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "udot z25.s, z9.b, z2.b[2]\n"
+                        "udot z29.s, z9.b, z3.b[2]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "udot z26.s, z10.b, z2.b[2]\n"
+                        "udot z30.s, z10.b, z3.b[2]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "udot z27.s, z11.b, z2.b[2]\n"
+                        "udot z31.s, z11.b, z3.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "udot z24.s, z12.b, z2.b[3]\n"
+                        "udot z28.s, z12.b, z3.b[3]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z21.s, z13.b, z1.b[3]\n"
+                        "udot z25.s, z13.b, z2.b[3]\n"
+                        "udot z29.s, z13.b, z3.b[3]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "udot z26.s, z14.b, z2.b[3]\n"
+                        "udot z30.s, z14.b, z3.b[3]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+                        "udot z23.s, z15.b, z1.b[3]\n"
+                        "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+                        "udot z27.s, z15.b, z2.b[3]\n"
+                        "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+                        "udot z31.s, z15.b, z3.b[3]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+                        "udot z20.s, z8.b, z5.b[0]\n"
+                        "udot z24.s, z8.b, z6.b[0]\n"
+                        "udot z28.s, z8.b, z7.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "udot z21.s, z9.b, z5.b[0]\n"
+                        "udot z25.s, z9.b, z6.b[0]\n"
+                        "udot z29.s, z9.b, z7.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "udot z26.s, z10.b, z6.b[0]\n"
+                        "udot z30.s, z10.b, z7.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "udot z23.s, z11.b, z5.b[0]\n"
+                        "udot z27.s, z11.b, z6.b[0]\n"
+                        "udot z31.s, z11.b, z7.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "udot z20.s, z12.b, z5.b[1]\n"
+                        "udot z24.s, z12.b, z6.b[1]\n"
+                        "udot z28.s, z12.b, z7.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "udot z21.s, z13.b, z5.b[1]\n"
+                        "udot z25.s, z13.b, z6.b[1]\n"
+                        "udot z29.s, z13.b, z7.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "udot z22.s, z14.b, z5.b[1]\n"
+                        "udot z26.s, z14.b, z6.b[1]\n"
+                        "udot z30.s, z14.b, z7.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "udot z23.s, z15.b, z5.b[1]\n"
+                        "udot z27.s, z15.b, z6.b[1]\n"
+                        "udot z31.s, z15.b, z7.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "udot z20.s, z8.b, z5.b[2]\n"
+                        "udot z24.s, z8.b, z6.b[2]\n"
+                        "udot z28.s, z8.b, z7.b[2]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "udot z21.s, z9.b, z5.b[2]\n"
+                        "udot z25.s, z9.b, z6.b[2]\n"
+                        "udot z29.s, z9.b, z7.b[2]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "udot z22.s, z10.b, z5.b[2]\n"
+                        "udot z26.s, z10.b, z6.b[2]\n"
+                        "udot z30.s, z10.b, z7.b[2]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "udot z23.s, z11.b, z5.b[2]\n"
+                        "udot z27.s, z11.b, z6.b[2]\n"
+                        "udot z31.s, z11.b, z7.b[2]\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "udot z20.s, z12.b, z5.b[3]\n"
+                        "udot z24.s, z12.b, z6.b[3]\n"
+                        "udot z28.s, z12.b, z7.b[3]\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "udot z21.s, z13.b, z5.b[3]\n"
+                        "udot z25.s, z13.b, z6.b[3]\n"
+                        "udot z29.s, z13.b, z7.b[3]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "udot z22.s, z14.b, z5.b[3]\n"
+                        "udot z26.s, z14.b, z6.b[3]\n"
+                        "udot z30.s, z14.b, z7.b[3]\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "udot z23.s, z15.b, z5.b[3]\n"
+                        "udot z27.s, z15.b, z6.b[3]\n"
+                        "udot z31.s, z15.b, z7.b[3]\n"
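+                        // Editorial note: %[blocks] counts the remaining 4-byte dot
+                        // groups; each surviving step below consumes one more z0..z3
+                        // byte-group index (.b[0] up to .b[3]) before branching to the
+                        // stores at label 5.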
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "udot z24.s, z8.b, z2.b[0]\n"
+                        "udot z28.s, z8.b, z3.b[0]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
+                        "udot z29.s, z9.b, z3.b[0]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "udot z26.s, z10.b, z2.b[0]\n"
+                        "udot z30.s, z10.b, z3.b[0]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "udot z27.s, z11.b, z2.b[0]\n"
+                        "udot z31.s, z11.b, z3.b[0]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "udot z24.s, z12.b, z2.b[1]\n"
+                        "udot z28.s, z12.b, z3.b[1]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "udot z25.s, z13.b, z2.b[1]\n"
+                        "udot z29.s, z13.b, z3.b[1]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "udot z26.s, z14.b, z2.b[1]\n"
+                        "udot z30.s, z14.b, z3.b[1]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "udot z27.s, z15.b, z2.b[1]\n"
+                        "udot z31.s, z15.b, z3.b[1]\n"
+                        "b.eq 5f\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z24.s, z8.b, z2.b[2]\n"
+                        "udot z28.s, z8.b, z3.b[2]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "udot z25.s, z9.b, z2.b[2]\n"
+                        "udot z29.s, z9.b, z3.b[2]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "udot z26.s, z10.b, z2.b[2]\n"
+                        "udot z30.s, z10.b, z3.b[2]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "udot z27.s, z11.b, z2.b[2]\n"
+                        "udot z31.s, z11.b, z3.b[2]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "udot z24.s, z12.b, z2.b[3]\n"
+                        "udot z28.s, z12.b, z3.b[3]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z21.s, z13.b, z1.b[3]\n"
+                        "udot z25.s, z13.b, z2.b[3]\n"
+                        "udot z29.s, z13.b, z3.b[3]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "udot z26.s, z14.b, z2.b[3]\n"
+                        "udot z30.s, z14.b, z3.b[3]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "udot z23.s, z15.b, z1.b[3]\n"
+                        "udot z27.s, z15.b, z2.b[3]\n"
+                        "udot z31.s, z15.b, z3.b[3]\n"
+                        "b 5f\n"
+                        "4:\n"
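+                        // Editorial note: label 4 is the short-K path taken when
+                        // %[regs] is zero; the A quadwords here are loaded under the
+                        // p6 leftovers predicate since a full 16 bytes per row may not
+                        // remain.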
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+                        "udot z24.s, z8.b, z2.b[0]\n"
+                        "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+                        "udot z28.s, z8.b, z3.b[0]\n"
+                        "ld1rqb z6.b, p6/z, [a_ptr2]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z7.b, p6/z, [a_ptr3]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
+                        "udot z29.s, z9.b, z3.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "udot z26.s, z10.b, z2.b[0]\n"
+                        "udot z30.s, z10.b, z3.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "udot z27.s, z11.b, z2.b[0]\n"
+                        "udot z31.s, z11.b, z3.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "udot z24.s, z12.b, z2.b[1]\n"
+                        "udot z28.s, z12.b, z3.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "udot z25.s, z13.b, z2.b[1]\n"
+                        "udot z29.s, z13.b, z3.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "udot z26.s, z14.b, z2.b[1]\n"
+                        "udot z30.s, z14.b, z3.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "udot z27.s, z15.b, z2.b[1]\n"
+                        "udot z31.s, z15.b, z3.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z24.s, z8.b, z2.b[2]\n"
+                        "udot z28.s, z8.b, z3.b[2]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "udot z25.s, z9.b, z2.b[2]\n"
+                        "udot z29.s, z9.b, z3.b[2]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "udot z26.s, z10.b, z2.b[2]\n"
+                        "udot z30.s, z10.b, z3.b[2]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "udot z27.s, z11.b, z2.b[2]\n"
+                        "udot z31.s, z11.b, z3.b[2]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "udot z24.s, z12.b, z2.b[3]\n"
+                        "udot z28.s, z12.b, z3.b[3]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z21.s, z13.b, z1.b[3]\n"
+                        "udot z25.s, z13.b, z2.b[3]\n"
+                        "udot z29.s, z13.b, z3.b[3]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "udot z26.s, z14.b, z2.b[3]\n"
+                        "udot z30.s, z14.b, z3.b[3]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "udot z23.s, z15.b, z1.b[3]\n"
+                        "udot z27.s, z15.b, z2.b[3]\n"
+                        "udot z31.s, z15.b, z3.b[3]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "udot z20.s, z8.b, z5.b[0]\n"
+                        "udot z24.s, z8.b, z6.b[0]\n"
+                        "udot z28.s, z8.b, z7.b[0]\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "udot z21.s, z9.b, z5.b[0]\n"
+                        "udot z25.s, z9.b, z6.b[0]\n"
+                        "udot z29.s, z9.b, z7.b[0]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "udot z26.s, z10.b, z6.b[0]\n"
+                        "udot z30.s, z10.b, z7.b[0]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "udot z23.s, z11.b, z5.b[0]\n"
+                        "udot z27.s, z11.b, z6.b[0]\n"
+                        "udot z31.s, z11.b, z7.b[0]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z20.s, z12.b, z5.b[1]\n"
+                        "udot z24.s, z12.b, z6.b[1]\n"
+                        "udot z28.s, z12.b, z7.b[1]\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "udot z21.s, z13.b, z5.b[1]\n"
+                        "udot z25.s, z13.b, z6.b[1]\n"
+                        "udot z29.s, z13.b, z7.b[1]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "udot z22.s, z14.b, z5.b[1]\n"
+                        "udot z26.s, z14.b, z6.b[1]\n"
+                        "udot z30.s, z14.b, z7.b[1]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "udot z23.s, z15.b, z5.b[1]\n"
+                        "udot z27.s, z15.b, z6.b[1]\n"
+                        "udot z31.s, z15.b, z7.b[1]\n"
+                        "b.eq 5f\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z20.s, z8.b, z5.b[2]\n"
+                        "udot z24.s, z8.b, z6.b[2]\n"
+                        "udot z28.s, z8.b, z7.b[2]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "udot z21.s, z9.b, z5.b[2]\n"
+                        "udot z25.s, z9.b, z6.b[2]\n"
+                        "udot z29.s, z9.b, z7.b[2]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "udot z22.s, z10.b, z5.b[2]\n"
+                        "udot z26.s, z10.b, z6.b[2]\n"
+                        "udot z30.s, z10.b, z7.b[2]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "udot z23.s, z11.b, z5.b[2]\n"
+                        "udot z27.s, z11.b, z6.b[2]\n"
+                        "udot z31.s, z11.b, z7.b[2]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "udot z20.s, z12.b, z5.b[3]\n"
+                        "udot z24.s, z12.b, z6.b[3]\n"
+                        "udot z28.s, z12.b, z7.b[3]\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "udot z21.s, z13.b, z5.b[3]\n"
+                        "udot z25.s, z13.b, z6.b[3]\n"
+                        "udot z29.s, z13.b, z7.b[3]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "udot z22.s, z14.b, z5.b[3]\n"
+                        "udot z26.s, z14.b, z6.b[3]\n"
+                        "udot z30.s, z14.b, z7.b[3]\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "udot z23.s, z15.b, z5.b[3]\n"
+                        "udot z27.s, z15.b, z6.b[3]\n"
+                        "udot z31.s, z15.b, z7.b[3]\n"
+                        "5:\n"
+                        "st1w z16.s, p0, [%[c_ptr0]]\n"
+                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        "st1w z20.s, p0, [c_ptr1]\n"
+                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+                        "st1w z24.s, p0, [c_ptr2]\n"
+                        "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+                        "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+                        "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+                        "st1w z28.s, p0, [c_ptr3]\n"
+                        "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
+                        "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
+                        "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq a_ptr3\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        ".unreq c_ptr3\n"
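+                        // Editorial note: a_ptr1..a_ptr3 and c_ptr1..c_ptr3 above are
+                        // .req aliases (presumably declared at the top of this asm
+                        // block) for six scratch registers, hence x0-x5 in the clobber
+                        // list below.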
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+                    );
+                    break;
+            }
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
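
For readers tracing the UDOT-based kernels above: a minimal scalar sketch of what one lane-indexed dot instruction contributes (an editorial illustration in plain C++, not part of the patch). It assumes the indexed operand was filled with ld1rqb, i.e. the same 16 A bytes replicated across every 128-bit segment, which is how the kernels above use it.

    #include <cstddef>
    #include <cstdint>

    // Scalar model of "udot zacc.s, zb.b, za.b[idx]" under the ld1rqb
    // assumption stated above: every 32-bit accumulator lane gains the dot
    // product of four consecutive B bytes with the four A bytes that idx
    // selects from the replicated 16-byte quadword.
    static void udot_lane_ref(uint32_t *acc, const uint8_t *b_bytes,
                              const uint8_t *a_quad16, int idx, size_t vl_bytes)
    {
        for (size_t lane = 0; lane < vl_bytes / 4; lane++) {
            uint32_t sum = 0;
            for (int k = 0; k < 4; k++) {
                sum += static_cast<uint32_t>(b_bytes[lane * 4 + k]) *
                       static_cast<uint32_t>(a_quad16[idx * 4 + k]);
            }
            acc[lane] += sum;
        }
    }

Sixteen such updates per 4-byte group (four rows times four column vectors) are what fill the z16-z31 tile in the kernel above.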
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp
index f4d33a9..8228df4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4.hpp
new file mode 100644
index 0000000..6cce601
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ldb, __fp16 *C, int ldc, __fp16 beta, int M, int N, int K);
+
+class native_fp16_mla_4VLx4
+{
+public:
+    typedef __fp16 operand_type;
+    typedef __fp16 result_type;
+
+    typedef void (*kern_type)(const __fp16 *A, int lda, const __fp16 *B, int ldb, __fp16 *C, int ldc, __fp16 beta, int M, int N, int K);
+
+    /* Kernel blocking parameters */
+    static unsigned int out_height()
+    {
+        return 4;
+    }
+
+    static unsigned int out_width()
+    {
+        return get_vector_length<__fp16>() * 4;
+    }
+
+    static unsigned int k_unroll()
+    {
+        return 1;
+    }
+
+
+
+    // Default to the generic kernel
+    kern_type kernel=sve_native_fp16_mla_4VLx4;
+
+    native_fp16_mla_4VLx4(const CPUInfo *ci)
+    {
+
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
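
The blocking getters above pin the output tile at four rows by four vector-lengths of fp16 columns (32 columns at a 128-bit vector length). As a hypothetical caller-side sketch (editorial; walk_tiles, tile_h and tile_w are illustrative names, and the real dispatch lives in the arm_gemm wrappers):

    // Each kernel call covers rows [y, y+tile_h) and columns [x, x+tile_w);
    // tile_h/tile_w stand in for out_height()/out_width() above, and the
    // kernel's own predicates absorb the ragged right and bottom edges.
    void walk_tiles(int M, int N, int tile_h, int tile_w)
    {
        for (int y = 0; y < M; y += tile_h) {
            for (int x = 0; x < N; x += tile_w) {
                // one sve_native_fp16_mla_4VLx4 invocation per (y, x) tile
            }
        }
    }

This matches the loop nest visible at the top of the generic.cpp implementation below.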
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4/generic.cpp
new file mode 100644
index 0000000..f1aaeb1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4/generic.cpp
@@ -0,0 +1,3821 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ldb, __fp16 *C, int ldc, __fp16 beta, int M, int N, int K) {
+    const long beta0 = (beta == 0.0f);
+    const long loops_count = ((K + 8) / 16) - 1;
+    K -= loops_count * 16;
+    const long regs_count = (K / 8) - 1;
+    K -= (regs_count + 1) * 8;
+    const long leftovers = K;
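+    // Editorial worked example (not in the generated source): for K = 50,
+    // loops_count = ((50 + 8) / 16) - 1 = 2, leaving K = 18; regs_count =
+    // (18 / 8) - 1 = 1, leaving leftovers = 2.  The split reconciles:
+    // loops*16 + (regs+1)*8 + leftovers = 32 + 16 + 2 = 50 halfwords per row.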
+
+    for (int y=0; y<M; y+=4) {
+        const __fp16 * const a_ptr0_base = A + (y * lda);
+        const unsigned long ldab = lda * sizeof(__fp16);
+
+        __fp16 *c_ptr0 = C + (y * ldc);
+        const unsigned long ldcb = ldc * sizeof(__fp16);
+
+        for (int x0=0; x0<N; x0+=(4 * get_vector_length<__fp16>())) {
+            const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<__fp16>()));
+            const __fp16 *betaptr = &beta;
+            long loops = loops_count;
+            long regs = regs_count;
+            long temp = 0;
+            long blocks = leftovers;
+            const __fp16 *a_ptr0 = a_ptr0_base;
+            const __fp16 *b_ptr0 = B + x0;
+            long ldbb = ldb * sizeof(__fp16);
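+            // Editorial note: unlike the interleaved kernels, this "native"
+            // kernel reads B in its original layout, stepping one row at a
+            // time with the byte stride computed above (the repeated
+            // "add %[b_ptr0], %[b_ptr0], %[ldb]" in the asm below).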
+
+            switch(M-y) {
+                case 1:
+                    __asm __volatile (
+                        "whilelt p6.h, %[temp], %[leftovers]\n"
+                        "whilelt p0.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "ptrue p7.h\n"
+                        "whilelt p1.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "whilelt p2.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "whilelt p3.h, %[temp], %[width]\n"
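+                        // Editorial note: p0..p3 are each sized against %[width] (with
+                        // an inch between them) so one predicate guards each of the
+                        // four output vectors on a ragged tile; p6 masks the K
+                        // leftovers and p7 is all-true.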
+                        "cbz %[beta0], 1f\n"
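+                        // Editorial note: with beta == 0 (%[beta0] set) the fall-through
+                        // path zeroes the accumulators; otherwise label 1 loads the
+                        // existing C tile and pre-scales it by beta.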
+                        "mov z16.h, #0\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.h, #0\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "mov z18.h, #0\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z19.h, #0\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1rh z15.h, p7/z, [%[betaptr]]\n"
+                        "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
+                        "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "fmul z16.h, p7/m, z16.h, z15.h\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+                        "fmul z17.h, p7/m, z17.h, z15.h\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmul z18.h, p7/m, z18.h, z15.h\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmul z19.h, p7/m, z19.h, z15.h\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
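+                        // Editorial note: main K loop for the single-row case.  Each
+                        // iteration consumes 16 A halfwords (two ld1rqh quadwords,
+                        // a_ptr0 advances 0x20) and steps B forward one row per
+                        // element group via %[ldb], issuing four FMLAs per element.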
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.h, z12.h, z4.h[7]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[7]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[7]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[7]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "cbz %[regs], 4f\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[7]\n"
+                        "fmla z17.h, z13.h, z4.h[7]\n"
+                        "fmla z18.h, z14.h, z4.h[7]\n"
+                        "fmla z19.h, z15.h, z4.h[7]\n"
+                        "cbz %[blocks], 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "cbz %[blocks], 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "5:\n"
+                        "st1h z16.h, p0, [%[c_ptr0]]\n"
+                        "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                    );
+                    break;
+                case 2:
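+                    // Two-row variant: a_ptr1/c_ptr1 shadow the second A and C rows, and
+                    // z20-z23 accumulate the second output row alongside z16-z19.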
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "c_ptr1 .req X1\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "whilelt p6.h, %[temp], %[leftovers]\n"
+                        "whilelt p0.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "ptrue p7.h\n"
+                        "whilelt p1.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "whilelt p2.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "whilelt p3.h, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.h, #0\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.h, #0\n"
+                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+                        "mov z18.h, #0\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "mov z19.h, #0\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z20.h, #0\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "mov z21.h, #0\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z22.h, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "mov z23.h, #0\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1rh z15.h, p7/z, [%[betaptr]]\n"
+                        "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
+                        "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "fmul z16.h, p7/m, z16.h, z15.h\n"
+                        "ld1h z20.h, p0/z, [c_ptr1]\n"
+                        "fmul z17.h, p7/m, z17.h, z15.h\n"
+                        "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "fmul z18.h, p7/m, z18.h, z15.h\n"
+                        "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "fmul z19.h, p7/m, z19.h, z15.h\n"
+                        "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "fmul z20.h, p7/m, z20.h, z15.h\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+                        "fmul z21.h, p7/m, z21.h, z15.h\n"
+                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+                        "fmul z22.h, p7/m, z22.h, z15.h\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmul z23.h, p7/m, z23.h, z15.h\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[7]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z21.h, z13.h, z1.h[7]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z22.h, z14.h, z1.h[7]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "fmla z23.h, z15.h, z1.h[7]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
+                        "fmla z20.h, z8.h, z5.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z21.h, z9.h, z5.h[0]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z22.h, z10.h, z5.h[0]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "fmla z23.h, z11.h, z5.h[0]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[1]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z21.h, z13.h, z5.h[1]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z22.h, z14.h, z5.h[1]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "fmla z23.h, z15.h, z5.h[1]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z5.h[2]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "fmla z21.h, z9.h, z5.h[2]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "fmla z22.h, z10.h, z5.h[2]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "fmla z23.h, z11.h, z5.h[2]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[3]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z21.h, z13.h, z5.h[3]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z22.h, z14.h, z5.h[3]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "fmla z23.h, z15.h, z5.h[3]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z5.h[4]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z21.h, z9.h, z5.h[4]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z22.h, z10.h, z5.h[4]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "fmla z23.h, z11.h, z5.h[4]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[5]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z21.h, z13.h, z5.h[5]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z22.h, z14.h, z5.h[5]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "fmla z23.h, z15.h, z5.h[5]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z5.h[6]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z21.h, z9.h, z5.h[6]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z22.h, z10.h, z5.h[6]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "fmla z23.h, z11.h, z5.h[6]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[7]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[7]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[7]\n"
+                        "fmla z21.h, z13.h, z5.h[7]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[7]\n"
+                        "fmla z22.h, z14.h, z5.h[7]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[7]\n"
+                        "fmla z23.h, z15.h, z5.h[7]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[7]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z21.h, z13.h, z1.h[7]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z22.h, z14.h, z1.h[7]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
+                        "fmla z23.h, z15.h, z1.h[7]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
+                        "fmla z20.h, z8.h, z5.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z21.h, z9.h, z5.h[0]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z22.h, z10.h, z5.h[0]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "fmla z23.h, z11.h, z5.h[0]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[1]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z21.h, z13.h, z5.h[1]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z22.h, z14.h, z5.h[1]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "fmla z23.h, z15.h, z5.h[1]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z5.h[2]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "fmla z21.h, z9.h, z5.h[2]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "fmla z22.h, z10.h, z5.h[2]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "fmla z23.h, z11.h, z5.h[2]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[3]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z21.h, z13.h, z5.h[3]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z22.h, z14.h, z5.h[3]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "fmla z23.h, z15.h, z5.h[3]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z5.h[4]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z21.h, z9.h, z5.h[4]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z22.h, z10.h, z5.h[4]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "fmla z23.h, z11.h, z5.h[4]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[5]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z21.h, z13.h, z5.h[5]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z22.h, z14.h, z5.h[5]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "fmla z23.h, z15.h, z5.h[5]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "fmla z20.h, z8.h, z5.h[6]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z21.h, z9.h, z5.h[6]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z22.h, z10.h, z5.h[6]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "fmla z23.h, z11.h, z5.h[6]\n"
+                        "fmla z16.h, z12.h, z4.h[7]\n"
+                        "fmla z20.h, z12.h, z5.h[7]\n"
+                        "fmla z17.h, z13.h, z4.h[7]\n"
+                        "fmla z21.h, z13.h, z5.h[7]\n"
+                        "fmla z18.h, z14.h, z4.h[7]\n"
+                        "fmla z22.h, z14.h, z5.h[7]\n"
+                        "fmla z19.h, z15.h, z4.h[7]\n"
+                        "fmla z23.h, z15.h, z5.h[7]\n"
+                        "cbz %[blocks], 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z5.h, p6/z, [a_ptr1]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "fmla z20.h, z12.h, z1.h[7]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z21.h, z13.h, z1.h[7]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z22.h, z14.h, z1.h[7]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "fmla z23.h, z15.h, z1.h[7]\n"
+                        "cbz %[blocks], 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "fmla z20.h, z8.h, z5.h[0]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "fmla z21.h, z9.h, z5.h[0]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z22.h, z10.h, z5.h[0]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "fmla z23.h, z11.h, z5.h[0]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "fmla z20.h, z12.h, z5.h[1]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z21.h, z13.h, z5.h[1]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z22.h, z14.h, z5.h[1]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "fmla z23.h, z15.h, z5.h[1]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "fmla z20.h, z8.h, z5.h[2]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "fmla z21.h, z9.h, z5.h[2]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "fmla z22.h, z10.h, z5.h[2]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "fmla z23.h, z11.h, z5.h[2]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "fmla z20.h, z12.h, z5.h[3]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z21.h, z13.h, z5.h[3]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z22.h, z14.h, z5.h[3]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "fmla z23.h, z15.h, z5.h[3]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "fmla z20.h, z8.h, z5.h[4]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z21.h, z9.h, z5.h[4]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z22.h, z10.h, z5.h[4]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "fmla z23.h, z11.h, z5.h[4]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "fmla z20.h, z12.h, z5.h[5]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z21.h, z13.h, z5.h[5]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z22.h, z14.h, z5.h[5]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "fmla z23.h, z15.h, z5.h[5]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "fmla z20.h, z8.h, z5.h[6]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z21.h, z9.h, z5.h[6]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z22.h, z10.h, z5.h[6]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "fmla z23.h, z11.h, z5.h[6]\n"
+                        "5:\n"
+                        "st1h z16.h, p0, [%[c_ptr0]]\n"
+                        "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        "st1h z20.h, p0, [c_ptr1]\n"
+                        "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n"
+                        "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n"
+                        "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq c_ptr1\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
+                    );
+                    break;
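+                // Same kernel specialised for three A/C rows: accumulators z16-z19 (row 0), z20-z23 (row 1), z24-z27 (row 2).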
+                case 3:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "c_ptr1 .req X2\n"
+                        "c_ptr2 .req X3\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "whilelt p6.h, %[temp], %[leftovers]\n"
+                        "whilelt p0.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "ptrue p7.h\n"
+                        "whilelt p1.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "whilelt p2.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "whilelt p3.h, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.h, #0\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.h, #0\n"
+                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+                        "mov z18.h, #0\n"
+                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
+                        "mov z19.h, #0\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "mov z20.h, #0\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z21.h, #0\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "mov z22.h, #0\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z23.h, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "mov z24.h, #0\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "mov z25.h, #0\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "mov z26.h, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z27.h, #0\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1rh z15.h, p7/z, [%[betaptr]]\n"
+                        "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
+                        "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "fmul z16.h, p7/m, z16.h, z15.h\n"
+                        "ld1h z20.h, p0/z, [c_ptr1]\n"
+                        "fmul z17.h, p7/m, z17.h, z15.h\n"
+                        "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "fmul z18.h, p7/m, z18.h, z15.h\n"
+                        "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "fmul z19.h, p7/m, z19.h, z15.h\n"
+                        "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "fmul z20.h, p7/m, z20.h, z15.h\n"
+                        "ld1h z24.h, p0/z, [c_ptr2]\n"
+                        "fmul z21.h, p7/m, z21.h, z15.h\n"
+                        "ld1h z25.h, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "fmul z22.h, p7/m, z22.h, z15.h\n"
+                        "ld1h z26.h, p2/z, [c_ptr2, #2, MUL VL]\n"
+                        "fmul z23.h, p7/m, z23.h, z15.h\n"
+                        "ld1h z27.h, p3/z, [c_ptr2, #3, MUL VL]\n"
+                        "fmul z24.h, p7/m, z24.h, z15.h\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+                        "fmul z25.h, p7/m, z25.h, z15.h\n"
+                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+                        "fmul z26.h, p7/m, z26.h, z15.h\n"
+                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
+                        "fmul z27.h, p7/m, z27.h, z15.h\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+                        "fmla z24.h, z8.h, z2.h[0]\n"
+                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla z25.h, z9.h, z2.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z26.h, z10.h, z2.h[0]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "fmla z27.h, z11.h, z2.h[0]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "fmla z24.h, z12.h, z2.h[1]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "fmla z25.h, z13.h, z2.h[1]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "fmla z26.h, z14.h, z2.h[1]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "fmla z27.h, z15.h, z2.h[1]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z24.h, z8.h, z2.h[2]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "fmla z25.h, z9.h, z2.h[2]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "fmla z26.h, z10.h, z2.h[2]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "fmla z27.h, z11.h, z2.h[2]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "fmla z24.h, z12.h, z2.h[3]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "fmla z25.h, z13.h, z2.h[3]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "fmla z26.h, z14.h, z2.h[3]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "fmla z27.h, z15.h, z2.h[3]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "fmla z24.h, z8.h, z2.h[4]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "fmla z25.h, z9.h, z2.h[4]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "fmla z26.h, z10.h, z2.h[4]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "fmla z27.h, z11.h, z2.h[4]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "fmla z24.h, z12.h, z2.h[5]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "fmla z25.h, z13.h, z2.h[5]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "fmla z26.h, z14.h, z2.h[5]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "fmla z27.h, z15.h, z2.h[5]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z24.h, z8.h, z2.h[6]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z25.h, z9.h, z2.h[6]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z26.h, z10.h, z2.h[6]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "fmla z27.h, z11.h, z2.h[6]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[7]\n"
+                        "fmla z24.h, z12.h, z2.h[7]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z21.h, z13.h, z1.h[7]\n"
+                        "fmla z25.h, z13.h, z2.h[7]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z22.h, z14.h, z1.h[7]\n"
+                        "fmla z26.h, z14.h, z2.h[7]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "fmla z23.h, z15.h, z1.h[7]\n"
+                        "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
+                        "fmla z27.h, z15.h, z2.h[7]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
+                        "fmla z20.h, z8.h, z5.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z24.h, z8.h, z6.h[0]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "fmla z21.h, z9.h, z5.h[0]\n"
+                        "fmla z25.h, z9.h, z6.h[0]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z22.h, z10.h, z5.h[0]\n"
+                        "fmla z26.h, z10.h, z6.h[0]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "fmla z23.h, z11.h, z5.h[0]\n"
+                        "fmla z27.h, z11.h, z6.h[0]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[1]\n"
+                        "fmla z24.h, z12.h, z6.h[1]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z21.h, z13.h, z5.h[1]\n"
+                        "fmla z25.h, z13.h, z6.h[1]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z22.h, z14.h, z5.h[1]\n"
+                        "fmla z26.h, z14.h, z6.h[1]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "fmla z23.h, z15.h, z5.h[1]\n"
+                        "fmla z27.h, z15.h, z6.h[1]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z5.h[2]\n"
+                        "fmla z24.h, z8.h, z6.h[2]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "fmla z21.h, z9.h, z5.h[2]\n"
+                        "fmla z25.h, z9.h, z6.h[2]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "fmla z22.h, z10.h, z5.h[2]\n"
+                        "fmla z26.h, z10.h, z6.h[2]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "fmla z23.h, z11.h, z5.h[2]\n"
+                        "fmla z27.h, z11.h, z6.h[2]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[3]\n"
+                        "fmla z24.h, z12.h, z6.h[3]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z21.h, z13.h, z5.h[3]\n"
+                        "fmla z25.h, z13.h, z6.h[3]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z22.h, z14.h, z5.h[3]\n"
+                        "fmla z26.h, z14.h, z6.h[3]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "fmla z23.h, z15.h, z5.h[3]\n"
+                        "fmla z27.h, z15.h, z6.h[3]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z5.h[4]\n"
+                        "fmla z24.h, z8.h, z6.h[4]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z21.h, z9.h, z5.h[4]\n"
+                        "fmla z25.h, z9.h, z6.h[4]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z22.h, z10.h, z5.h[4]\n"
+                        "fmla z26.h, z10.h, z6.h[4]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "fmla z23.h, z11.h, z5.h[4]\n"
+                        "fmla z27.h, z11.h, z6.h[4]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[5]\n"
+                        "fmla z24.h, z12.h, z6.h[5]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z21.h, z13.h, z5.h[5]\n"
+                        "fmla z25.h, z13.h, z6.h[5]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z22.h, z14.h, z5.h[5]\n"
+                        "fmla z26.h, z14.h, z6.h[5]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "fmla z23.h, z15.h, z5.h[5]\n"
+                        "fmla z27.h, z15.h, z6.h[5]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z5.h[6]\n"
+                        "fmla z24.h, z8.h, z6.h[6]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z21.h, z9.h, z5.h[6]\n"
+                        "fmla z25.h, z9.h, z6.h[6]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z22.h, z10.h, z5.h[6]\n"
+                        "fmla z26.h, z10.h, z6.h[6]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "fmla z23.h, z11.h, z5.h[6]\n"
+                        "fmla z27.h, z11.h, z6.h[6]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[7]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[7]\n"
+                        "fmla z24.h, z12.h, z6.h[7]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[7]\n"
+                        "fmla z21.h, z13.h, z5.h[7]\n"
+                        "fmla z25.h, z13.h, z6.h[7]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[7]\n"
+                        "fmla z22.h, z14.h, z5.h[7]\n"
+                        "fmla z26.h, z14.h, z6.h[7]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[7]\n"
+                        "fmla z23.h, z15.h, z5.h[7]\n"
+                        "fmla z27.h, z15.h, z6.h[7]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+                        "fmla z24.h, z8.h, z2.h[0]\n"
+                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z25.h, z9.h, z2.h[0]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "fmla z26.h, z10.h, z2.h[0]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "fmla z27.h, z11.h, z2.h[0]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "fmla z24.h, z12.h, z2.h[1]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "fmla z25.h, z13.h, z2.h[1]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "fmla z26.h, z14.h, z2.h[1]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "fmla z27.h, z15.h, z2.h[1]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z24.h, z8.h, z2.h[2]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "fmla z25.h, z9.h, z2.h[2]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "fmla z26.h, z10.h, z2.h[2]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "fmla z27.h, z11.h, z2.h[2]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "fmla z24.h, z12.h, z2.h[3]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "fmla z25.h, z13.h, z2.h[3]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "fmla z26.h, z14.h, z2.h[3]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "fmla z27.h, z15.h, z2.h[3]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "fmla z24.h, z8.h, z2.h[4]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "fmla z25.h, z9.h, z2.h[4]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "fmla z26.h, z10.h, z2.h[4]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "fmla z27.h, z11.h, z2.h[4]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "fmla z24.h, z12.h, z2.h[5]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "fmla z25.h, z13.h, z2.h[5]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "fmla z26.h, z14.h, z2.h[5]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "fmla z27.h, z15.h, z2.h[5]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z24.h, z8.h, z2.h[6]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z25.h, z9.h, z2.h[6]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z26.h, z10.h, z2.h[6]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "fmla z27.h, z11.h, z2.h[6]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[7]\n"
+                        "fmla z24.h, z12.h, z2.h[7]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z21.h, z13.h, z1.h[7]\n"
+                        "fmla z25.h, z13.h, z2.h[7]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z22.h, z14.h, z1.h[7]\n"
+                        "fmla z26.h, z14.h, z2.h[7]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
+                        "fmla z23.h, z15.h, z1.h[7]\n"
+                        "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
+                        "fmla z27.h, z15.h, z2.h[7]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
+                        "fmla z20.h, z8.h, z5.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z24.h, z8.h, z6.h[0]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "fmla z21.h, z9.h, z5.h[0]\n"
+                        "fmla z25.h, z9.h, z6.h[0]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z22.h, z10.h, z5.h[0]\n"
+                        "fmla z26.h, z10.h, z6.h[0]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "fmla z23.h, z11.h, z5.h[0]\n"
+                        "fmla z27.h, z11.h, z6.h[0]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[1]\n"
+                        "fmla z24.h, z12.h, z6.h[1]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z21.h, z13.h, z5.h[1]\n"
+                        "fmla z25.h, z13.h, z6.h[1]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z22.h, z14.h, z5.h[1]\n"
+                        "fmla z26.h, z14.h, z6.h[1]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "fmla z23.h, z15.h, z5.h[1]\n"
+                        "fmla z27.h, z15.h, z6.h[1]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z5.h[2]\n"
+                        "fmla z24.h, z8.h, z6.h[2]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "fmla z21.h, z9.h, z5.h[2]\n"
+                        "fmla z25.h, z9.h, z6.h[2]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "fmla z22.h, z10.h, z5.h[2]\n"
+                        "fmla z26.h, z10.h, z6.h[2]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "fmla z23.h, z11.h, z5.h[2]\n"
+                        "fmla z27.h, z11.h, z6.h[2]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[3]\n"
+                        "fmla z24.h, z12.h, z6.h[3]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z21.h, z13.h, z5.h[3]\n"
+                        "fmla z25.h, z13.h, z6.h[3]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z22.h, z14.h, z5.h[3]\n"
+                        "fmla z26.h, z14.h, z6.h[3]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "fmla z23.h, z15.h, z5.h[3]\n"
+                        "fmla z27.h, z15.h, z6.h[3]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z5.h[4]\n"
+                        "fmla z24.h, z8.h, z6.h[4]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z21.h, z9.h, z5.h[4]\n"
+                        "fmla z25.h, z9.h, z6.h[4]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z22.h, z10.h, z5.h[4]\n"
+                        "fmla z26.h, z10.h, z6.h[4]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "fmla z23.h, z11.h, z5.h[4]\n"
+                        "fmla z27.h, z11.h, z6.h[4]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[5]\n"
+                        "fmla z24.h, z12.h, z6.h[5]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z21.h, z13.h, z5.h[5]\n"
+                        "fmla z25.h, z13.h, z6.h[5]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z22.h, z14.h, z5.h[5]\n"
+                        "fmla z26.h, z14.h, z6.h[5]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "fmla z23.h, z15.h, z5.h[5]\n"
+                        "fmla z27.h, z15.h, z6.h[5]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "fmla z20.h, z8.h, z5.h[6]\n"
+                        "fmla z24.h, z8.h, z6.h[6]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z21.h, z9.h, z5.h[6]\n"
+                        "fmla z25.h, z9.h, z6.h[6]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z22.h, z10.h, z5.h[6]\n"
+                        "fmla z26.h, z10.h, z6.h[6]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "fmla z23.h, z11.h, z5.h[6]\n"
+                        "fmla z27.h, z11.h, z6.h[6]\n"
+                        "fmla z16.h, z12.h, z4.h[7]\n"
+                        "fmla z20.h, z12.h, z5.h[7]\n"
+                        "fmla z24.h, z12.h, z6.h[7]\n"
+                        "fmla z17.h, z13.h, z4.h[7]\n"
+                        "fmla z21.h, z13.h, z5.h[7]\n"
+                        "fmla z25.h, z13.h, z6.h[7]\n"
+                        "fmla z18.h, z14.h, z4.h[7]\n"
+                        "fmla z22.h, z14.h, z5.h[7]\n"
+                        "fmla z26.h, z14.h, z6.h[7]\n"
+                        "fmla z19.h, z15.h, z4.h[7]\n"
+                        "fmla z23.h, z15.h, z5.h[7]\n"
+                        "fmla z27.h, z15.h, z6.h[7]\n"
+                        "cbz %[blocks], 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "fmla z24.h, z8.h, z2.h[0]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "fmla z25.h, z9.h, z2.h[0]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "fmla z26.h, z10.h, z2.h[0]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "fmla z27.h, z11.h, z2.h[0]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "fmla z24.h, z12.h, z2.h[1]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "fmla z25.h, z13.h, z2.h[1]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "fmla z26.h, z14.h, z2.h[1]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "fmla z27.h, z15.h, z2.h[1]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z24.h, z8.h, z2.h[2]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "fmla z25.h, z9.h, z2.h[2]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "fmla z26.h, z10.h, z2.h[2]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "fmla z27.h, z11.h, z2.h[2]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "fmla z24.h, z12.h, z2.h[3]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "fmla z25.h, z13.h, z2.h[3]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "fmla z26.h, z14.h, z2.h[3]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "fmla z27.h, z15.h, z2.h[3]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "fmla z24.h, z8.h, z2.h[4]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "fmla z25.h, z9.h, z2.h[4]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "fmla z26.h, z10.h, z2.h[4]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "fmla z27.h, z11.h, z2.h[4]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "fmla z24.h, z12.h, z2.h[5]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "fmla z25.h, z13.h, z2.h[5]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "fmla z26.h, z14.h, z2.h[5]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "fmla z27.h, z15.h, z2.h[5]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z24.h, z8.h, z2.h[6]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z25.h, z9.h, z2.h[6]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z26.h, z10.h, z2.h[6]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "fmla z27.h, z11.h, z2.h[6]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+                        "fmla z24.h, z8.h, z2.h[0]\n"
+                        "ld1rqh z5.h, p6/z, [a_ptr1]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z6.h, p6/z, [a_ptr2]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z25.h, z9.h, z2.h[0]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "fmla z26.h, z10.h, z2.h[0]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "fmla z27.h, z11.h, z2.h[0]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "fmla z24.h, z12.h, z2.h[1]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "fmla z25.h, z13.h, z2.h[1]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "fmla z26.h, z14.h, z2.h[1]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "fmla z27.h, z15.h, z2.h[1]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z24.h, z8.h, z2.h[2]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "fmla z25.h, z9.h, z2.h[2]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "fmla z26.h, z10.h, z2.h[2]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "fmla z27.h, z11.h, z2.h[2]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "fmla z24.h, z12.h, z2.h[3]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "fmla z25.h, z13.h, z2.h[3]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "fmla z26.h, z14.h, z2.h[3]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "fmla z27.h, z15.h, z2.h[3]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "fmla z24.h, z8.h, z2.h[4]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "fmla z25.h, z9.h, z2.h[4]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "fmla z26.h, z10.h, z2.h[4]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "fmla z27.h, z11.h, z2.h[4]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "fmla z24.h, z12.h, z2.h[5]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "fmla z25.h, z13.h, z2.h[5]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "fmla z26.h, z14.h, z2.h[5]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "fmla z27.h, z15.h, z2.h[5]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z24.h, z8.h, z2.h[6]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z25.h, z9.h, z2.h[6]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z26.h, z10.h, z2.h[6]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "fmla z27.h, z11.h, z2.h[6]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "fmla z20.h, z12.h, z1.h[7]\n"
+                        "fmla z24.h, z12.h, z2.h[7]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z21.h, z13.h, z1.h[7]\n"
+                        "fmla z25.h, z13.h, z2.h[7]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z22.h, z14.h, z1.h[7]\n"
+                        "fmla z26.h, z14.h, z2.h[7]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "fmla z23.h, z15.h, z1.h[7]\n"
+                        "fmla z27.h, z15.h, z2.h[7]\n"
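+                        // Drain the remaining K columns one at a time: each step reloads four B
+                        // vectors, issues one round of FMLAs from z4-z6, and exits to 5 once
+                        // %[blocks] reaches zero.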
+                        "cbz %[blocks], 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "fmla z20.h, z8.h, z5.h[0]\n"
+                        "fmla z24.h, z8.h, z6.h[0]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "fmla z21.h, z9.h, z5.h[0]\n"
+                        "fmla z25.h, z9.h, z6.h[0]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z22.h, z10.h, z5.h[0]\n"
+                        "fmla z26.h, z10.h, z6.h[0]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "fmla z23.h, z11.h, z5.h[0]\n"
+                        "fmla z27.h, z11.h, z6.h[0]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "fmla z20.h, z12.h, z5.h[1]\n"
+                        "fmla z24.h, z12.h, z6.h[1]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z21.h, z13.h, z5.h[1]\n"
+                        "fmla z25.h, z13.h, z6.h[1]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z22.h, z14.h, z5.h[1]\n"
+                        "fmla z26.h, z14.h, z6.h[1]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "fmla z23.h, z15.h, z5.h[1]\n"
+                        "fmla z27.h, z15.h, z6.h[1]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "fmla z20.h, z8.h, z5.h[2]\n"
+                        "fmla z24.h, z8.h, z6.h[2]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "fmla z21.h, z9.h, z5.h[2]\n"
+                        "fmla z25.h, z9.h, z6.h[2]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "fmla z22.h, z10.h, z5.h[2]\n"
+                        "fmla z26.h, z10.h, z6.h[2]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "fmla z23.h, z11.h, z5.h[2]\n"
+                        "fmla z27.h, z11.h, z6.h[2]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "fmla z20.h, z12.h, z5.h[3]\n"
+                        "fmla z24.h, z12.h, z6.h[3]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z21.h, z13.h, z5.h[3]\n"
+                        "fmla z25.h, z13.h, z6.h[3]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z22.h, z14.h, z5.h[3]\n"
+                        "fmla z26.h, z14.h, z6.h[3]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "fmla z23.h, z15.h, z5.h[3]\n"
+                        "fmla z27.h, z15.h, z6.h[3]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "fmla z20.h, z8.h, z5.h[4]\n"
+                        "fmla z24.h, z8.h, z6.h[4]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z21.h, z9.h, z5.h[4]\n"
+                        "fmla z25.h, z9.h, z6.h[4]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z22.h, z10.h, z5.h[4]\n"
+                        "fmla z26.h, z10.h, z6.h[4]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "fmla z23.h, z11.h, z5.h[4]\n"
+                        "fmla z27.h, z11.h, z6.h[4]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "fmla z20.h, z12.h, z5.h[5]\n"
+                        "fmla z24.h, z12.h, z6.h[5]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z21.h, z13.h, z5.h[5]\n"
+                        "fmla z25.h, z13.h, z6.h[5]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z22.h, z14.h, z5.h[5]\n"
+                        "fmla z26.h, z14.h, z6.h[5]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "fmla z23.h, z15.h, z5.h[5]\n"
+                        "fmla z27.h, z15.h, z6.h[5]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "fmla z20.h, z8.h, z5.h[6]\n"
+                        "fmla z24.h, z8.h, z6.h[6]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z21.h, z9.h, z5.h[6]\n"
+                        "fmla z25.h, z9.h, z6.h[6]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z22.h, z10.h, z5.h[6]\n"
+                        "fmla z26.h, z10.h, z6.h[6]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "fmla z23.h, z11.h, z5.h[6]\n"
+                        "fmla z27.h, z11.h, z6.h[6]\n"
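+                        // 5: store the accumulators for all three rows under the column predicates p0-p3.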
+                        "5:\n"
+                        "st1h z16.h, p0, [%[c_ptr0]]\n"
+                        "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        "st1h z20.h, p0, [c_ptr1]\n"
+                        "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n"
+                        "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n"
+                        "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n"
+                        "st1h z24.h, p0, [c_ptr2]\n"
+                        "st1h z25.h, p1, [c_ptr2, #1, MUL VL]\n"
+                        "st1h z26.h, p2, [c_ptr2, #2, MUL VL]\n"
+                        "st1h z27.h, p3, [c_ptr2, #3, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
+                    );
+                    break;
+                default:
+                case 4:
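+                    // Four-row variant (also the default): all of z16-z31 serve as accumulators,
+                    // four rows by four vector-lengths of columns.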
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "a_ptr3 .req X2\n"
+                        "c_ptr1 .req X3\n"
+                        "c_ptr2 .req X4\n"
+                        "c_ptr3 .req X5\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "add a_ptr3, a_ptr2, %[lda]\n"
+                        "add c_ptr3, c_ptr2, %[ldc]\n"
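+                        // Predicate setup: p6 masks the leftover K elements of the final A fragment,
+                        // p0-p3 mask up to four vector-lengths of output columns, p7 is all-true.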
+                        "whilelt p6.h, %[temp], %[leftovers]\n"
+                        "whilelt p0.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "ptrue p7.h\n"
+                        "whilelt p1.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "whilelt p2.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "whilelt p3.h, %[temp], %[width]\n"
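+                        // If %[beta0] is zero, branch to 1 to load C and scale it by beta;
+                        // otherwise fall through and zero-initialise the accumulators.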
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.h, #0\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.h, #0\n"
+                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+                        "mov z18.h, #0\n"
+                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
+                        "mov z19.h, #0\n"
+                        "ld1rqh z3.h, p7/z, [a_ptr3]\n"
+                        "mov z20.h, #0\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "mov z21.h, #0\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z22.h, #0\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "mov z23.h, #0\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z24.h, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "mov z25.h, #0\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "mov z26.h, #0\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "mov z27.h, #0\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "mov z28.h, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z29.h, #0\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "mov z30.h, #0\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z31.h, #0\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
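+                        // Skip the main loop entirely when no full iterations remain.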
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
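+                        // 1: beta != 0 path: broadcast beta from %[betaptr] into z15, load the
+                        // existing C block into z16-z31 and scale it, then prime the first A and B registers.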
+                        "1:\n"
+                        "ld1rh z15.h, p7/z, [%[betaptr]]\n"
+                        "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
+                        "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "fmul z16.h, p7/m, z16.h, z15.h\n"
+                        "ld1h z20.h, p0/z, [c_ptr1]\n"
+                        "fmul z17.h, p7/m, z17.h, z15.h\n"
+                        "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "fmul z18.h, p7/m, z18.h, z15.h\n"
+                        "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "fmul z19.h, p7/m, z19.h, z15.h\n"
+                        "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "fmul z20.h, p7/m, z20.h, z15.h\n"
+                        "ld1h z24.h, p0/z, [c_ptr2]\n"
+                        "fmul z21.h, p7/m, z21.h, z15.h\n"
+                        "ld1h z25.h, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "fmul z22.h, p7/m, z22.h, z15.h\n"
+                        "ld1h z26.h, p2/z, [c_ptr2, #2, MUL VL]\n"
+                        "fmul z23.h, p7/m, z23.h, z15.h\n"
+                        "ld1h z27.h, p3/z, [c_ptr2, #3, MUL VL]\n"
+                        "fmul z24.h, p7/m, z24.h, z15.h\n"
+                        "ld1h z28.h, p0/z, [c_ptr3]\n"
+                        "fmul z25.h, p7/m, z25.h, z15.h\n"
+                        "ld1h z29.h, p1/z, [c_ptr3, #1, MUL VL]\n"
+                        "fmul z26.h, p7/m, z26.h, z15.h\n"
+                        "ld1h z30.h, p2/z, [c_ptr3, #2, MUL VL]\n"
+                        "fmul z27.h, p7/m, z27.h, z15.h\n"
+                        "ld1h z31.h, p3/z, [c_ptr3, #3, MUL VL]\n"
+                        "fmul z28.h, p7/m, z28.h, z15.h\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+                        "fmul z29.h, p7/m, z29.h, z15.h\n"
+                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+                        "fmul z30.h, p7/m, z30.h, z15.h\n"
+                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
+                        "fmul z31.h, p7/m, z31.h, z15.h\n"
+                        "ld1rqh z3.h, p7/z, [a_ptr3]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "cbz %[loops], 2f\n"
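+                        // 3: main K loop. Each iteration consumes sixteen K values (two unrolled
+                        // blocks of eight), ping-ponging A between z0-z3 and z4-z7 while B is
+                        // streamed through z8-z15.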
+                        "3:\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+                        "fmla z24.h, z8.h, z2.h[0]\n"
+                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+                        "fmla z28.h, z8.h, z3.h[0]\n"
+                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z7.h, p7/z, [a_ptr3]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla z25.h, z9.h, z2.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z29.h, z9.h, z3.h[0]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla z26.h, z10.h, z2.h[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "fmla z30.h, z10.h, z3.h[0]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "add a_ptr3, a_ptr3, #0x20\n"
+                        "fmla z27.h, z11.h, z2.h[0]\n"
+                        "fmla z31.h, z11.h, z3.h[0]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "fmla z24.h, z12.h, z2.h[1]\n"
+                        "fmla z28.h, z12.h, z3.h[1]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "fmla z25.h, z13.h, z2.h[1]\n"
+                        "fmla z29.h, z13.h, z3.h[1]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "fmla z26.h, z14.h, z2.h[1]\n"
+                        "fmla z30.h, z14.h, z3.h[1]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "fmla z27.h, z15.h, z2.h[1]\n"
+                        "fmla z31.h, z15.h, z3.h[1]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z24.h, z8.h, z2.h[2]\n"
+                        "fmla z28.h, z8.h, z3.h[2]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "fmla z25.h, z9.h, z2.h[2]\n"
+                        "fmla z29.h, z9.h, z3.h[2]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "fmla z26.h, z10.h, z2.h[2]\n"
+                        "fmla z30.h, z10.h, z3.h[2]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "fmla z27.h, z11.h, z2.h[2]\n"
+                        "fmla z31.h, z11.h, z3.h[2]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "fmla z24.h, z12.h, z2.h[3]\n"
+                        "fmla z28.h, z12.h, z3.h[3]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "fmla z25.h, z13.h, z2.h[3]\n"
+                        "fmla z29.h, z13.h, z3.h[3]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "fmla z26.h, z14.h, z2.h[3]\n"
+                        "fmla z30.h, z14.h, z3.h[3]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "fmla z27.h, z15.h, z2.h[3]\n"
+                        "fmla z31.h, z15.h, z3.h[3]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "fmla z24.h, z8.h, z2.h[4]\n"
+                        "fmla z28.h, z8.h, z3.h[4]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "fmla z25.h, z9.h, z2.h[4]\n"
+                        "fmla z29.h, z9.h, z3.h[4]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "fmla z26.h, z10.h, z2.h[4]\n"
+                        "fmla z30.h, z10.h, z3.h[4]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "fmla z27.h, z11.h, z2.h[4]\n"
+                        "fmla z31.h, z11.h, z3.h[4]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "fmla z24.h, z12.h, z2.h[5]\n"
+                        "fmla z28.h, z12.h, z3.h[5]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "fmla z25.h, z13.h, z2.h[5]\n"
+                        "fmla z29.h, z13.h, z3.h[5]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "fmla z26.h, z14.h, z2.h[5]\n"
+                        "fmla z30.h, z14.h, z3.h[5]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "fmla z27.h, z15.h, z2.h[5]\n"
+                        "fmla z31.h, z15.h, z3.h[5]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z24.h, z8.h, z2.h[6]\n"
+                        "fmla z28.h, z8.h, z3.h[6]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z25.h, z9.h, z2.h[6]\n"
+                        "fmla z29.h, z9.h, z3.h[6]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z26.h, z10.h, z2.h[6]\n"
+                        "fmla z30.h, z10.h, z3.h[6]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "fmla z27.h, z11.h, z2.h[6]\n"
+                        "fmla z31.h, z11.h, z3.h[6]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[7]\n"
+                        "fmla z24.h, z12.h, z2.h[7]\n"
+                        "fmla z28.h, z12.h, z3.h[7]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z21.h, z13.h, z1.h[7]\n"
+                        "fmla z25.h, z13.h, z2.h[7]\n"
+                        "fmla z29.h, z13.h, z3.h[7]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z22.h, z14.h, z1.h[7]\n"
+                        "fmla z26.h, z14.h, z2.h[7]\n"
+                        "fmla z30.h, z14.h, z3.h[7]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "fmla z23.h, z15.h, z1.h[7]\n"
+                        "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
+                        "fmla z27.h, z15.h, z2.h[7]\n"
+                        "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
+                        "fmla z31.h, z15.h, z3.h[7]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n"
+                        "fmla z20.h, z8.h, z5.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z24.h, z8.h, z6.h[0]\n"
+                        "fmla z28.h, z8.h, z7.h[0]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "fmla z21.h, z9.h, z5.h[0]\n"
+                        "fmla z25.h, z9.h, z6.h[0]\n"
+                        "fmla z29.h, z9.h, z7.h[0]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z22.h, z10.h, z5.h[0]\n"
+                        "fmla z26.h, z10.h, z6.h[0]\n"
+                        "fmla z30.h, z10.h, z7.h[0]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "fmla z23.h, z11.h, z5.h[0]\n"
+                        "fmla z27.h, z11.h, z6.h[0]\n"
+                        "fmla z31.h, z11.h, z7.h[0]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[1]\n"
+                        "fmla z24.h, z12.h, z6.h[1]\n"
+                        "fmla z28.h, z12.h, z7.h[1]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z21.h, z13.h, z5.h[1]\n"
+                        "fmla z25.h, z13.h, z6.h[1]\n"
+                        "fmla z29.h, z13.h, z7.h[1]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z22.h, z14.h, z5.h[1]\n"
+                        "fmla z26.h, z14.h, z6.h[1]\n"
+                        "fmla z30.h, z14.h, z7.h[1]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "fmla z23.h, z15.h, z5.h[1]\n"
+                        "fmla z27.h, z15.h, z6.h[1]\n"
+                        "fmla z31.h, z15.h, z7.h[1]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z5.h[2]\n"
+                        "fmla z24.h, z8.h, z6.h[2]\n"
+                        "fmla z28.h, z8.h, z7.h[2]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "fmla z21.h, z9.h, z5.h[2]\n"
+                        "fmla z25.h, z9.h, z6.h[2]\n"
+                        "fmla z29.h, z9.h, z7.h[2]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "fmla z22.h, z10.h, z5.h[2]\n"
+                        "fmla z26.h, z10.h, z6.h[2]\n"
+                        "fmla z30.h, z10.h, z7.h[2]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "fmla z23.h, z11.h, z5.h[2]\n"
+                        "fmla z27.h, z11.h, z6.h[2]\n"
+                        "fmla z31.h, z11.h, z7.h[2]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[3]\n"
+                        "fmla z24.h, z12.h, z6.h[3]\n"
+                        "fmla z28.h, z12.h, z7.h[3]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z21.h, z13.h, z5.h[3]\n"
+                        "fmla z25.h, z13.h, z6.h[3]\n"
+                        "fmla z29.h, z13.h, z7.h[3]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z22.h, z14.h, z5.h[3]\n"
+                        "fmla z26.h, z14.h, z6.h[3]\n"
+                        "fmla z30.h, z14.h, z7.h[3]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "fmla z23.h, z15.h, z5.h[3]\n"
+                        "fmla z27.h, z15.h, z6.h[3]\n"
+                        "fmla z31.h, z15.h, z7.h[3]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z5.h[4]\n"
+                        "fmla z24.h, z8.h, z6.h[4]\n"
+                        "fmla z28.h, z8.h, z7.h[4]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z21.h, z9.h, z5.h[4]\n"
+                        "fmla z25.h, z9.h, z6.h[4]\n"
+                        "fmla z29.h, z9.h, z7.h[4]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z22.h, z10.h, z5.h[4]\n"
+                        "fmla z26.h, z10.h, z6.h[4]\n"
+                        "fmla z30.h, z10.h, z7.h[4]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "fmla z23.h, z11.h, z5.h[4]\n"
+                        "fmla z27.h, z11.h, z6.h[4]\n"
+                        "fmla z31.h, z11.h, z7.h[4]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[5]\n"
+                        "fmla z24.h, z12.h, z6.h[5]\n"
+                        "fmla z28.h, z12.h, z7.h[5]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z21.h, z13.h, z5.h[5]\n"
+                        "fmla z25.h, z13.h, z6.h[5]\n"
+                        "fmla z29.h, z13.h, z7.h[5]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z22.h, z14.h, z5.h[5]\n"
+                        "fmla z26.h, z14.h, z6.h[5]\n"
+                        "fmla z30.h, z14.h, z7.h[5]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "fmla z23.h, z15.h, z5.h[5]\n"
+                        "fmla z27.h, z15.h, z6.h[5]\n"
+                        "fmla z31.h, z15.h, z7.h[5]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z5.h[6]\n"
+                        "fmla z24.h, z8.h, z6.h[6]\n"
+                        "fmla z28.h, z8.h, z7.h[6]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z21.h, z9.h, z5.h[6]\n"
+                        "fmla z25.h, z9.h, z6.h[6]\n"
+                        "fmla z29.h, z9.h, z7.h[6]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z22.h, z10.h, z5.h[6]\n"
+                        "fmla z26.h, z10.h, z6.h[6]\n"
+                        "fmla z30.h, z10.h, z7.h[6]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "fmla z23.h, z11.h, z5.h[6]\n"
+                        "fmla z27.h, z11.h, z6.h[6]\n"
+                        "fmla z31.h, z11.h, z7.h[6]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[7]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[7]\n"
+                        "fmla z24.h, z12.h, z6.h[7]\n"
+                        "fmla z28.h, z12.h, z7.h[7]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[7]\n"
+                        "fmla z21.h, z13.h, z5.h[7]\n"
+                        "fmla z25.h, z13.h, z6.h[7]\n"
+                        "fmla z29.h, z13.h, z7.h[7]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[7]\n"
+                        "fmla z22.h, z14.h, z5.h[7]\n"
+                        "fmla z26.h, z14.h, z6.h[7]\n"
+                        "fmla z30.h, z14.h, z7.h[7]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[7]\n"
+                        "fmla z23.h, z15.h, z5.h[7]\n"
+                        "fmla z27.h, z15.h, z6.h[7]\n"
+                        "fmla z31.h, z15.h, z7.h[7]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
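+                        // 2: loop epilogue. %[regs] selects the tail length: fall through for the
+                        // longer tail (a full block of eight plus the p6-predicated remainder), or
+                        // branch to 4 for the shorter one.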
+                        "cbz %[regs], 4f\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+                        "fmla z24.h, z8.h, z2.h[0]\n"
+                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+                        "fmla z28.h, z8.h, z3.h[0]\n"
+                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z7.h, p7/z, [a_ptr3]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z25.h, z9.h, z2.h[0]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z29.h, z9.h, z3.h[0]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "fmla z26.h, z10.h, z2.h[0]\n"
+                        "fmla z30.h, z10.h, z3.h[0]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "fmla z27.h, z11.h, z2.h[0]\n"
+                        "fmla z31.h, z11.h, z3.h[0]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "fmla z24.h, z12.h, z2.h[1]\n"
+                        "fmla z28.h, z12.h, z3.h[1]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "fmla z25.h, z13.h, z2.h[1]\n"
+                        "fmla z29.h, z13.h, z3.h[1]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "fmla z26.h, z14.h, z2.h[1]\n"
+                        "fmla z30.h, z14.h, z3.h[1]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "fmla z27.h, z15.h, z2.h[1]\n"
+                        "fmla z31.h, z15.h, z3.h[1]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z24.h, z8.h, z2.h[2]\n"
+                        "fmla z28.h, z8.h, z3.h[2]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "fmla z25.h, z9.h, z2.h[2]\n"
+                        "fmla z29.h, z9.h, z3.h[2]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "fmla z26.h, z10.h, z2.h[2]\n"
+                        "fmla z30.h, z10.h, z3.h[2]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "fmla z27.h, z11.h, z2.h[2]\n"
+                        "fmla z31.h, z11.h, z3.h[2]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "fmla z24.h, z12.h, z2.h[3]\n"
+                        "fmla z28.h, z12.h, z3.h[3]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "fmla z25.h, z13.h, z2.h[3]\n"
+                        "fmla z29.h, z13.h, z3.h[3]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "fmla z26.h, z14.h, z2.h[3]\n"
+                        "fmla z30.h, z14.h, z3.h[3]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "fmla z27.h, z15.h, z2.h[3]\n"
+                        "fmla z31.h, z15.h, z3.h[3]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "fmla z24.h, z8.h, z2.h[4]\n"
+                        "fmla z28.h, z8.h, z3.h[4]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "fmla z25.h, z9.h, z2.h[4]\n"
+                        "fmla z29.h, z9.h, z3.h[4]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "fmla z26.h, z10.h, z2.h[4]\n"
+                        "fmla z30.h, z10.h, z3.h[4]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "fmla z27.h, z11.h, z2.h[4]\n"
+                        "fmla z31.h, z11.h, z3.h[4]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "fmla z24.h, z12.h, z2.h[5]\n"
+                        "fmla z28.h, z12.h, z3.h[5]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "fmla z25.h, z13.h, z2.h[5]\n"
+                        "fmla z29.h, z13.h, z3.h[5]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "fmla z26.h, z14.h, z2.h[5]\n"
+                        "fmla z30.h, z14.h, z3.h[5]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "fmla z27.h, z15.h, z2.h[5]\n"
+                        "fmla z31.h, z15.h, z3.h[5]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z24.h, z8.h, z2.h[6]\n"
+                        "fmla z28.h, z8.h, z3.h[6]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z25.h, z9.h, z2.h[6]\n"
+                        "fmla z29.h, z9.h, z3.h[6]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z26.h, z10.h, z2.h[6]\n"
+                        "fmla z30.h, z10.h, z3.h[6]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "fmla z27.h, z11.h, z2.h[6]\n"
+                        "fmla z31.h, z11.h, z3.h[6]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[7]\n"
+                        "fmla z24.h, z12.h, z2.h[7]\n"
+                        "fmla z28.h, z12.h, z3.h[7]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z21.h, z13.h, z1.h[7]\n"
+                        "fmla z25.h, z13.h, z2.h[7]\n"
+                        "fmla z29.h, z13.h, z3.h[7]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z22.h, z14.h, z1.h[7]\n"
+                        "fmla z26.h, z14.h, z2.h[7]\n"
+                        "fmla z30.h, z14.h, z3.h[7]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
+                        "fmla z23.h, z15.h, z1.h[7]\n"
+                        "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
+                        "fmla z27.h, z15.h, z2.h[7]\n"
+                        "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
+                        "fmla z31.h, z15.h, z3.h[7]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n"
+                        "fmla z20.h, z8.h, z5.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z24.h, z8.h, z6.h[0]\n"
+                        "fmla z28.h, z8.h, z7.h[0]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "fmla z21.h, z9.h, z5.h[0]\n"
+                        "fmla z25.h, z9.h, z6.h[0]\n"
+                        "fmla z29.h, z9.h, z7.h[0]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z22.h, z10.h, z5.h[0]\n"
+                        "fmla z26.h, z10.h, z6.h[0]\n"
+                        "fmla z30.h, z10.h, z7.h[0]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "fmla z23.h, z11.h, z5.h[0]\n"
+                        "fmla z27.h, z11.h, z6.h[0]\n"
+                        "fmla z31.h, z11.h, z7.h[0]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[1]\n"
+                        "fmla z24.h, z12.h, z6.h[1]\n"
+                        "fmla z28.h, z12.h, z7.h[1]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z21.h, z13.h, z5.h[1]\n"
+                        "fmla z25.h, z13.h, z6.h[1]\n"
+                        "fmla z29.h, z13.h, z7.h[1]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z22.h, z14.h, z5.h[1]\n"
+                        "fmla z26.h, z14.h, z6.h[1]\n"
+                        "fmla z30.h, z14.h, z7.h[1]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "fmla z23.h, z15.h, z5.h[1]\n"
+                        "fmla z27.h, z15.h, z6.h[1]\n"
+                        "fmla z31.h, z15.h, z7.h[1]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z5.h[2]\n"
+                        "fmla z24.h, z8.h, z6.h[2]\n"
+                        "fmla z28.h, z8.h, z7.h[2]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "fmla z21.h, z9.h, z5.h[2]\n"
+                        "fmla z25.h, z9.h, z6.h[2]\n"
+                        "fmla z29.h, z9.h, z7.h[2]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "fmla z22.h, z10.h, z5.h[2]\n"
+                        "fmla z26.h, z10.h, z6.h[2]\n"
+                        "fmla z30.h, z10.h, z7.h[2]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "fmla z23.h, z11.h, z5.h[2]\n"
+                        "fmla z27.h, z11.h, z6.h[2]\n"
+                        "fmla z31.h, z11.h, z7.h[2]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[3]\n"
+                        "fmla z24.h, z12.h, z6.h[3]\n"
+                        "fmla z28.h, z12.h, z7.h[3]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z21.h, z13.h, z5.h[3]\n"
+                        "fmla z25.h, z13.h, z6.h[3]\n"
+                        "fmla z29.h, z13.h, z7.h[3]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z22.h, z14.h, z5.h[3]\n"
+                        "fmla z26.h, z14.h, z6.h[3]\n"
+                        "fmla z30.h, z14.h, z7.h[3]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "fmla z23.h, z15.h, z5.h[3]\n"
+                        "fmla z27.h, z15.h, z6.h[3]\n"
+                        "fmla z31.h, z15.h, z7.h[3]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z5.h[4]\n"
+                        "fmla z24.h, z8.h, z6.h[4]\n"
+                        "fmla z28.h, z8.h, z7.h[4]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z21.h, z9.h, z5.h[4]\n"
+                        "fmla z25.h, z9.h, z6.h[4]\n"
+                        "fmla z29.h, z9.h, z7.h[4]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z22.h, z10.h, z5.h[4]\n"
+                        "fmla z26.h, z10.h, z6.h[4]\n"
+                        "fmla z30.h, z10.h, z7.h[4]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "fmla z23.h, z11.h, z5.h[4]\n"
+                        "fmla z27.h, z11.h, z6.h[4]\n"
+                        "fmla z31.h, z11.h, z7.h[4]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[5]\n"
+                        "fmla z24.h, z12.h, z6.h[5]\n"
+                        "fmla z28.h, z12.h, z7.h[5]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z21.h, z13.h, z5.h[5]\n"
+                        "fmla z25.h, z13.h, z6.h[5]\n"
+                        "fmla z29.h, z13.h, z7.h[5]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z22.h, z14.h, z5.h[5]\n"
+                        "fmla z26.h, z14.h, z6.h[5]\n"
+                        "fmla z30.h, z14.h, z7.h[5]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "fmla z23.h, z15.h, z5.h[5]\n"
+                        "fmla z27.h, z15.h, z6.h[5]\n"
+                        "fmla z31.h, z15.h, z7.h[5]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "fmla z20.h, z8.h, z5.h[6]\n"
+                        "fmla z24.h, z8.h, z6.h[6]\n"
+                        "fmla z28.h, z8.h, z7.h[6]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z21.h, z9.h, z5.h[6]\n"
+                        "fmla z25.h, z9.h, z6.h[6]\n"
+                        "fmla z29.h, z9.h, z7.h[6]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z22.h, z10.h, z5.h[6]\n"
+                        "fmla z26.h, z10.h, z6.h[6]\n"
+                        "fmla z30.h, z10.h, z7.h[6]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "fmla z23.h, z11.h, z5.h[6]\n"
+                        "fmla z27.h, z11.h, z6.h[6]\n"
+                        "fmla z31.h, z11.h, z7.h[6]\n"
+                        "fmla z16.h, z12.h, z4.h[7]\n"
+                        "fmla z20.h, z12.h, z5.h[7]\n"
+                        "fmla z24.h, z12.h, z6.h[7]\n"
+                        "fmla z28.h, z12.h, z7.h[7]\n"
+                        "fmla z17.h, z13.h, z4.h[7]\n"
+                        "fmla z21.h, z13.h, z5.h[7]\n"
+                        "fmla z25.h, z13.h, z6.h[7]\n"
+                        "fmla z29.h, z13.h, z7.h[7]\n"
+                        "fmla z18.h, z14.h, z4.h[7]\n"
+                        "fmla z22.h, z14.h, z5.h[7]\n"
+                        "fmla z26.h, z14.h, z6.h[7]\n"
+                        "fmla z30.h, z14.h, z7.h[7]\n"
+                        "fmla z19.h, z15.h, z4.h[7]\n"
+                        "fmla z23.h, z15.h, z5.h[7]\n"
+                        "fmla z27.h, z15.h, z6.h[7]\n"
+                        "fmla z31.h, z15.h, z7.h[7]\n"
+                        "cbz %[blocks], 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "fmla z24.h, z8.h, z2.h[0]\n"
+                        "fmla z28.h, z8.h, z3.h[0]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "fmla z25.h, z9.h, z2.h[0]\n"
+                        "fmla z29.h, z9.h, z3.h[0]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "fmla z26.h, z10.h, z2.h[0]\n"
+                        "fmla z30.h, z10.h, z3.h[0]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "fmla z27.h, z11.h, z2.h[0]\n"
+                        "fmla z31.h, z11.h, z3.h[0]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "fmla z24.h, z12.h, z2.h[1]\n"
+                        "fmla z28.h, z12.h, z3.h[1]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "fmla z25.h, z13.h, z2.h[1]\n"
+                        "fmla z29.h, z13.h, z3.h[1]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "fmla z26.h, z14.h, z2.h[1]\n"
+                        "fmla z30.h, z14.h, z3.h[1]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "fmla z27.h, z15.h, z2.h[1]\n"
+                        "fmla z31.h, z15.h, z3.h[1]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z24.h, z8.h, z2.h[2]\n"
+                        "fmla z28.h, z8.h, z3.h[2]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "fmla z25.h, z9.h, z2.h[2]\n"
+                        "fmla z29.h, z9.h, z3.h[2]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "fmla z26.h, z10.h, z2.h[2]\n"
+                        "fmla z30.h, z10.h, z3.h[2]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "fmla z27.h, z11.h, z2.h[2]\n"
+                        "fmla z31.h, z11.h, z3.h[2]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "fmla z24.h, z12.h, z2.h[3]\n"
+                        "fmla z28.h, z12.h, z3.h[3]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "fmla z25.h, z13.h, z2.h[3]\n"
+                        "fmla z29.h, z13.h, z3.h[3]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "fmla z26.h, z14.h, z2.h[3]\n"
+                        "fmla z30.h, z14.h, z3.h[3]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "fmla z27.h, z15.h, z2.h[3]\n"
+                        "fmla z31.h, z15.h, z3.h[3]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "fmla z24.h, z8.h, z2.h[4]\n"
+                        "fmla z28.h, z8.h, z3.h[4]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "fmla z25.h, z9.h, z2.h[4]\n"
+                        "fmla z29.h, z9.h, z3.h[4]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "fmla z26.h, z10.h, z2.h[4]\n"
+                        "fmla z30.h, z10.h, z3.h[4]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "fmla z27.h, z11.h, z2.h[4]\n"
+                        "fmla z31.h, z11.h, z3.h[4]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "fmla z24.h, z12.h, z2.h[5]\n"
+                        "fmla z28.h, z12.h, z3.h[5]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "fmla z25.h, z13.h, z2.h[5]\n"
+                        "fmla z29.h, z13.h, z3.h[5]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "fmla z26.h, z14.h, z2.h[5]\n"
+                        "fmla z30.h, z14.h, z3.h[5]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "fmla z27.h, z15.h, z2.h[5]\n"
+                        "fmla z31.h, z15.h, z3.h[5]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z24.h, z8.h, z2.h[6]\n"
+                        "fmla z28.h, z8.h, z3.h[6]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z25.h, z9.h, z2.h[6]\n"
+                        "fmla z29.h, z9.h, z3.h[6]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z26.h, z10.h, z2.h[6]\n"
+                        "fmla z30.h, z10.h, z3.h[6]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "fmla z27.h, z11.h, z2.h[6]\n"
+                        "fmla z31.h, z11.h, z3.h[6]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+                        "fmla z24.h, z8.h, z2.h[0]\n"
+                        "ld1rqh z5.h, p6/z, [a_ptr1]\n"
+                        "fmla z28.h, z8.h, z3.h[0]\n"
+                        "ld1rqh z6.h, p6/z, [a_ptr2]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z7.h, p6/z, [a_ptr3]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z25.h, z9.h, z2.h[0]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z29.h, z9.h, z3.h[0]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "fmla z26.h, z10.h, z2.h[0]\n"
+                        "fmla z30.h, z10.h, z3.h[0]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "fmla z27.h, z11.h, z2.h[0]\n"
+                        "fmla z31.h, z11.h, z3.h[0]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "fmla z24.h, z12.h, z2.h[1]\n"
+                        "fmla z28.h, z12.h, z3.h[1]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "fmla z25.h, z13.h, z2.h[1]\n"
+                        "fmla z29.h, z13.h, z3.h[1]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "fmla z26.h, z14.h, z2.h[1]\n"
+                        "fmla z30.h, z14.h, z3.h[1]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "fmla z27.h, z15.h, z2.h[1]\n"
+                        "fmla z31.h, z15.h, z3.h[1]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z24.h, z8.h, z2.h[2]\n"
+                        "fmla z28.h, z8.h, z3.h[2]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "fmla z25.h, z9.h, z2.h[2]\n"
+                        "fmla z29.h, z9.h, z3.h[2]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "fmla z26.h, z10.h, z2.h[2]\n"
+                        "fmla z30.h, z10.h, z3.h[2]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "fmla z27.h, z11.h, z2.h[2]\n"
+                        "fmla z31.h, z11.h, z3.h[2]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "fmla z24.h, z12.h, z2.h[3]\n"
+                        "fmla z28.h, z12.h, z3.h[3]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "fmla z25.h, z13.h, z2.h[3]\n"
+                        "fmla z29.h, z13.h, z3.h[3]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "fmla z26.h, z14.h, z2.h[3]\n"
+                        "fmla z30.h, z14.h, z3.h[3]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "fmla z27.h, z15.h, z2.h[3]\n"
+                        "fmla z31.h, z15.h, z3.h[3]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "fmla z24.h, z8.h, z2.h[4]\n"
+                        "fmla z28.h, z8.h, z3.h[4]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "fmla z25.h, z9.h, z2.h[4]\n"
+                        "fmla z29.h, z9.h, z3.h[4]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "fmla z26.h, z10.h, z2.h[4]\n"
+                        "fmla z30.h, z10.h, z3.h[4]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "fmla z27.h, z11.h, z2.h[4]\n"
+                        "fmla z31.h, z11.h, z3.h[4]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "fmla z24.h, z12.h, z2.h[5]\n"
+                        "fmla z28.h, z12.h, z3.h[5]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "fmla z25.h, z13.h, z2.h[5]\n"
+                        "fmla z29.h, z13.h, z3.h[5]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "fmla z26.h, z14.h, z2.h[5]\n"
+                        "fmla z30.h, z14.h, z3.h[5]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "fmla z27.h, z15.h, z2.h[5]\n"
+                        "fmla z31.h, z15.h, z3.h[5]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z24.h, z8.h, z2.h[6]\n"
+                        "fmla z28.h, z8.h, z3.h[6]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z25.h, z9.h, z2.h[6]\n"
+                        "fmla z29.h, z9.h, z3.h[6]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z26.h, z10.h, z2.h[6]\n"
+                        "fmla z30.h, z10.h, z3.h[6]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "fmla z27.h, z11.h, z2.h[6]\n"
+                        "fmla z31.h, z11.h, z3.h[6]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "fmla z20.h, z12.h, z1.h[7]\n"
+                        "fmla z24.h, z12.h, z2.h[7]\n"
+                        "fmla z28.h, z12.h, z3.h[7]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z21.h, z13.h, z1.h[7]\n"
+                        "fmla z25.h, z13.h, z2.h[7]\n"
+                        "fmla z29.h, z13.h, z3.h[7]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z22.h, z14.h, z1.h[7]\n"
+                        "fmla z26.h, z14.h, z2.h[7]\n"
+                        "fmla z30.h, z14.h, z3.h[7]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "fmla z23.h, z15.h, z1.h[7]\n"
+                        "fmla z27.h, z15.h, z2.h[7]\n"
+                        "fmla z31.h, z15.h, z3.h[7]\n"
+                        "cbz %[blocks], 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "fmla z20.h, z8.h, z5.h[0]\n"
+                        "fmla z24.h, z8.h, z6.h[0]\n"
+                        "fmla z28.h, z8.h, z7.h[0]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "fmla z21.h, z9.h, z5.h[0]\n"
+                        "fmla z25.h, z9.h, z6.h[0]\n"
+                        "fmla z29.h, z9.h, z7.h[0]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z22.h, z10.h, z5.h[0]\n"
+                        "fmla z26.h, z10.h, z6.h[0]\n"
+                        "fmla z30.h, z10.h, z7.h[0]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "fmla z23.h, z11.h, z5.h[0]\n"
+                        "fmla z27.h, z11.h, z6.h[0]\n"
+                        "fmla z31.h, z11.h, z7.h[0]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "fmla z20.h, z12.h, z5.h[1]\n"
+                        "fmla z24.h, z12.h, z6.h[1]\n"
+                        "fmla z28.h, z12.h, z7.h[1]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z21.h, z13.h, z5.h[1]\n"
+                        "fmla z25.h, z13.h, z6.h[1]\n"
+                        "fmla z29.h, z13.h, z7.h[1]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z22.h, z14.h, z5.h[1]\n"
+                        "fmla z26.h, z14.h, z6.h[1]\n"
+                        "fmla z30.h, z14.h, z7.h[1]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "fmla z23.h, z15.h, z5.h[1]\n"
+                        "fmla z27.h, z15.h, z6.h[1]\n"
+                        "fmla z31.h, z15.h, z7.h[1]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "fmla z20.h, z8.h, z5.h[2]\n"
+                        "fmla z24.h, z8.h, z6.h[2]\n"
+                        "fmla z28.h, z8.h, z7.h[2]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "fmla z21.h, z9.h, z5.h[2]\n"
+                        "fmla z25.h, z9.h, z6.h[2]\n"
+                        "fmla z29.h, z9.h, z7.h[2]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "fmla z22.h, z10.h, z5.h[2]\n"
+                        "fmla z26.h, z10.h, z6.h[2]\n"
+                        "fmla z30.h, z10.h, z7.h[2]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "fmla z23.h, z11.h, z5.h[2]\n"
+                        "fmla z27.h, z11.h, z6.h[2]\n"
+                        "fmla z31.h, z11.h, z7.h[2]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "fmla z20.h, z12.h, z5.h[3]\n"
+                        "fmla z24.h, z12.h, z6.h[3]\n"
+                        "fmla z28.h, z12.h, z7.h[3]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z21.h, z13.h, z5.h[3]\n"
+                        "fmla z25.h, z13.h, z6.h[3]\n"
+                        "fmla z29.h, z13.h, z7.h[3]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z22.h, z14.h, z5.h[3]\n"
+                        "fmla z26.h, z14.h, z6.h[3]\n"
+                        "fmla z30.h, z14.h, z7.h[3]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "fmla z23.h, z15.h, z5.h[3]\n"
+                        "fmla z27.h, z15.h, z6.h[3]\n"
+                        "fmla z31.h, z15.h, z7.h[3]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "fmla z20.h, z8.h, z5.h[4]\n"
+                        "fmla z24.h, z8.h, z6.h[4]\n"
+                        "fmla z28.h, z8.h, z7.h[4]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z21.h, z9.h, z5.h[4]\n"
+                        "fmla z25.h, z9.h, z6.h[4]\n"
+                        "fmla z29.h, z9.h, z7.h[4]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z22.h, z10.h, z5.h[4]\n"
+                        "fmla z26.h, z10.h, z6.h[4]\n"
+                        "fmla z30.h, z10.h, z7.h[4]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "fmla z23.h, z11.h, z5.h[4]\n"
+                        "fmla z27.h, z11.h, z6.h[4]\n"
+                        "fmla z31.h, z11.h, z7.h[4]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "fmla z20.h, z12.h, z5.h[5]\n"
+                        "fmla z24.h, z12.h, z6.h[5]\n"
+                        "fmla z28.h, z12.h, z7.h[5]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z21.h, z13.h, z5.h[5]\n"
+                        "fmla z25.h, z13.h, z6.h[5]\n"
+                        "fmla z29.h, z13.h, z7.h[5]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z22.h, z14.h, z5.h[5]\n"
+                        "fmla z26.h, z14.h, z6.h[5]\n"
+                        "fmla z30.h, z14.h, z7.h[5]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "fmla z23.h, z15.h, z5.h[5]\n"
+                        "fmla z27.h, z15.h, z6.h[5]\n"
+                        "fmla z31.h, z15.h, z7.h[5]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "fmla z20.h, z8.h, z5.h[6]\n"
+                        "fmla z24.h, z8.h, z6.h[6]\n"
+                        "fmla z28.h, z8.h, z7.h[6]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z21.h, z9.h, z5.h[6]\n"
+                        "fmla z25.h, z9.h, z6.h[6]\n"
+                        "fmla z29.h, z9.h, z7.h[6]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z22.h, z10.h, z5.h[6]\n"
+                        "fmla z26.h, z10.h, z6.h[6]\n"
+                        "fmla z30.h, z10.h, z7.h[6]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "fmla z23.h, z11.h, z5.h[6]\n"
+                        "fmla z27.h, z11.h, z6.h[6]\n"
+                        "fmla z31.h, z11.h, z7.h[6]\n"
+                        "5:\n"
+                        "st1h z16.h, p0, [%[c_ptr0]]\n"
+                        "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        "st1h z20.h, p0, [c_ptr1]\n"
+                        "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n"
+                        "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n"
+                        "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n"
+                        "st1h z24.h, p0, [c_ptr2]\n"
+                        "st1h z25.h, p1, [c_ptr2, #1, MUL VL]\n"
+                        "st1h z26.h, p2, [c_ptr2, #2, MUL VL]\n"
+                        "st1h z27.h, p3, [c_ptr2, #3, MUL VL]\n"
+                        "st1h z28.h, p0, [c_ptr3]\n"
+                        "st1h z29.h, p1, [c_ptr3, #1, MUL VL]\n"
+                        "st1h z30.h, p2, [c_ptr3, #2, MUL VL]\n"
+                        "st1h z31.h, p3, [c_ptr3, #3, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq a_ptr3\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        ".unreq c_ptr3\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+                    );
+                    break;
+            }
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
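
(Orientation note, not part of the patch: the inline-assembly block above is the native SVE FP16 "4VLx4" GEMM micro-kernel. Registers z16-z31 hold a 4x4 tile of accumulators — four A rows by four vector-lengths of C columns — z8-z15 stage rows of B loaded with predicated ld1h through p0-p3, and ld1rqh pulls eight half-precision A elements whose lanes are consumed one at a time via the fmla indexed-element form z.h[k]. The sketch below is a minimal scalar reference for what one kernel invocation computes, assuming row-major operands and ignoring the beta pre-scaling and predicated tail handling the assembly performs; all names are illustrative, and float stands in for the kernel's __fp16 data.)

#include <cstddef>

// Scalar reference for the 4VLx4 micro-kernel's core loop (illustrative only):
// C[i][j] += sum_k A[i][k] * B[k][j] for a 4-row strip of A and up to
// 4*VL columns of B/C, where VL is the number of halfwords per SVE vector.
static void gemm_4xW_reference(const float *A, std::size_t lda,
                               const float *B, std::size_t ldb,
                               float *C, std::size_t ldc,
                               std::size_t K, std::size_t W)
{
    for (std::size_t i = 0; i < 4; ++i)          // four accumulator banks (z16/z20/z24/z28, ...)
    {
        for (std::size_t j = 0; j < W; ++j)      // up to four vectors' worth of C columns
        {
            float acc = C[i * ldc + j];          // accumulators start from (beta-scaled) C
            for (std::size_t k = 0; k < K; ++k)  // one fmla per (k, vector) pair in the asm
            {
                acc += A[i * lda + k] * B[k * ldb + j];
            }
            C[i * ldc + j] = acc;
        }
    }
}

(The assembly unrolls k by 8 — the eight z.h[0..7] lanes of each ld1rqh load — and the %[blocks]/%[leftovers] branches above cover the K values that do not divide evenly into that unroll.)
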
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp
index 9c02d95..abee1bb 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -86,63 +86,73 @@
                         "mov z19.s, #0\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "b 2f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
                         "1:\n"
                         "ld1rw z15.s, p7/z, [%[betaptr]]\n"
                         "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
                         "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
                         "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
                         "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                         "mul z16.s, p7/m, z16.s, z15.s\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                         "mul z17.s, p7/m, z17.s, z15.s\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "mul z18.s, p7/m, z18.s, z15.s\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "mul z19.s, p7/m, z19.s, z15.s\n"
-                        "2:\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "cbz %[loops], 3f\n"
-                        "4:\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
                         "zip1 z10.b, z10.b, z8.b\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
                         "sdot z16.s, z8.b, z0.b[0]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
                         "zip2 z13.b, z13.b, z14.b\n"
-                        "subs %[loops], %[loops], #0x1\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
@@ -150,137 +160,137 @@
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z17.s, z13.b, z0.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z15.b, z0.b[1]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z10.b, z10.b, z8.b\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z16.s, z8.b, z0.b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z17.s, z9.b, z0.b[2]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z11.b, z0.b[2]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z16.s, z12.b, z0.b[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z17.s, z13.b, z0.b[3]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z15.b, z0.b[3]\n"
                         "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z10.b, z10.b, z8.b\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z16.s, z8.b, z4.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z17.s, z9.b, z4.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "sdot z18.s, z10.b, z4.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "sdot z19.s, z11.b, z4.b[0]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z16.s, z12.b, z4.b[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z17.s, z13.b, z4.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z4.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z15.b, z4.b[1]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z10.b, z10.b, z8.b\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z16.s, z8.b, z4.b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z17.s, z9.b, z4.b[2]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z4.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z11.b, z4.b[2]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z16.s, z12.b, z4.b[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z17.s, z13.b, z4.b[3]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z4.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z15.b, z4.b[3]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "b.ne 4b\n"
-                        "3:\n"
                         "zip2 z12.b, z10.b, z8.b\n"
+                        "b.ne 3b\n"
+                        "2:\n"
                         "zip1 z10.b, z10.b, z8.b\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
@@ -289,118 +299,118 @@
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
-                        "cbz %[regs], 5f\n"
+                        "cbz %[regs], 4f\n"
                         "sdot z16.s, z8.b, z0.b[0]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "sdot z17.s, z9.b, z0.b[0]\n"
                         "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
                         "sdot z18.s, z10.b, z0.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z14.b, z15.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z16.s, z12.b, z0.b[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z17.s, z13.b, z0.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z15.b, z0.b[1]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z10.b, z10.b, z8.b\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z16.s, z8.b, z0.b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z17.s, z9.b, z0.b[2]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z11.b, z0.b[2]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z16.s, z12.b, z0.b[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z17.s, z13.b, z0.b[3]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z15.b, z0.b[3]\n"
                         "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z10.b, z10.b, z8.b\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z16.s, z8.b, z4.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z17.s, z9.b, z4.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z4.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z11.b, z4.b[0]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z16.s, z12.b, z4.b[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z17.s, z13.b, z4.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z4.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z15.b, z4.b[1]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
@@ -427,15 +437,15 @@
                         "sdot z17.s, z13.b, z4.b[3]\n"
                         "sdot z18.s, z14.b, z4.b[3]\n"
                         "sdot z19.s, z15.b, z4.b[3]\n"
-                        "cbz %[blocks], 6f\n"
+                        "cbz %[blocks], 5f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -449,15 +459,15 @@
                         "sdot z17.s, z9.b, z0.b[0]\n"
                         "sdot z18.s, z10.b, z0.b[0]\n"
                         "sdot z19.s, z11.b, z0.b[0]\n"
-                        "b.eq 7f\n"
+                        "b.eq 6f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -471,13 +481,13 @@
                         "sdot z17.s, z13.b, z0.b[1]\n"
                         "sdot z18.s, z14.b, z0.b[1]\n"
                         "sdot z19.s, z15.b, z0.b[1]\n"
-                        "b.eq 8f\n"
+                        "b.eq 7f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
@@ -492,31 +502,31 @@
                         "sdot z17.s, z9.b, z0.b[2]\n"
                         "sdot z18.s, z10.b, z0.b[2]\n"
                         "sdot z19.s, z11.b, z0.b[2]\n"
-                        "cbz %[odds], 9f\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 9f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 10f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 11f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 12f\n"
-                        "11:\n"
+                        "b 11f\n"
+                        "10:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 12f\n"
-                        "10:\n"
+                        "b 11f\n"
+                        "9:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "12:\n"
+                        "11:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -530,33 +540,33 @@
                         "sdot z17.s, z13.b, z0.b[3]\n"
                         "sdot z18.s, z14.b, z0.b[3]\n"
                         "sdot z19.s, z15.b, z0.b[3]\n"
-                        "b 9f\n"
-                        "8:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "7:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 12f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 13f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 14f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 15f\n"
-                        "14:\n"
+                        "b 14f\n"
+                        "13:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 15f\n"
-                        "13:\n"
+                        "b 14f\n"
+                        "12:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "15:\n"
+                        "14:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -570,33 +580,33 @@
                         "sdot z17.s, z9.b, z0.b[2]\n"
                         "sdot z18.s, z10.b, z0.b[2]\n"
                         "sdot z19.s, z11.b, z0.b[2]\n"
-                        "b 9f\n"
-                        "7:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "6:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 15f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 16f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 17f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 18f\n"
-                        "17:\n"
+                        "b 17f\n"
+                        "16:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 18f\n"
-                        "16:\n"
+                        "b 17f\n"
+                        "15:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "18:\n"
+                        "17:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -610,33 +620,33 @@
                         "sdot z17.s, z13.b, z0.b[1]\n"
                         "sdot z18.s, z14.b, z0.b[1]\n"
                         "sdot z19.s, z15.b, z0.b[1]\n"
-                        "b 9f\n"
-                        "6:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "5:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 18f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 19f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 20f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 21f\n"
-                        "20:\n"
+                        "b 20f\n"
+                        "19:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 21f\n"
-                        "19:\n"
+                        "b 20f\n"
+                        "18:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "21:\n"
+                        "20:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -650,38 +660,38 @@
                         "sdot z17.s, z9.b, z0.b[0]\n"
                         "sdot z18.s, z10.b, z0.b[0]\n"
                         "sdot z19.s, z11.b, z0.b[0]\n"
-                        "b 9f\n"
-                        "5:\n"
+                        "b 8f\n"
+                        "4:\n"
                         "sdot z16.s, z8.b, z0.b[0]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "sdot z17.s, z9.b, z0.b[0]\n"
                         "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
                         "sdot z18.s, z10.b, z0.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z14.b, z15.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z16.s, z12.b, z0.b[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z17.s, z13.b, z0.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z15.b, z0.b[1]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
@@ -708,15 +718,15 @@
                         "sdot z17.s, z13.b, z0.b[3]\n"
                         "sdot z18.s, z14.b, z0.b[3]\n"
                         "sdot z19.s, z15.b, z0.b[3]\n"
-                        "cbz %[blocks], 22f\n"
+                        "cbz %[blocks], 21f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -730,15 +740,15 @@
                         "sdot z17.s, z9.b, z4.b[0]\n"
                         "sdot z18.s, z10.b, z4.b[0]\n"
                         "sdot z19.s, z11.b, z4.b[0]\n"
-                        "b.eq 23f\n"
+                        "b.eq 22f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -752,13 +762,13 @@
                         "sdot z17.s, z13.b, z4.b[1]\n"
                         "sdot z18.s, z14.b, z4.b[1]\n"
                         "sdot z19.s, z15.b, z4.b[1]\n"
-                        "b.eq 24f\n"
+                        "b.eq 23f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
@@ -773,31 +783,31 @@
                         "sdot z17.s, z9.b, z4.b[2]\n"
                         "sdot z18.s, z10.b, z4.b[2]\n"
                         "sdot z19.s, z11.b, z4.b[2]\n"
-                        "cbz %[odds], 9f\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 24f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 25f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 26f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 27f\n"
-                        "26:\n"
+                        "b 26f\n"
+                        "25:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 27f\n"
-                        "25:\n"
+                        "b 26f\n"
+                        "24:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "27:\n"
+                        "26:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -811,33 +821,33 @@
                         "sdot z17.s, z13.b, z4.b[3]\n"
                         "sdot z18.s, z14.b, z4.b[3]\n"
                         "sdot z19.s, z15.b, z4.b[3]\n"
-                        "b 9f\n"
-                        "24:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "23:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 27f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 28f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 29f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 30f\n"
-                        "29:\n"
+                        "b 29f\n"
+                        "28:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 30f\n"
-                        "28:\n"
+                        "b 29f\n"
+                        "27:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "30:\n"
+                        "29:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -851,33 +861,33 @@
                         "sdot z17.s, z9.b, z4.b[2]\n"
                         "sdot z18.s, z10.b, z4.b[2]\n"
                         "sdot z19.s, z11.b, z4.b[2]\n"
-                        "b 9f\n"
-                        "23:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "22:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 30f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 31f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 32f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 33f\n"
-                        "32:\n"
+                        "b 32f\n"
+                        "31:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 33f\n"
-                        "31:\n"
+                        "b 32f\n"
+                        "30:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "33:\n"
+                        "32:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -891,33 +901,33 @@
                         "sdot z17.s, z13.b, z4.b[1]\n"
                         "sdot z18.s, z14.b, z4.b[1]\n"
                         "sdot z19.s, z15.b, z4.b[1]\n"
-                        "b 9f\n"
-                        "22:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "21:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 33f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 34f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 35f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 36f\n"
-                        "35:\n"
+                        "b 35f\n"
+                        "34:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 36f\n"
-                        "34:\n"
+                        "b 35f\n"
+                        "33:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "36:\n"
+                        "35:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -931,14 +941,14 @@
                         "sdot z17.s, z9.b, z4.b[0]\n"
                         "sdot z18.s, z10.b, z4.b[0]\n"
                         "sdot z19.s, z11.b, z4.b[0]\n"
-                        "9:\n"
+                        "8:\n"
                         "st1w z16.s, p0, [%[c_ptr0]]\n"
                         "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
                         "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
                         "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
                         "addvl %[c_ptr0], %[c_ptr0], #4\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
                     );
                     break;
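The hunks above (and below) are largely pure instruction scheduling on the SVE SDOT GEMM inner loops: the `add %[b_ptrN], %[b_ptrN], %[ldb]` pointer increments are interleaved between dependent `ld1b` loads and `sdot` accumulates to hide address-generation latency, and the local numeric branch labels are renumbered to match the restructured control flow; the computed results are unchanged. As a minimal sketch (not part of the patch; a hypothetical helper, and simplified in that it ignores SVE's per-128-bit-segment index selection), the per-lane arithmetic behind one `sdot z16.s, z8.b, z0.b[0]` is:

#include <cstdint>
#include <cstddef>

// Scalar model of an SVE indexed SDOT: each 32-bit lane of the accumulator
// adds the dot product of four signed bytes from the B-panel vector with a
// broadcast group of four signed bytes taken from the A vector (index [0]).
// acc   : 32-bit accumulators, one per vector lane (z16.s)
// b     : four signed bytes per lane from the interleaved B panel (z8.b)
// a4    : the selected 4-byte group from the A vector (z0.b[0])
// lanes : number of 32-bit lanes in the vector (VL-dependent)
static void sdot_lane_model(int32_t *acc, const int8_t *b, const int8_t *a4,
                            size_t lanes)
{
    for(size_t l = 0; l < lanes; ++l)
    {
        int32_t sum = 0;
        for(int k = 0; k < 4; ++k)
        {
            // signed 8-bit x 8-bit multiply, widened and summed into 32 bits
            sum += int32_t(b[4 * l + k]) * int32_t(a4[k]);
        }
        acc[l] += sum; // accumulate into the 32-bit lane, as sdot does
    }
}

The surrounding `zip1`/`zip2` pairs exist to interleave four B-panel rows into the 4-byte-per-lane layout this instruction consumes, which is why every reordered load is immediately followed by a zip chain before the next block of `sdot`s.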
@@ -971,103 +981,108 @@
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "mov z21.s, #0\n"
                         "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "mov z22.s, #0\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
                         "zip2 z11.b, z8.b, z9.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "mov z22.s, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z23.s, #0\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 2f\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
                         "1:\n"
                         "ld1rw z15.s, p7/z, [%[betaptr]]\n"
                         "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
                         "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
                         "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
                         "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1w z20.s, p0/z, [c_ptr1]\n"
                         "mul z16.s, p7/m, z16.s, z15.s\n"
-                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "ld1w z20.s, p0/z, [c_ptr1]\n"
                         "mul z17.s, p7/m, z17.s, z15.s\n"
-                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
                         "mul z18.s, p7/m, z18.s, z15.s\n"
-                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
                         "mul z19.s, p7/m, z19.s, z15.s\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
                         "mul z20.s, p7/m, z20.s, z15.s\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                         "mul z21.s, p7/m, z21.s, z15.s\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                         "mul z22.s, p7/m, z22.s, z15.s\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "mul z23.s, p7/m, z23.s, z15.s\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "add %[a_ptr0], %[a_ptr0], #0x10\n"
                         "add a_ptr1, a_ptr1, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip1 z10.b, z10.b, z8.b\n"
-                        "2:\n"
-                        "cbz %[loops], 3f\n"
-                        "4:\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
                         "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "sdot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "subs %[loops], %[loops], #0x1\n"
                         "sdot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z10.b, z0.b[0]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
                         "zip1 z14.b, z15.b, z8.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z11.b, z0.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z23.s, z11.b, z1.b[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
                         "sdot z16.s, z12.b, z0.b[1]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "sdot z20.s, z12.b, z1.b[1]\n"
-                        "subs %[loops], %[loops], #0x1\n"
                         "zip2 z12.b, z10.b, z8.b\n"
                         "zip1 z10.b, z10.b, z8.b\n"
                         "sdot z17.s, z13.b, z0.b[1]\n"
@@ -1092,148 +1107,148 @@
                         "sdot z16.s, z8.b, z0.b[2]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "sdot z20.s, z8.b, z1.b[2]\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
                         "zip1 z14.b, z14.b, z12.b\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
                         "sdot z21.s, z9.b, z1.b[2]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z10.b, z0.b[2]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "sdot z22.s, z10.b, z1.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z11.b, z0.b[2]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z23.s, z11.b, z1.b[2]\n"
-                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
                         "sdot z17.s, z13.b, z0.b[3]\n"
                         "sdot z21.s, z13.b, z1.b[3]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
                         "sdot z18.s, z14.b, z0.b[3]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "sdot z22.s, z14.b, z1.b[3]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z15.b, z0.b[3]\n"
                         "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
                         "sdot z23.s, z15.b, z1.b[3]\n"
                         "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z16.s, z8.b, z4.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z20.s, z8.b, z5.b[0]\n"
-                        "sdot z17.s, z9.b, z4.b[0]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z20.s, z8.b, z5.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
                         "sdot z21.s, z9.b, z5.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z10.b, z4.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z22.s, z10.b, z5.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z19.s, z11.b, z4.b[0]\n"
-                        "sdot z23.s, z11.b, z5.b[0]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z16.s, z12.b, z4.b[1]\n"
-                        "sdot z20.s, z12.b, z5.b[1]\n"
-                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "sdot z23.s, z11.b, z5.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z12.b, z5.b[1]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
                         "sdot z21.s, z13.b, z5.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z14.b, z4.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z22.s, z14.b, z5.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z19.s, z15.b, z4.b[1]\n"
-                        "sdot z23.s, z15.b, z5.b[1]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z22.s, z14.b, z5.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z16.s, z8.b, z4.b[2]\n"
-                        "sdot z20.s, z8.b, z5.b[2]\n"
-                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "sdot z23.s, z15.b, z5.b[1]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z8.b, z5.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
                         "sdot z21.s, z9.b, z5.b[2]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z10.b, z4.b[2]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z22.s, z10.b, z5.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z19.s, z11.b, z4.b[2]\n"
-                        "sdot z23.s, z11.b, z5.b[2]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z22.s, z10.b, z5.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z16.s, z12.b, z4.b[3]\n"
-                        "sdot z20.s, z12.b, z5.b[3]\n"
-                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "sdot z23.s, z11.b, z5.b[2]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z12.b, z5.b[3]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
                         "sdot z21.s, z13.b, z5.b[3]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z14.b, z4.b[3]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z22.s, z14.b, z5.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "sdot z19.s, z15.b, z4.b[3]\n"
-                        "sdot z23.s, z15.b, z5.b[3]\n"
-                        "b.ne 4b\n"
-                        "3:\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z22.s, z14.b, z5.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
+                        "sdot z23.s, z15.b, z5.b[3]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
-                        "cbz %[regs], 5f\n"
+                        "cbz %[regs], 4f\n"
                         "sdot z16.s, z8.b, z0.b[0]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "sdot z20.s, z8.b, z1.b[0]\n"
@@ -1245,13 +1260,13 @@
                         "zip2 z8.b, z14.b, z12.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z14.b, z15.b, z8.b\n"
@@ -1261,142 +1276,142 @@
                         "sdot z19.s, z11.b, z0.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z23.s, z11.b, z1.b[0]\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
                         "sdot z17.s, z13.b, z0.b[1]\n"
                         "sdot z21.s, z13.b, z1.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
                         "sdot z18.s, z14.b, z0.b[1]\n"
-                        "sdot z22.s, z14.b, z1.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
                         "sdot z21.s, z9.b, z1.b[2]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z10.b, z0.b[2]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "sdot z22.s, z10.b, z1.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z23.s, z11.b, z1.b[2]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z16.s, z12.b, z0.b[3]\n"
-                        "sdot z20.s, z12.b, z1.b[3]\n"
-                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
                         "sdot z21.s, z13.b, z1.b[3]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z14.b, z0.b[3]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "sdot z22.s, z14.b, z1.b[3]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "sdot z19.s, z15.b, z0.b[3]\n"
                         "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
                         "sdot z23.s, z15.b, z1.b[3]\n"
                         "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z16.s, z8.b, z4.b[0]\n"
-                        "sdot z20.s, z8.b, z5.b[0]\n"
-                        "sdot z17.s, z9.b, z4.b[0]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z20.s, z8.b, z5.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
                         "sdot z21.s, z9.b, z5.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z10.b, z4.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z22.s, z10.b, z5.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z19.s, z11.b, z4.b[0]\n"
-                        "sdot z23.s, z11.b, z5.b[0]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z16.s, z12.b, z4.b[1]\n"
-                        "sdot z20.s, z12.b, z5.b[1]\n"
-                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "sdot z23.s, z11.b, z5.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z12.b, z5.b[1]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
                         "sdot z21.s, z13.b, z5.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z14.b, z4.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z22.s, z14.b, z5.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "sdot z19.s, z15.b, z4.b[1]\n"
-                        "sdot z23.s, z15.b, z5.b[1]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z16.s, z8.b, z4.b[2]\n"
-                        "sdot z20.s, z8.b, z5.b[2]\n"
-                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "sdot z22.s, z14.b, z5.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "sdot z23.s, z15.b, z5.b[1]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "sdot z20.s, z8.b, z5.b[2]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
                         "sdot z21.s, z9.b, z5.b[2]\n"
                         "sdot z18.s, z10.b, z4.b[2]\n"
                         "sdot z22.s, z10.b, z5.b[2]\n"
-                        "sdot z19.s, z11.b, z4.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "sdot z23.s, z11.b, z5.b[2]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "sdot z23.s, z11.b, z5.b[2]\n"
                         "sdot z16.s, z12.b, z4.b[3]\n"
                         "sdot z20.s, z12.b, z5.b[3]\n"
                         "sdot z17.s, z13.b, z4.b[3]\n"
@@ -1405,15 +1420,15 @@
                         "sdot z22.s, z14.b, z5.b[3]\n"
                         "sdot z19.s, z15.b, z4.b[3]\n"
                         "sdot z23.s, z15.b, z5.b[3]\n"
-                        "cbz %[blocks], 6f\n"
+                        "cbz %[blocks], 5f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -1431,15 +1446,15 @@
                         "sdot z22.s, z10.b, z1.b[0]\n"
                         "sdot z19.s, z11.b, z0.b[0]\n"
                         "sdot z23.s, z11.b, z1.b[0]\n"
-                        "b.eq 7f\n"
+                        "b.eq 6f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -1457,13 +1472,13 @@
                         "sdot z22.s, z14.b, z1.b[1]\n"
                         "sdot z19.s, z15.b, z0.b[1]\n"
                         "sdot z23.s, z15.b, z1.b[1]\n"
-                        "b.eq 8f\n"
+                        "b.eq 7f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
@@ -1482,31 +1497,31 @@
                         "sdot z22.s, z10.b, z1.b[2]\n"
                         "sdot z19.s, z11.b, z0.b[2]\n"
                         "sdot z23.s, z11.b, z1.b[2]\n"
-                        "cbz %[odds], 9f\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 9f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 10f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 11f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 12f\n"
-                        "11:\n"
+                        "b 11f\n"
+                        "10:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 12f\n"
-                        "10:\n"
+                        "b 11f\n"
+                        "9:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "12:\n"
+                        "11:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -1524,33 +1539,33 @@
                         "sdot z22.s, z14.b, z1.b[3]\n"
                         "sdot z19.s, z15.b, z0.b[3]\n"
                         "sdot z23.s, z15.b, z1.b[3]\n"
-                        "b 9f\n"
-                        "8:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "7:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 12f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 13f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 14f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 15f\n"
-                        "14:\n"
+                        "b 14f\n"
+                        "13:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 15f\n"
-                        "13:\n"
+                        "b 14f\n"
+                        "12:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "15:\n"
+                        "14:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -1568,33 +1583,33 @@
                         "sdot z22.s, z10.b, z1.b[2]\n"
                         "sdot z19.s, z11.b, z0.b[2]\n"
                         "sdot z23.s, z11.b, z1.b[2]\n"
-                        "b 9f\n"
-                        "7:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "6:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 15f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 16f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 17f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 18f\n"
-                        "17:\n"
+                        "b 17f\n"
+                        "16:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 18f\n"
-                        "16:\n"
+                        "b 17f\n"
+                        "15:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "18:\n"
+                        "17:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -1612,33 +1627,33 @@
                         "sdot z22.s, z14.b, z1.b[1]\n"
                         "sdot z19.s, z15.b, z0.b[1]\n"
                         "sdot z23.s, z15.b, z1.b[1]\n"
-                        "b 9f\n"
-                        "6:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "5:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 18f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 19f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 20f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 21f\n"
-                        "20:\n"
+                        "b 20f\n"
+                        "19:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 21f\n"
-                        "19:\n"
+                        "b 20f\n"
+                        "18:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "21:\n"
+                        "20:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -1656,8 +1671,8 @@
                         "sdot z22.s, z10.b, z1.b[0]\n"
                         "sdot z19.s, z11.b, z0.b[0]\n"
                         "sdot z23.s, z11.b, z1.b[0]\n"
-                        "b 9f\n"
-                        "5:\n"
+                        "b 8f\n"
+                        "4:\n"
                         "sdot z16.s, z8.b, z0.b[0]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "sdot z20.s, z8.b, z1.b[0]\n"
@@ -1669,13 +1684,13 @@
                         "zip2 z8.b, z14.b, z12.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z14.b, z15.b, z8.b\n"
@@ -1685,44 +1700,44 @@
                         "sdot z19.s, z11.b, z0.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z23.s, z11.b, z1.b[0]\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
                         "sdot z17.s, z13.b, z0.b[1]\n"
                         "sdot z21.s, z13.b, z1.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
                         "sdot z18.s, z14.b, z0.b[1]\n"
-                        "sdot z22.s, z14.b, z1.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "sdot z23.s, z15.b, z1.b[1]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "sdot z20.s, z8.b, z1.b[2]\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
                         "sdot z21.s, z9.b, z1.b[2]\n"
                         "sdot z18.s, z10.b, z0.b[2]\n"
                         "sdot z22.s, z10.b, z1.b[2]\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "sdot z23.s, z11.b, z1.b[2]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
                         "sdot z16.s, z12.b, z0.b[3]\n"
                         "sdot z20.s, z12.b, z1.b[3]\n"
                         "sdot z17.s, z13.b, z0.b[3]\n"
@@ -1731,15 +1746,15 @@
                         "sdot z22.s, z14.b, z1.b[3]\n"
                         "sdot z19.s, z15.b, z0.b[3]\n"
                         "sdot z23.s, z15.b, z1.b[3]\n"
-                        "cbz %[blocks], 22f\n"
+                        "cbz %[blocks], 21f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -1757,15 +1772,15 @@
                         "sdot z22.s, z10.b, z5.b[0]\n"
                         "sdot z19.s, z11.b, z4.b[0]\n"
                         "sdot z23.s, z11.b, z5.b[0]\n"
-                        "b.eq 23f\n"
+                        "b.eq 22f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -1783,13 +1798,13 @@
                         "sdot z22.s, z14.b, z5.b[1]\n"
                         "sdot z19.s, z15.b, z4.b[1]\n"
                         "sdot z23.s, z15.b, z5.b[1]\n"
-                        "b.eq 24f\n"
+                        "b.eq 23f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
@@ -1808,31 +1823,31 @@
                         "sdot z22.s, z10.b, z5.b[2]\n"
                         "sdot z19.s, z11.b, z4.b[2]\n"
                         "sdot z23.s, z11.b, z5.b[2]\n"
-                        "cbz %[odds], 9f\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 24f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 25f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 26f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 27f\n"
-                        "26:\n"
+                        "b 26f\n"
+                        "25:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 27f\n"
-                        "25:\n"
+                        "b 26f\n"
+                        "24:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "27:\n"
+                        "26:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -1850,33 +1865,33 @@
                         "sdot z22.s, z14.b, z5.b[3]\n"
                         "sdot z19.s, z15.b, z4.b[3]\n"
                         "sdot z23.s, z15.b, z5.b[3]\n"
-                        "b 9f\n"
-                        "24:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "23:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 27f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 28f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 29f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 30f\n"
-                        "29:\n"
+                        "b 29f\n"
+                        "28:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 30f\n"
-                        "28:\n"
+                        "b 29f\n"
+                        "27:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "30:\n"
+                        "29:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -1894,33 +1909,33 @@
                         "sdot z22.s, z10.b, z5.b[2]\n"
                         "sdot z19.s, z11.b, z4.b[2]\n"
                         "sdot z23.s, z11.b, z5.b[2]\n"
-                        "b 9f\n"
-                        "23:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "22:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 30f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 31f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 32f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 33f\n"
-                        "32:\n"
+                        "b 32f\n"
+                        "31:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 33f\n"
-                        "31:\n"
+                        "b 32f\n"
+                        "30:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "33:\n"
+                        "32:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -1938,33 +1953,33 @@
                         "sdot z22.s, z14.b, z5.b[1]\n"
                         "sdot z19.s, z15.b, z4.b[1]\n"
                         "sdot z23.s, z15.b, z5.b[1]\n"
-                        "b 9f\n"
-                        "22:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "21:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 33f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 34f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 35f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 36f\n"
-                        "35:\n"
+                        "b 35f\n"
+                        "34:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 36f\n"
-                        "34:\n"
+                        "b 35f\n"
+                        "33:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "36:\n"
+                        "35:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -1982,7 +1997,7 @@
                         "sdot z22.s, z10.b, z5.b[0]\n"
                         "sdot z19.s, z11.b, z4.b[0]\n"
                         "sdot z23.s, z11.b, z5.b[0]\n"
-                        "9:\n"
+                        "8:\n"
                         "st1w z16.s, p0, [%[c_ptr0]]\n"
                         "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
                         "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
@@ -1995,7 +2010,7 @@
                         ".unreq a_ptr1\n"
                         ".unreq c_ptr1\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
                     );
                     break;
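The other recurring pattern in the hunks above is a scheduling change, not a functional one: the pointer bumps (`add %[b_ptrN], %[b_ptrN], %[ldb]`) and the trip-count `subs` are moved out of a batch at the top of each block and slotted between the predicated loads and `sdot` accumulates they are independent of, so address arithmetic issues in the shadow of load latency on in-order SVE cores. Schematically (operand names condensed from the hunks above, not literal lines from this file):

    // before: bumps and the count update grouped ahead of the loads
    add  b_ptr1, b_ptr1, ldb
    add  b_ptr3, b_ptr3, ldb
    subs blocks, blocks, #1
    ld1b z8.b,  p4/z, [b_ptr0]
    ld1b z9.b,  p4/z, [b_ptr2]
    ld1b z10.b, p4/z, [b_ptr1]

    // after: each bump issued between independent loads
    ld1b z8.b,  p4/z, [b_ptr0]
    add  b_ptr1, b_ptr1, ldb
    ld1b z9.b,  p4/z, [b_ptr2]
    add  b_ptr3, b_ptr3, ldb
    ld1b z10.b, p4/z, [b_ptr1]
    subs blocks, blocks, #1

The results are bit-identical; only the issue order changes.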
@@ -2007,11 +2022,11 @@
                         "c_ptr2 .req X3\n"
                         "add a_ptr1, %[a_ptr0], %[lda]\n"
                         "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
                         "whilelt p6.b, %[temp], %[leftovers]\n"
                         "whilelt p0.s, %[temp], %[width]\n"
                         "whilelt p4.b, %[temp], %[width]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
                         "incw %[temp], all, mul #1\n"
                         "ptrue p7.b\n"
                         "whilelt p1.s, %[temp], %[width]\n"
@@ -2034,116 +2049,122 @@
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "mov z22.s, #0\n"
                         "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "mov z23.s, #0\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
                         "zip2 z11.b, z8.b, z9.b\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "mov z23.s, #0\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
                         "mov z24.s, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z25.s, #0\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z10.b, z10.b, z8.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "mov z26.s, #0\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "mov z25.s, #0\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "mov z26.s, #0\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "mov z27.s, #0\n"
-                        "b 2f\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
                         "1:\n"
                         "ld1rw z15.s, p7/z, [%[betaptr]]\n"
                         "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
                         "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
                         "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
                         "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1w z20.s, p0/z, [c_ptr1]\n"
                         "mul z16.s, p7/m, z16.s, z15.s\n"
-                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "ld1w z20.s, p0/z, [c_ptr1]\n"
                         "mul z17.s, p7/m, z17.s, z15.s\n"
-                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
                         "mul z18.s, p7/m, z18.s, z15.s\n"
-                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
                         "mul z19.s, p7/m, z19.s, z15.s\n"
-                        "ld1w z24.s, p0/z, [c_ptr2]\n"
+                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
                         "mul z20.s, p7/m, z20.s, z15.s\n"
-                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "ld1w z24.s, p0/z, [c_ptr2]\n"
                         "mul z21.s, p7/m, z21.s, z15.s\n"
-                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
                         "mul z22.s, p7/m, z22.s, z15.s\n"
-                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
                         "mul z23.s, p7/m, z23.s, z15.s\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
                         "mul z24.s, p7/m, z24.s, z15.s\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                         "mul z25.s, p7/m, z25.s, z15.s\n"
-                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                         "mul z26.s, p7/m, z26.s, z15.s\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
                         "mul z27.s, p7/m, z27.s, z15.s\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "add a_ptr1, a_ptr1, #0x10\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
                         "zip2 z11.b, z8.b, z9.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z10.b, z10.b, z8.b\n"
-                        "2:\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "cbz %[loops], 3f\n"
-                        "4:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
                         "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "sdot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "sdot z24.s, z8.b, z2.b[0]\n"
-                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
                         "ld1rqb z6.b, p7/z, [a_ptr2]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z24.s, z8.b, z2.b[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
                         "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z25.s, z9.b, z2.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "add %[a_ptr0], %[a_ptr0], #0x20\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "add a_ptr1, a_ptr1, #0x20\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
                         "sdot z26.s, z10.b, z2.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z11.b, z0.b[0]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "sdot z23.s, z11.b, z1.b[0]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
                         "sdot z27.s, z11.b, z2.b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -2157,15 +2178,15 @@
                         "sdot z21.s, z13.b, z1.b[1]\n"
                         "sdot z25.s, z13.b, z2.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
                         "sdot z26.s, z14.b, z2.b[1]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z15.b, z0.b[1]\n"
@@ -2185,15 +2206,15 @@
                         "sdot z21.s, z9.b, z1.b[2]\n"
                         "sdot z25.s, z9.b, z2.b[2]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
                         "sdot z26.s, z10.b, z2.b[2]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z11.b, z0.b[2]\n"
@@ -2213,15 +2234,15 @@
                         "sdot z21.s, z13.b, z1.b[3]\n"
                         "sdot z25.s, z13.b, z2.b[3]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
                         "sdot z26.s, z14.b, z2.b[3]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z15.b, z0.b[3]\n"
@@ -2244,15 +2265,15 @@
                         "sdot z21.s, z9.b, z5.b[0]\n"
                         "sdot z25.s, z9.b, z6.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z4.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
                         "sdot z26.s, z10.b, z6.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z11.b, z4.b[0]\n"
@@ -2272,15 +2293,15 @@
                         "sdot z21.s, z13.b, z5.b[1]\n"
                         "sdot z25.s, z13.b, z6.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z4.b[1]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z14.b, z5.b[1]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z22.s, z14.b, z5.b[1]\n"
                         "sdot z26.s, z14.b, z6.b[1]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z15.b, z4.b[1]\n"
@@ -2300,15 +2321,15 @@
                         "sdot z21.s, z9.b, z5.b[2]\n"
                         "sdot z25.s, z9.b, z6.b[2]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z4.b[2]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z10.b, z5.b[2]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z22.s, z10.b, z5.b[2]\n"
                         "sdot z26.s, z10.b, z6.b[2]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z11.b, z4.b[2]\n"
@@ -2328,23 +2349,23 @@
                         "sdot z21.s, z13.b, z5.b[3]\n"
                         "sdot z25.s, z13.b, z6.b[3]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z4.b[3]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
                         "sdot z22.s, z14.b, z5.b[3]\n"
                         "sdot z26.s, z14.b, z6.b[3]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z15.b, z4.b[3]\n"
                         "sdot z23.s, z15.b, z5.b[3]\n"
                         "sdot z27.s, z15.b, z6.b[3]\n"
-                        "b.ne 4b\n"
-                        "3:\n"
+                        "b.ne 3b\n"
+                        "2:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
-                        "cbz %[regs], 5f\n"
+                        "cbz %[regs], 4f\n"
                         "sdot z16.s, z8.b, z0.b[0]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "sdot z20.s, z8.b, z1.b[0]\n"
@@ -2353,24 +2374,24 @@
                         "ld1rqb z5.b, p7/z, [a_ptr1]\n"
                         "sdot z17.s, z9.b, z0.b[0]\n"
                         "ld1rqb z6.b, p7/z, [a_ptr2]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z25.s, z9.b, z2.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "sdot z26.s, z10.b, z2.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z11.b, z0.b[0]\n"
@@ -2390,15 +2411,15 @@
                         "sdot z21.s, z13.b, z1.b[1]\n"
                         "sdot z25.s, z13.b, z2.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
                         "sdot z26.s, z14.b, z2.b[1]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z15.b, z0.b[1]\n"
@@ -2418,15 +2439,15 @@
                         "sdot z21.s, z9.b, z1.b[2]\n"
                         "sdot z25.s, z9.b, z2.b[2]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
                         "sdot z26.s, z10.b, z2.b[2]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z11.b, z0.b[2]\n"
@@ -2446,15 +2467,15 @@
                         "sdot z21.s, z13.b, z1.b[3]\n"
                         "sdot z25.s, z13.b, z2.b[3]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
                         "sdot z26.s, z14.b, z2.b[3]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z15.b, z0.b[3]\n"
@@ -2477,15 +2498,15 @@
                         "sdot z21.s, z9.b, z5.b[0]\n"
                         "sdot z25.s, z9.b, z6.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z4.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
                         "sdot z26.s, z10.b, z6.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z11.b, z4.b[0]\n"
@@ -2505,12 +2526,12 @@
                         "sdot z21.s, z13.b, z5.b[1]\n"
                         "sdot z25.s, z13.b, z6.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z4.b[1]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
                         "sdot z22.s, z14.b, z5.b[1]\n"
                         "sdot z26.s, z14.b, z6.b[1]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -2523,17 +2544,17 @@
                         "sdot z16.s, z8.b, z4.b[2]\n"
                         "sdot z20.s, z8.b, z5.b[2]\n"
                         "sdot z24.s, z8.b, z6.b[2]\n"
-                        "sdot z17.s, z9.b, z4.b[2]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
                         "zip1 z14.b, z14.b, z12.b\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
                         "sdot z21.s, z9.b, z5.b[2]\n"
                         "sdot z25.s, z9.b, z6.b[2]\n"
-                        "sdot z18.s, z10.b, z4.b[2]\n"
-                        "sdot z22.s, z10.b, z5.b[2]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "sdot z22.s, z10.b, z5.b[2]\n"
                         "sdot z26.s, z10.b, z6.b[2]\n"
                         "sdot z19.s, z11.b, z4.b[2]\n"
                         "sdot z23.s, z11.b, z5.b[2]\n"
@@ -2550,15 +2571,15 @@
                         "sdot z19.s, z15.b, z4.b[3]\n"
                         "sdot z23.s, z15.b, z5.b[3]\n"
                         "sdot z27.s, z15.b, z6.b[3]\n"
-                        "cbz %[blocks], 6f\n"
+                        "cbz %[blocks], 5f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -2580,15 +2601,15 @@
                         "sdot z19.s, z11.b, z0.b[0]\n"
                         "sdot z23.s, z11.b, z1.b[0]\n"
                         "sdot z27.s, z11.b, z2.b[0]\n"
-                        "b.eq 7f\n"
+                        "b.eq 6f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -2610,13 +2631,13 @@
                         "sdot z19.s, z15.b, z0.b[1]\n"
                         "sdot z23.s, z15.b, z1.b[1]\n"
                         "sdot z27.s, z15.b, z2.b[1]\n"
-                        "b.eq 8f\n"
+                        "b.eq 7f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
@@ -2639,31 +2660,31 @@
                         "sdot z19.s, z11.b, z0.b[2]\n"
                         "sdot z23.s, z11.b, z1.b[2]\n"
                         "sdot z27.s, z11.b, z2.b[2]\n"
-                        "cbz %[odds], 9f\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 9f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 10f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 11f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 12f\n"
-                        "11:\n"
+                        "b 11f\n"
+                        "10:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 12f\n"
-                        "10:\n"
+                        "b 11f\n"
+                        "9:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "12:\n"
+                        "11:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -2685,33 +2706,33 @@
                         "sdot z19.s, z15.b, z0.b[3]\n"
                         "sdot z23.s, z15.b, z1.b[3]\n"
                         "sdot z27.s, z15.b, z2.b[3]\n"
-                        "b 9f\n"
-                        "8:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "7:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 12f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 13f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 14f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 15f\n"
-                        "14:\n"
+                        "b 14f\n"
+                        "13:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 15f\n"
-                        "13:\n"
+                        "b 14f\n"
+                        "12:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "15:\n"
+                        "14:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -2733,33 +2754,33 @@
                         "sdot z19.s, z11.b, z0.b[2]\n"
                         "sdot z23.s, z11.b, z1.b[2]\n"
                         "sdot z27.s, z11.b, z2.b[2]\n"
-                        "b 9f\n"
-                        "7:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "6:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 15f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 16f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 17f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 18f\n"
-                        "17:\n"
+                        "b 17f\n"
+                        "16:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 18f\n"
-                        "16:\n"
+                        "b 17f\n"
+                        "15:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "18:\n"
+                        "17:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -2781,33 +2802,33 @@
                         "sdot z19.s, z15.b, z0.b[1]\n"
                         "sdot z23.s, z15.b, z1.b[1]\n"
                         "sdot z27.s, z15.b, z2.b[1]\n"
-                        "b 9f\n"
-                        "6:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "5:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 18f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 19f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 20f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 21f\n"
-                        "20:\n"
+                        "b 20f\n"
+                        "19:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 21f\n"
-                        "19:\n"
+                        "b 20f\n"
+                        "18:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "21:\n"
+                        "20:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -2829,8 +2850,8 @@
                         "sdot z19.s, z11.b, z0.b[0]\n"
                         "sdot z23.s, z11.b, z1.b[0]\n"
                         "sdot z27.s, z11.b, z2.b[0]\n"
-                        "b 9f\n"
-                        "5:\n"
+                        "b 8f\n"
+                        "4:\n"
                         "sdot z16.s, z8.b, z0.b[0]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "sdot z20.s, z8.b, z1.b[0]\n"
@@ -2839,24 +2860,24 @@
                         "ld1rqb z5.b, p6/z, [a_ptr1]\n"
                         "sdot z17.s, z9.b, z0.b[0]\n"
                         "ld1rqb z6.b, p6/z, [a_ptr2]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z25.s, z9.b, z2.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "sdot z26.s, z10.b, z2.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z11.b, z0.b[0]\n"
@@ -2876,12 +2897,12 @@
                         "sdot z21.s, z13.b, z1.b[1]\n"
                         "sdot z25.s, z13.b, z2.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
                         "sdot z22.s, z14.b, z1.b[1]\n"
                         "sdot z26.s, z14.b, z2.b[1]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -2894,17 +2915,17 @@
                         "sdot z16.s, z8.b, z0.b[2]\n"
                         "sdot z20.s, z8.b, z1.b[2]\n"
                         "sdot z24.s, z8.b, z2.b[2]\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
                         "zip1 z14.b, z14.b, z12.b\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
                         "sdot z21.s, z9.b, z1.b[2]\n"
                         "sdot z25.s, z9.b, z2.b[2]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "sdot z22.s, z10.b, z1.b[2]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
                         "sdot z26.s, z10.b, z2.b[2]\n"
                         "sdot z19.s, z11.b, z0.b[2]\n"
                         "sdot z23.s, z11.b, z1.b[2]\n"
@@ -2921,15 +2942,15 @@
                         "sdot z19.s, z15.b, z0.b[3]\n"
                         "sdot z23.s, z15.b, z1.b[3]\n"
                         "sdot z27.s, z15.b, z2.b[3]\n"
-                        "cbz %[blocks], 22f\n"
+                        "cbz %[blocks], 21f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -2951,15 +2972,15 @@
                         "sdot z19.s, z11.b, z4.b[0]\n"
                         "sdot z23.s, z11.b, z5.b[0]\n"
                         "sdot z27.s, z11.b, z6.b[0]\n"
-                        "b.eq 23f\n"
+                        "b.eq 22f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -2981,13 +3002,13 @@
                         "sdot z19.s, z15.b, z4.b[1]\n"
                         "sdot z23.s, z15.b, z5.b[1]\n"
                         "sdot z27.s, z15.b, z6.b[1]\n"
-                        "b.eq 24f\n"
+                        "b.eq 23f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
@@ -3010,31 +3031,31 @@
                         "sdot z19.s, z11.b, z4.b[2]\n"
                         "sdot z23.s, z11.b, z5.b[2]\n"
                         "sdot z27.s, z11.b, z6.b[2]\n"
-                        "cbz %[odds], 9f\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 24f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 25f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 26f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 27f\n"
-                        "26:\n"
+                        "b 26f\n"
+                        "25:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 27f\n"
-                        "25:\n"
+                        "b 26f\n"
+                        "24:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "27:\n"
+                        "26:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -3056,33 +3077,33 @@
                         "sdot z19.s, z15.b, z4.b[3]\n"
                         "sdot z23.s, z15.b, z5.b[3]\n"
                         "sdot z27.s, z15.b, z6.b[3]\n"
-                        "b 9f\n"
-                        "24:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "23:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 27f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 28f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 29f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 30f\n"
-                        "29:\n"
+                        "b 29f\n"
+                        "28:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 30f\n"
-                        "28:\n"
+                        "b 29f\n"
+                        "27:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "30:\n"
+                        "29:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -3104,33 +3125,33 @@
                         "sdot z19.s, z11.b, z4.b[2]\n"
                         "sdot z23.s, z11.b, z5.b[2]\n"
                         "sdot z27.s, z11.b, z6.b[2]\n"
-                        "b 9f\n"
-                        "23:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "22:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 30f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 31f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 32f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 33f\n"
-                        "32:\n"
+                        "b 32f\n"
+                        "31:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 33f\n"
-                        "31:\n"
+                        "b 32f\n"
+                        "30:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "33:\n"
+                        "32:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -3152,33 +3173,33 @@
                         "sdot z19.s, z15.b, z4.b[1]\n"
                         "sdot z23.s, z15.b, z5.b[1]\n"
                         "sdot z27.s, z15.b, z6.b[1]\n"
-                        "b 9f\n"
-                        "22:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "21:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 33f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 34f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 35f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 36f\n"
-                        "35:\n"
+                        "b 35f\n"
+                        "34:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 36f\n"
-                        "34:\n"
+                        "b 35f\n"
+                        "33:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "36:\n"
+                        "35:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -3200,7 +3221,7 @@
                         "sdot z19.s, z11.b, z4.b[0]\n"
                         "sdot z23.s, z11.b, z5.b[0]\n"
                         "sdot z27.s, z11.b, z6.b[0]\n"
-                        "9:\n"
+                        "8:\n"
                         "st1w z16.s, p0, [%[c_ptr0]]\n"
                         "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
                         "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
@@ -3219,7 +3240,7 @@
                         ".unreq c_ptr1\n"
                         ".unreq c_ptr2\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
                     );
                     break;
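[annotation] The final +/- pair in the hunk above only reorders [leftovers] within the input-operand list; extended-asm operands are referenced by name in the template, so this is cosmetic and generates identical code. For readers unfamiliar with the wrapper these strings live in, a stripped-down sketch of its shape (not the arm_compute source; SVE-only, illustrative):

#include <cstdint>

// Minimal extended-asm shape: "+r" operands are read/write pointers and
// counters, "r" operands are read-only strides, and the clobber list covers
// the vector/predicate registers the body touches.
void asm_shape(const int8_t *b_ptr0, uint64_t ldb, uint64_t loops)
{
#ifdef __ARM_FEATURE_SVE
    __asm__ __volatile__(
        "ptrue p7.b\n"
        "1:\n"
        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
        "add %[b_ptr0], %[b_ptr0], %[ldb]\n" // bump slotted behind the load
        "subs %[loops], %[loops], #0x1\n"
        "b.ne 1b\n"
        : [b_ptr0] "+r" (b_ptr0), [loops] "+r" (loops)
        : [ldb] "r" (ldb)
        : "z8", "p7", "cc", "memory");
#else
    (void)b_ptr0; (void)ldb; (void)loops;
#endif
}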
@@ -3234,15 +3255,15 @@
                         "c_ptr3 .req X5\n"
                         "add a_ptr1, %[a_ptr0], %[lda]\n"
                         "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "add a_ptr3, a_ptr2, %[lda]\n"
+                        "add c_ptr3, c_ptr2, %[ldc]\n"
                         "whilelt p6.b, %[temp], %[leftovers]\n"
                         "whilelt p0.s, %[temp], %[width]\n"
                         "whilelt p4.b, %[temp], %[width]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
                         "incw %[temp], all, mul #1\n"
                         "ptrue p7.b\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
                         "whilelt p1.s, %[temp], %[width]\n"
                         "incw %[temp], all, mul #1\n"
                         "whilelt p2.s, %[temp], %[width]\n"
@@ -3265,77 +3286,80 @@
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "mov z23.s, #0\n"
                         "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "mov z24.s, #0\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
                         "zip2 z11.b, z8.b, z9.b\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "mov z24.s, #0\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
                         "mov z25.s, #0\n"
                         "add a_ptr3, a_ptr3, #0x10\n"
-                        "mov z26.s, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip1 z10.b, z10.b, z8.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "mov z27.s, #0\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "mov z28.s, #0\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "mov z26.s, #0\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "mov z27.s, #0\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "mov z28.s, #0\n"
                         "mov z29.s, #0\n"
                         "mov z30.s, #0\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
                         "mov z31.s, #0\n"
-                        "b 2f\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
                         "1:\n"
                         "ld1rw z15.s, p7/z, [%[betaptr]]\n"
                         "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
                         "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
                         "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
                         "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1w z20.s, p0/z, [c_ptr1]\n"
                         "mul z16.s, p7/m, z16.s, z15.s\n"
-                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "ld1w z20.s, p0/z, [c_ptr1]\n"
                         "mul z17.s, p7/m, z17.s, z15.s\n"
-                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
                         "mul z18.s, p7/m, z18.s, z15.s\n"
-                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
                         "mul z19.s, p7/m, z19.s, z15.s\n"
-                        "ld1w z24.s, p0/z, [c_ptr2]\n"
+                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
                         "mul z20.s, p7/m, z20.s, z15.s\n"
-                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "ld1w z24.s, p0/z, [c_ptr2]\n"
                         "mul z21.s, p7/m, z21.s, z15.s\n"
-                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
                         "mul z22.s, p7/m, z22.s, z15.s\n"
-                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
                         "mul z23.s, p7/m, z23.s, z15.s\n"
-                        "ld1w z28.s, p0/z, [c_ptr3]\n"
+                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
                         "mul z24.s, p7/m, z24.s, z15.s\n"
-                        "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+                        "ld1w z28.s, p0/z, [c_ptr3]\n"
                         "mul z25.s, p7/m, z25.s, z15.s\n"
-                        "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
+                        "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
                         "mul z26.s, p7/m, z26.s, z15.s\n"
-                        "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
+                        "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
                         "mul z27.s, p7/m, z27.s, z15.s\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
                         "mul z28.s, p7/m, z28.s, z15.s\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                         "mul z29.s, p7/m, z29.s, z15.s\n"
-                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                         "mul z30.s, p7/m, z30.s, z15.s\n"
-                        "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
                         "mul z31.s, p7/m, z31.s, z15.s\n"
+                        "ld1rqb z3.b, p7/z, [a_ptr3]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "add a_ptr1, a_ptr1, #0x10\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "add a_ptr2, a_ptr2, #0x10\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "add a_ptr3, a_ptr3, #0x10\n"
@@ -3344,21 +3368,20 @@
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "2:\n"
-                        "cbz %[loops], 3f\n"
-                        "4:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
                         "sdot z16.s, z8.b, z0.b[0]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "sdot z20.s, z8.b, z1.b[0]\n"
@@ -3367,38 +3390,38 @@
                         "ld1rqb z5.b, p7/z, [a_ptr1]\n"
                         "sdot z28.s, z8.b, z3.b[0]\n"
                         "ld1rqb z6.b, p7/z, [a_ptr2]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z7.b, p7/z, [a_ptr3]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1rqb z7.b, p7/z, [a_ptr3]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z25.s, z9.b, z2.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z14.b, z15.b, z8.b\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
                         "sdot z29.s, z9.b, z3.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
                         "sdot z26.s, z10.b, z2.b[0]\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
                         "sdot z30.s, z10.b, z3.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z11.b, z0.b[0]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "sdot z23.s, z11.b, z1.b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
+                        "add a_ptr3, a_ptr3, #0x20\n"
                         "sdot z27.s, z11.b, z2.b[0]\n"
                         "sdot z31.s, z11.b, z3.b[0]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
@@ -3414,17 +3437,17 @@
                         "sdot z17.s, z13.b, z0.b[1]\n"
                         "sdot z21.s, z13.b, z1.b[1]\n"
                         "sdot z25.s, z13.b, z2.b[1]\n"
-                        "sdot z29.s, z13.b, z3.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z29.s, z13.b, z3.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z14.b, z0.b[1]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z22.s, z14.b, z1.b[1]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "sdot z26.s, z14.b, z2.b[1]\n"
                         "sdot z30.s, z14.b, z3.b[1]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3446,17 +3469,17 @@
                         "sdot z17.s, z9.b, z0.b[2]\n"
                         "sdot z21.s, z9.b, z1.b[2]\n"
                         "sdot z25.s, z9.b, z2.b[2]\n"
-                        "sdot z29.s, z9.b, z3.b[2]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z29.s, z9.b, z3.b[2]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z10.b, z0.b[2]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z22.s, z10.b, z1.b[2]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "sdot z26.s, z10.b, z2.b[2]\n"
                         "sdot z30.s, z10.b, z3.b[2]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3478,17 +3501,17 @@
                         "sdot z17.s, z13.b, z0.b[3]\n"
                         "sdot z21.s, z13.b, z1.b[3]\n"
                         "sdot z25.s, z13.b, z2.b[3]\n"
-                        "sdot z29.s, z13.b, z3.b[3]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z29.s, z13.b, z3.b[3]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z14.b, z0.b[3]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z22.s, z14.b, z1.b[3]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "sdot z26.s, z14.b, z2.b[3]\n"
                         "sdot z30.s, z14.b, z3.b[3]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3514,17 +3537,17 @@
                         "sdot z17.s, z9.b, z4.b[0]\n"
                         "sdot z21.s, z9.b, z5.b[0]\n"
                         "sdot z25.s, z9.b, z6.b[0]\n"
-                        "sdot z29.s, z9.b, z7.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z29.s, z9.b, z7.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z10.b, z4.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z22.s, z10.b, z5.b[0]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "sdot z26.s, z10.b, z6.b[0]\n"
                         "sdot z30.s, z10.b, z7.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3546,17 +3569,17 @@
                         "sdot z17.s, z13.b, z4.b[1]\n"
                         "sdot z21.s, z13.b, z5.b[1]\n"
                         "sdot z25.s, z13.b, z6.b[1]\n"
-                        "sdot z29.s, z13.b, z7.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z29.s, z13.b, z7.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z14.b, z4.b[1]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z22.s, z14.b, z5.b[1]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "sdot z26.s, z14.b, z6.b[1]\n"
                         "sdot z30.s, z14.b, z7.b[1]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3578,17 +3601,17 @@
                         "sdot z17.s, z9.b, z4.b[2]\n"
                         "sdot z21.s, z9.b, z5.b[2]\n"
                         "sdot z25.s, z9.b, z6.b[2]\n"
-                        "sdot z29.s, z9.b, z7.b[2]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z29.s, z9.b, z7.b[2]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z10.b, z4.b[2]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z22.s, z10.b, z5.b[2]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "sdot z26.s, z10.b, z6.b[2]\n"
                         "sdot z30.s, z10.b, z7.b[2]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3610,13 +3633,13 @@
                         "sdot z17.s, z13.b, z4.b[3]\n"
                         "sdot z21.s, z13.b, z5.b[3]\n"
                         "sdot z25.s, z13.b, z6.b[3]\n"
-                        "sdot z29.s, z13.b, z7.b[3]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z29.s, z13.b, z7.b[3]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z14.b, z4.b[3]\n"
                         "sdot z22.s, z14.b, z5.b[3]\n"
                         "sdot z26.s, z14.b, z6.b[3]\n"
@@ -3626,11 +3649,11 @@
                         "sdot z23.s, z15.b, z5.b[3]\n"
                         "sdot z27.s, z15.b, z6.b[3]\n"
                         "sdot z31.s, z15.b, z7.b[3]\n"
-                        "b.ne 4b\n"
-                        "3:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
-                        "cbz %[regs], 5f\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
                         "sdot z16.s, z8.b, z0.b[0]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "sdot z20.s, z8.b, z1.b[0]\n"
@@ -3639,27 +3662,27 @@
                         "ld1rqb z5.b, p7/z, [a_ptr1]\n"
                         "sdot z28.s, z8.b, z3.b[0]\n"
                         "ld1rqb z6.b, p7/z, [a_ptr2]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z7.b, p7/z, [a_ptr3]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1rqb z7.b, p7/z, [a_ptr3]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z25.s, z9.b, z2.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
                         "sdot z29.s, z9.b, z3.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
                         "sdot z26.s, z10.b, z2.b[0]\n"
                         "sdot z30.s, z10.b, z3.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3681,17 +3704,17 @@
                         "sdot z17.s, z13.b, z0.b[1]\n"
                         "sdot z21.s, z13.b, z1.b[1]\n"
                         "sdot z25.s, z13.b, z2.b[1]\n"
-                        "sdot z29.s, z13.b, z3.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z29.s, z13.b, z3.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z14.b, z0.b[1]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z22.s, z14.b, z1.b[1]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "sdot z26.s, z14.b, z2.b[1]\n"
                         "sdot z30.s, z14.b, z3.b[1]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3713,17 +3736,17 @@
                         "sdot z17.s, z9.b, z0.b[2]\n"
                         "sdot z21.s, z9.b, z1.b[2]\n"
                         "sdot z25.s, z9.b, z2.b[2]\n"
-                        "sdot z29.s, z9.b, z3.b[2]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z29.s, z9.b, z3.b[2]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z10.b, z0.b[2]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z22.s, z10.b, z1.b[2]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "sdot z26.s, z10.b, z2.b[2]\n"
                         "sdot z30.s, z10.b, z3.b[2]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3745,17 +3768,17 @@
                         "sdot z17.s, z13.b, z0.b[3]\n"
                         "sdot z21.s, z13.b, z1.b[3]\n"
                         "sdot z25.s, z13.b, z2.b[3]\n"
-                        "sdot z29.s, z13.b, z3.b[3]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z29.s, z13.b, z3.b[3]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z14.b, z0.b[3]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z22.s, z14.b, z1.b[3]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "sdot z26.s, z14.b, z2.b[3]\n"
                         "sdot z30.s, z14.b, z3.b[3]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3781,17 +3804,17 @@
                         "sdot z17.s, z9.b, z4.b[0]\n"
                         "sdot z21.s, z9.b, z5.b[0]\n"
                         "sdot z25.s, z9.b, z6.b[0]\n"
-                        "sdot z29.s, z9.b, z7.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z29.s, z9.b, z7.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z10.b, z4.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z22.s, z10.b, z5.b[0]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "sdot z26.s, z10.b, z6.b[0]\n"
                         "sdot z30.s, z10.b, z7.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3813,13 +3836,13 @@
                         "sdot z17.s, z13.b, z4.b[1]\n"
                         "sdot z21.s, z13.b, z5.b[1]\n"
                         "sdot z25.s, z13.b, z6.b[1]\n"
-                        "sdot z29.s, z13.b, z7.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z29.s, z13.b, z7.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z14.b, z4.b[1]\n"
                         "sdot z22.s, z14.b, z5.b[1]\n"
                         "sdot z26.s, z14.b, z6.b[1]\n"
@@ -3841,11 +3864,11 @@
                         "sdot z17.s, z9.b, z4.b[2]\n"
                         "sdot z21.s, z9.b, z5.b[2]\n"
                         "sdot z25.s, z9.b, z6.b[2]\n"
-                        "sdot z29.s, z9.b, z7.b[2]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z29.s, z9.b, z7.b[2]\n"
                         "sdot z18.s, z10.b, z4.b[2]\n"
                         "sdot z22.s, z10.b, z5.b[2]\n"
                         "sdot z26.s, z10.b, z6.b[2]\n"
@@ -3870,15 +3893,15 @@
                         "sdot z23.s, z15.b, z5.b[3]\n"
                         "sdot z27.s, z15.b, z6.b[3]\n"
                         "sdot z31.s, z15.b, z7.b[3]\n"
-                        "cbz %[blocks], 6f\n"
+                        "cbz %[blocks], 5f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -3904,15 +3927,15 @@
                         "sdot z23.s, z11.b, z1.b[0]\n"
                         "sdot z27.s, z11.b, z2.b[0]\n"
                         "sdot z31.s, z11.b, z3.b[0]\n"
-                        "b.eq 7f\n"
+                        "b.eq 6f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -3938,13 +3961,13 @@
                         "sdot z23.s, z15.b, z1.b[1]\n"
                         "sdot z27.s, z15.b, z2.b[1]\n"
                         "sdot z31.s, z15.b, z3.b[1]\n"
-                        "b.eq 8f\n"
+                        "b.eq 7f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
@@ -3971,31 +3994,31 @@
                         "sdot z23.s, z11.b, z1.b[2]\n"
                         "sdot z27.s, z11.b, z2.b[2]\n"
                         "sdot z31.s, z11.b, z3.b[2]\n"
-                        "cbz %[odds], 9f\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 9f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 10f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 11f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 12f\n"
-                        "11:\n"
+                        "b 11f\n"
+                        "10:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 12f\n"
-                        "10:\n"
+                        "b 11f\n"
+                        "9:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "12:\n"
+                        "11:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -4021,33 +4044,33 @@
                         "sdot z23.s, z15.b, z1.b[3]\n"
                         "sdot z27.s, z15.b, z2.b[3]\n"
                         "sdot z31.s, z15.b, z3.b[3]\n"
-                        "b 9f\n"
-                        "8:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "7:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 12f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 13f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 14f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 15f\n"
-                        "14:\n"
+                        "b 14f\n"
+                        "13:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 15f\n"
-                        "13:\n"
+                        "b 14f\n"
+                        "12:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "15:\n"
+                        "14:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -4073,33 +4096,33 @@
                         "sdot z23.s, z11.b, z1.b[2]\n"
                         "sdot z27.s, z11.b, z2.b[2]\n"
                         "sdot z31.s, z11.b, z3.b[2]\n"
-                        "b 9f\n"
-                        "7:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "6:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 15f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 16f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 17f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 18f\n"
-                        "17:\n"
+                        "b 17f\n"
+                        "16:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 18f\n"
-                        "16:\n"
+                        "b 17f\n"
+                        "15:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "18:\n"
+                        "17:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -4125,33 +4148,33 @@
                         "sdot z23.s, z15.b, z1.b[1]\n"
                         "sdot z27.s, z15.b, z2.b[1]\n"
                         "sdot z31.s, z15.b, z3.b[1]\n"
-                        "b 9f\n"
-                        "6:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "5:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 18f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 19f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 20f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 21f\n"
-                        "20:\n"
+                        "b 20f\n"
+                        "19:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 21f\n"
-                        "19:\n"
+                        "b 20f\n"
+                        "18:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "21:\n"
+                        "20:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -4177,8 +4200,8 @@
                         "sdot z23.s, z11.b, z1.b[0]\n"
                         "sdot z27.s, z11.b, z2.b[0]\n"
                         "sdot z31.s, z11.b, z3.b[0]\n"
-                        "b 9f\n"
-                        "5:\n"
+                        "b 8f\n"
+                        "4:\n"
                         "sdot z16.s, z8.b, z0.b[0]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "sdot z20.s, z8.b, z1.b[0]\n"
@@ -4187,27 +4210,27 @@
                         "ld1rqb z5.b, p6/z, [a_ptr1]\n"
                         "sdot z28.s, z8.b, z3.b[0]\n"
                         "ld1rqb z6.b, p6/z, [a_ptr2]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z7.b, p6/z, [a_ptr3]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1rqb z7.b, p6/z, [a_ptr3]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z25.s, z9.b, z2.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
                         "sdot z29.s, z9.b, z3.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
                         "sdot z26.s, z10.b, z2.b[0]\n"
                         "sdot z30.s, z10.b, z3.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -4229,13 +4252,13 @@
                         "sdot z17.s, z13.b, z0.b[1]\n"
                         "sdot z21.s, z13.b, z1.b[1]\n"
                         "sdot z25.s, z13.b, z2.b[1]\n"
-                        "sdot z29.s, z13.b, z3.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z29.s, z13.b, z3.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z14.b, z0.b[1]\n"
                         "sdot z22.s, z14.b, z1.b[1]\n"
                         "sdot z26.s, z14.b, z2.b[1]\n"
@@ -4257,11 +4280,11 @@
                         "sdot z17.s, z9.b, z0.b[2]\n"
                         "sdot z21.s, z9.b, z1.b[2]\n"
                         "sdot z25.s, z9.b, z2.b[2]\n"
-                        "sdot z29.s, z9.b, z3.b[2]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z29.s, z9.b, z3.b[2]\n"
                         "sdot z18.s, z10.b, z0.b[2]\n"
                         "sdot z22.s, z10.b, z1.b[2]\n"
                         "sdot z26.s, z10.b, z2.b[2]\n"
@@ -4286,15 +4309,15 @@
                         "sdot z23.s, z15.b, z1.b[3]\n"
                         "sdot z27.s, z15.b, z2.b[3]\n"
                         "sdot z31.s, z15.b, z3.b[3]\n"
-                        "cbz %[blocks], 22f\n"
+                        "cbz %[blocks], 21f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -4320,15 +4343,15 @@
                         "sdot z23.s, z11.b, z5.b[0]\n"
                         "sdot z27.s, z11.b, z6.b[0]\n"
                         "sdot z31.s, z11.b, z7.b[0]\n"
-                        "b.eq 23f\n"
+                        "b.eq 22f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -4354,13 +4377,13 @@
                         "sdot z23.s, z15.b, z5.b[1]\n"
                         "sdot z27.s, z15.b, z6.b[1]\n"
                         "sdot z31.s, z15.b, z7.b[1]\n"
-                        "b.eq 24f\n"
+                        "b.eq 23f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
@@ -4387,31 +4410,31 @@
                         "sdot z23.s, z11.b, z5.b[2]\n"
                         "sdot z27.s, z11.b, z6.b[2]\n"
                         "sdot z31.s, z11.b, z7.b[2]\n"
-                        "cbz %[odds], 9f\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 24f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 25f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 26f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 27f\n"
-                        "26:\n"
+                        "b 26f\n"
+                        "25:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 27f\n"
-                        "25:\n"
+                        "b 26f\n"
+                        "24:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "27:\n"
+                        "26:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -4437,33 +4460,33 @@
                         "sdot z23.s, z15.b, z5.b[3]\n"
                         "sdot z27.s, z15.b, z6.b[3]\n"
                         "sdot z31.s, z15.b, z7.b[3]\n"
-                        "b 9f\n"
-                        "24:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "23:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 27f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 28f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 29f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 30f\n"
-                        "29:\n"
+                        "b 29f\n"
+                        "28:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 30f\n"
-                        "28:\n"
+                        "b 29f\n"
+                        "27:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "30:\n"
+                        "29:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -4489,33 +4512,33 @@
                         "sdot z23.s, z11.b, z5.b[2]\n"
                         "sdot z27.s, z11.b, z6.b[2]\n"
                         "sdot z31.s, z11.b, z7.b[2]\n"
-                        "b 9f\n"
-                        "23:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "22:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 30f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 31f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 32f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 33f\n"
-                        "32:\n"
+                        "b 32f\n"
+                        "31:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 33f\n"
-                        "31:\n"
+                        "b 32f\n"
+                        "30:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "33:\n"
+                        "32:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -4541,33 +4564,33 @@
                         "sdot z23.s, z15.b, z5.b[1]\n"
                         "sdot z27.s, z15.b, z6.b[1]\n"
                         "sdot z31.s, z15.b, z7.b[1]\n"
-                        "b 9f\n"
-                        "22:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "21:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 33f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 34f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 35f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 36f\n"
-                        "35:\n"
+                        "b 35f\n"
+                        "34:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 36f\n"
-                        "34:\n"
+                        "b 35f\n"
+                        "33:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "36:\n"
+                        "35:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -4593,7 +4616,7 @@
                         "sdot z23.s, z11.b, z5.b[0]\n"
                         "sdot z27.s, z11.b, z6.b[0]\n"
                         "sdot z31.s, z11.b, z7.b[0]\n"
-                        "9:\n"
+                        "8:\n"
                         "st1w z16.s, p0, [%[c_ptr0]]\n"
                         "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
                         "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
@@ -4618,7 +4641,7 @@
                         ".unreq c_ptr2\n"
                         ".unreq c_ptr3\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
                     );
                     break;
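
The operand-list change just above (swapping the positions of [leftovers] and the lda/ldc entries in the input list) is semantically neutral: in GNU extended asm, operands are matched by the [name] given in the constraint list, not by position. A minimal sketch of named operands, assuming AArch64 for the madd instruction:

    #include <cstdint>

    // Sketch only: operands are referenced as %[name] in the template,
    // so the order of entries in the input list below does not matter.
    int64_t scaled_add(int64_t x, int64_t scale)
    {
        int64_t out;
        asm("madd %[out], %[x], %[scale], %[x]\n"  // out = x*scale + x
            : [out] "=r"(out)
            : [x] "r"(x), [scale] "r"(scale));
        return out;
    }
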
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp
index 7d89948..cdcea59 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -86,63 +86,73 @@
                         "mov z19.s, #0\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "b 2f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
                         "1:\n"
                         "ld1rw z15.s, p7/z, [%[betaptr]]\n"
                         "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
                         "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
                         "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
                         "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                         "mul z16.s, p7/m, z16.s, z15.s\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                         "mul z17.s, p7/m, z17.s, z15.s\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "mul z18.s, p7/m, z18.s, z15.s\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "mul z19.s, p7/m, z19.s, z15.s\n"
-                        "2:\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "cbz %[loops], 3f\n"
-                        "4:\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
                         "zip1 z10.b, z10.b, z8.b\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
                         "udot z16.s, z8.b, z0.b[0]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
                         "zip2 z13.b, z13.b, z14.b\n"
-                        "subs %[loops], %[loops], #0x1\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
@@ -150,137 +160,137 @@
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z17.s, z13.b, z0.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z15.b, z0.b[1]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z10.b, z10.b, z8.b\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z16.s, z8.b, z0.b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z17.s, z9.b, z0.b[2]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z11.b, z0.b[2]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z16.s, z12.b, z0.b[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z17.s, z13.b, z0.b[3]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z15.b, z0.b[3]\n"
                         "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z10.b, z10.b, z8.b\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z16.s, z8.b, z4.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z17.s, z9.b, z4.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "udot z18.s, z10.b, z4.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "udot z19.s, z11.b, z4.b[0]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z16.s, z12.b, z4.b[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z17.s, z13.b, z4.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z4.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z15.b, z4.b[1]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z10.b, z10.b, z8.b\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z16.s, z8.b, z4.b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z17.s, z9.b, z4.b[2]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z4.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z11.b, z4.b[2]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z16.s, z12.b, z4.b[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z17.s, z13.b, z4.b[3]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z4.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z15.b, z4.b[3]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "b.ne 4b\n"
-                        "3:\n"
                         "zip2 z12.b, z10.b, z8.b\n"
+                        "b.ne 3b\n"
+                        "2:\n"
                         "zip1 z10.b, z10.b, z8.b\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
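
Structurally, this hunk rotates the main loop: the first stage of iteration 0 (the initial loads and zips) is peeled into the prologue, a "cbz %[loops], 2f" bypasses the rotated loop for a zero trip count, and the back-edge "b.ne 3b" now lands after useful work rather than at a bare loop head. A hedged C++ analogue of that rotation, with illustrative names (the function assumes p points at loops + 1 elements):

    #include <cstddef>

    // Sketch of loop rotation / peeling: sums loops + 1 elements so the
    // loop body always overlaps "load next" with "consume current".
    long sum_rotated(const long *p, size_t loops)
    {
        long acc = 0;
        long cur = *p++;            // prologue: stage 0 of iteration 0
        if (loops != 0)
        {
            do
            {
                long nxt = *p++;    // stage 0 of the next iteration
                acc += cur;         // stage 1 of the current iteration
                cur = nxt;
            } while (--loops != 0);
        }
        acc += cur;                 // epilogue: finish the last iteration
        return acc;
    }
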
@@ -289,118 +299,118 @@
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
-                        "cbz %[regs], 5f\n"
+                        "cbz %[regs], 4f\n"
                         "udot z16.s, z8.b, z0.b[0]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "udot z17.s, z9.b, z0.b[0]\n"
                         "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
                         "udot z18.s, z10.b, z0.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z14.b, z15.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z16.s, z12.b, z0.b[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z17.s, z13.b, z0.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z15.b, z0.b[1]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z10.b, z10.b, z8.b\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z16.s, z8.b, z0.b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z17.s, z9.b, z0.b[2]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z11.b, z0.b[2]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z16.s, z12.b, z0.b[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z17.s, z13.b, z0.b[3]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z15.b, z0.b[3]\n"
                         "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z10.b, z10.b, z8.b\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z16.s, z8.b, z4.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z17.s, z9.b, z4.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z4.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z11.b, z4.b[0]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z16.s, z12.b, z4.b[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z17.s, z13.b, z4.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z4.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z15.b, z4.b[1]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
@@ -427,15 +437,15 @@
                         "udot z17.s, z13.b, z4.b[3]\n"
                         "udot z18.s, z14.b, z4.b[3]\n"
                         "udot z19.s, z15.b, z4.b[3]\n"
-                        "cbz %[blocks], 6f\n"
+                        "cbz %[blocks], 5f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -449,15 +459,15 @@
                         "udot z17.s, z9.b, z0.b[0]\n"
                         "udot z18.s, z10.b, z0.b[0]\n"
                         "udot z19.s, z11.b, z0.b[0]\n"
-                        "b.eq 7f\n"
+                        "b.eq 6f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -471,13 +481,13 @@
                         "udot z17.s, z13.b, z0.b[1]\n"
                         "udot z18.s, z14.b, z0.b[1]\n"
                         "udot z19.s, z15.b, z0.b[1]\n"
-                        "b.eq 8f\n"
+                        "b.eq 7f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
@@ -492,31 +502,31 @@
                         "udot z17.s, z9.b, z0.b[2]\n"
                         "udot z18.s, z10.b, z0.b[2]\n"
                         "udot z19.s, z11.b, z0.b[2]\n"
-                        "cbz %[odds], 9f\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 9f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 10f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 11f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 12f\n"
-                        "11:\n"
+                        "b 11f\n"
+                        "10:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 12f\n"
-                        "10:\n"
+                        "b 11f\n"
+                        "9:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "12:\n"
+                        "11:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -530,33 +540,33 @@
                         "udot z17.s, z13.b, z0.b[3]\n"
                         "udot z18.s, z14.b, z0.b[3]\n"
                         "udot z19.s, z15.b, z0.b[3]\n"
-                        "b 9f\n"
-                        "8:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "7:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 12f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 13f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 14f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 15f\n"
-                        "14:\n"
+                        "b 14f\n"
+                        "13:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 15f\n"
-                        "13:\n"
+                        "b 14f\n"
+                        "12:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "15:\n"
+                        "14:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -570,33 +580,33 @@
                         "udot z17.s, z9.b, z0.b[2]\n"
                         "udot z18.s, z10.b, z0.b[2]\n"
                         "udot z19.s, z11.b, z0.b[2]\n"
-                        "b 9f\n"
-                        "7:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "6:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 15f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 16f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 17f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 18f\n"
-                        "17:\n"
+                        "b 17f\n"
+                        "16:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 18f\n"
-                        "16:\n"
+                        "b 17f\n"
+                        "15:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "18:\n"
+                        "17:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -610,33 +620,33 @@
                         "udot z17.s, z13.b, z0.b[1]\n"
                         "udot z18.s, z14.b, z0.b[1]\n"
                         "udot z19.s, z15.b, z0.b[1]\n"
-                        "b 9f\n"
-                        "6:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "5:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 18f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 19f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 20f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 21f\n"
-                        "20:\n"
+                        "b 20f\n"
+                        "19:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 21f\n"
-                        "19:\n"
+                        "b 20f\n"
+                        "18:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "21:\n"
+                        "20:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -650,38 +660,38 @@
                         "udot z17.s, z9.b, z0.b[0]\n"
                         "udot z18.s, z10.b, z0.b[0]\n"
                         "udot z19.s, z11.b, z0.b[0]\n"
-                        "b 9f\n"
-                        "5:\n"
+                        "b 8f\n"
+                        "4:\n"
                         "udot z16.s, z8.b, z0.b[0]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "udot z17.s, z9.b, z0.b[0]\n"
                         "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
                         "udot z18.s, z10.b, z0.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z14.b, z15.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z16.s, z12.b, z0.b[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z17.s, z13.b, z0.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z15.b, z0.b[1]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
@@ -708,15 +718,15 @@
                         "udot z17.s, z13.b, z0.b[3]\n"
                         "udot z18.s, z14.b, z0.b[3]\n"
                         "udot z19.s, z15.b, z0.b[3]\n"
-                        "cbz %[blocks], 22f\n"
+                        "cbz %[blocks], 21f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -730,15 +740,15 @@
                         "udot z17.s, z9.b, z4.b[0]\n"
                         "udot z18.s, z10.b, z4.b[0]\n"
                         "udot z19.s, z11.b, z4.b[0]\n"
-                        "b.eq 23f\n"
+                        "b.eq 22f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -752,13 +762,13 @@
                         "udot z17.s, z13.b, z4.b[1]\n"
                         "udot z18.s, z14.b, z4.b[1]\n"
                         "udot z19.s, z15.b, z4.b[1]\n"
-                        "b.eq 24f\n"
+                        "b.eq 23f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
@@ -773,31 +783,31 @@
                         "udot z17.s, z9.b, z4.b[2]\n"
                         "udot z18.s, z10.b, z4.b[2]\n"
                         "udot z19.s, z11.b, z4.b[2]\n"
-                        "cbz %[odds], 9f\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 24f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 25f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 26f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 27f\n"
-                        "26:\n"
+                        "b 26f\n"
+                        "25:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 27f\n"
-                        "25:\n"
+                        "b 26f\n"
+                        "24:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "27:\n"
+                        "26:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -811,33 +821,33 @@
                         "udot z17.s, z13.b, z4.b[3]\n"
                         "udot z18.s, z14.b, z4.b[3]\n"
                         "udot z19.s, z15.b, z4.b[3]\n"
-                        "b 9f\n"
-                        "24:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "23:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 27f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 28f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 29f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 30f\n"
-                        "29:\n"
+                        "b 29f\n"
+                        "28:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 30f\n"
-                        "28:\n"
+                        "b 29f\n"
+                        "27:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "30:\n"
+                        "29:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -851,33 +861,33 @@
                         "udot z17.s, z9.b, z4.b[2]\n"
                         "udot z18.s, z10.b, z4.b[2]\n"
                         "udot z19.s, z11.b, z4.b[2]\n"
-                        "b 9f\n"
-                        "23:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "22:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 30f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 31f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 32f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 33f\n"
-                        "32:\n"
+                        "b 32f\n"
+                        "31:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 33f\n"
-                        "31:\n"
+                        "b 32f\n"
+                        "30:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "33:\n"
+                        "32:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -891,33 +901,33 @@
                         "udot z17.s, z13.b, z4.b[1]\n"
                         "udot z18.s, z14.b, z4.b[1]\n"
                         "udot z19.s, z15.b, z4.b[1]\n"
-                        "b 9f\n"
-                        "22:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "21:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 33f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 34f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 35f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 36f\n"
-                        "35:\n"
+                        "b 35f\n"
+                        "34:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 36f\n"
-                        "34:\n"
+                        "b 35f\n"
+                        "33:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "36:\n"
+                        "35:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -931,14 +941,14 @@
                         "udot z17.s, z9.b, z4.b[0]\n"
                         "udot z18.s, z10.b, z4.b[0]\n"
                         "udot z19.s, z11.b, z4.b[0]\n"
-                        "9:\n"
+                        "8:\n"
                         "st1w z16.s, p0, [%[c_ptr0]]\n"
                         "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
                         "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
                         "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
                         "addvl %[c_ptr0], %[c_ptr0], #4\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
                     );
                     break;
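
Beyond the label renumbering, the non-label changes in these hunks are pure instruction scheduling: the `add %[b_ptrN], %[b_ptrN], %[ldb]` pointer bumps and the `subs %[blocks], %[blocks], #0x1` decrement are slotted between loads and `udot`s they do not feed (the `subs` now sits close to the branch that consumes its flags), so integer ALU work can issue alongside memory operations on in-order SVE cores. A minimal AArch64-only illustration of the slotting idea (a hypothetical function; `step` is in bytes):

#include <cstdint>

uint64_t interleaved_pair(const uint64_t *p0, const uint64_t *p1,
                          uint64_t step)
{
    uint64_t a, b;
    __asm__(
        "ldr %[a], [%[p0]]\n"        // load stream 0 ...
        "add %[p1], %[p1], %[st]\n"  // ... while advancing stream 1
        "ldr %[b], [%[p1]]\n"        // load the advanced stream 1
        : [a] "=&r"(a), [b] "=&r"(b), [p1] "+r"(p1)
        : [p0] "r"(p0), [st] "r"(step)
        : "memory");
    return a + b;
}
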
@@ -971,103 +981,108 @@
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "mov z21.s, #0\n"
                         "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "mov z22.s, #0\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
                         "zip2 z11.b, z8.b, z9.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "mov z22.s, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z23.s, #0\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 2f\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
                         "1:\n"
                         "ld1rw z15.s, p7/z, [%[betaptr]]\n"
                         "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
                         "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
                         "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
                         "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1w z20.s, p0/z, [c_ptr1]\n"
                         "mul z16.s, p7/m, z16.s, z15.s\n"
-                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "ld1w z20.s, p0/z, [c_ptr1]\n"
                         "mul z17.s, p7/m, z17.s, z15.s\n"
-                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
                         "mul z18.s, p7/m, z18.s, z15.s\n"
-                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
                         "mul z19.s, p7/m, z19.s, z15.s\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
                         "mul z20.s, p7/m, z20.s, z15.s\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                         "mul z21.s, p7/m, z21.s, z15.s\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                         "mul z22.s, p7/m, z22.s, z15.s\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "mul z23.s, p7/m, z23.s, z15.s\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "add %[a_ptr0], %[a_ptr0], #0x10\n"
                         "add a_ptr1, a_ptr1, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip1 z10.b, z10.b, z8.b\n"
-                        "2:\n"
-                        "cbz %[loops], 3f\n"
-                        "4:\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
                         "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "udot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "subs %[loops], %[loops], #0x1\n"
                         "udot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z10.b, z0.b[0]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
                         "zip1 z14.b, z15.b, z8.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z11.b, z0.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z23.s, z11.b, z1.b[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
                         "udot z16.s, z12.b, z0.b[1]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "udot z20.s, z12.b, z1.b[1]\n"
-                        "subs %[loops], %[loops], #0x1\n"
                         "zip2 z12.b, z10.b, z8.b\n"
                         "zip1 z10.b, z10.b, z8.b\n"
                         "udot z17.s, z13.b, z0.b[1]\n"
@@ -1092,148 +1107,148 @@
                         "udot z16.s, z8.b, z0.b[2]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "udot z20.s, z8.b, z1.b[2]\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
                         "zip1 z14.b, z14.b, z12.b\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
                         "udot z21.s, z9.b, z1.b[2]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z10.b, z0.b[2]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "udot z22.s, z10.b, z1.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z11.b, z0.b[2]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z23.s, z11.b, z1.b[2]\n"
-                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
                         "udot z17.s, z13.b, z0.b[3]\n"
                         "udot z21.s, z13.b, z1.b[3]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
                         "udot z18.s, z14.b, z0.b[3]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "udot z22.s, z14.b, z1.b[3]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z15.b, z0.b[3]\n"
                         "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
                         "udot z23.s, z15.b, z1.b[3]\n"
                         "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z16.s, z8.b, z4.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z20.s, z8.b, z5.b[0]\n"
-                        "udot z17.s, z9.b, z4.b[0]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z20.s, z8.b, z5.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
                         "udot z21.s, z9.b, z5.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z10.b, z4.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z22.s, z10.b, z5.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z19.s, z11.b, z4.b[0]\n"
-                        "udot z23.s, z11.b, z5.b[0]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z16.s, z12.b, z4.b[1]\n"
-                        "udot z20.s, z12.b, z5.b[1]\n"
-                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "udot z23.s, z11.b, z5.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z12.b, z5.b[1]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
                         "udot z21.s, z13.b, z5.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z14.b, z4.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z22.s, z14.b, z5.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z19.s, z15.b, z4.b[1]\n"
-                        "udot z23.s, z15.b, z5.b[1]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z22.s, z14.b, z5.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z16.s, z8.b, z4.b[2]\n"
-                        "udot z20.s, z8.b, z5.b[2]\n"
-                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "udot z23.s, z15.b, z5.b[1]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z8.b, z5.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
                         "udot z21.s, z9.b, z5.b[2]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z10.b, z4.b[2]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z22.s, z10.b, z5.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z19.s, z11.b, z4.b[2]\n"
-                        "udot z23.s, z11.b, z5.b[2]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z22.s, z10.b, z5.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z16.s, z12.b, z4.b[3]\n"
-                        "udot z20.s, z12.b, z5.b[3]\n"
-                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "udot z23.s, z11.b, z5.b[2]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z12.b, z5.b[3]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
                         "udot z21.s, z13.b, z5.b[3]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z14.b, z4.b[3]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z22.s, z14.b, z5.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "udot z19.s, z15.b, z4.b[3]\n"
-                        "udot z23.s, z15.b, z5.b[3]\n"
-                        "b.ne 4b\n"
-                        "3:\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z22.s, z14.b, z5.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "udot z23.s, z15.b, z5.b[3]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
-                        "cbz %[regs], 5f\n"
+                        "cbz %[regs], 4f\n"
                         "udot z16.s, z8.b, z0.b[0]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "udot z20.s, z8.b, z1.b[0]\n"
@@ -1245,13 +1260,13 @@
                         "zip2 z8.b, z14.b, z12.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z14.b, z15.b, z8.b\n"
@@ -1261,142 +1276,142 @@
                         "udot z19.s, z11.b, z0.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z23.s, z11.b, z1.b[0]\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
                         "udot z17.s, z13.b, z0.b[1]\n"
                         "udot z21.s, z13.b, z1.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
                         "udot z18.s, z14.b, z0.b[1]\n"
-                        "udot z22.s, z14.b, z1.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
                         "udot z21.s, z9.b, z1.b[2]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z10.b, z0.b[2]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "udot z22.s, z10.b, z1.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z23.s, z11.b, z1.b[2]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z16.s, z12.b, z0.b[3]\n"
-                        "udot z20.s, z12.b, z1.b[3]\n"
-                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
                         "udot z21.s, z13.b, z1.b[3]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z14.b, z0.b[3]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "udot z22.s, z14.b, z1.b[3]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "udot z19.s, z15.b, z0.b[3]\n"
                         "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
                         "udot z23.s, z15.b, z1.b[3]\n"
                         "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z16.s, z8.b, z4.b[0]\n"
-                        "udot z20.s, z8.b, z5.b[0]\n"
-                        "udot z17.s, z9.b, z4.b[0]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z20.s, z8.b, z5.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
                         "udot z21.s, z9.b, z5.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z10.b, z4.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z22.s, z10.b, z5.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z19.s, z11.b, z4.b[0]\n"
-                        "udot z23.s, z11.b, z5.b[0]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z16.s, z12.b, z4.b[1]\n"
-                        "udot z20.s, z12.b, z5.b[1]\n"
-                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "udot z23.s, z11.b, z5.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z12.b, z5.b[1]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
                         "udot z21.s, z13.b, z5.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z14.b, z4.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z22.s, z14.b, z5.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "udot z19.s, z15.b, z4.b[1]\n"
-                        "udot z23.s, z15.b, z5.b[1]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z16.s, z8.b, z4.b[2]\n"
-                        "udot z20.s, z8.b, z5.b[2]\n"
-                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "udot z22.s, z14.b, z5.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "udot z23.s, z15.b, z5.b[1]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "udot z20.s, z8.b, z5.b[2]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
                         "udot z21.s, z9.b, z5.b[2]\n"
                         "udot z18.s, z10.b, z4.b[2]\n"
                         "udot z22.s, z10.b, z5.b[2]\n"
-                        "udot z19.s, z11.b, z4.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "udot z23.s, z11.b, z5.b[2]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "udot z23.s, z11.b, z5.b[2]\n"
                         "udot z16.s, z12.b, z4.b[3]\n"
                         "udot z20.s, z12.b, z5.b[3]\n"
                         "udot z17.s, z13.b, z4.b[3]\n"
@@ -1405,15 +1420,15 @@
                         "udot z22.s, z14.b, z5.b[3]\n"
                         "udot z19.s, z15.b, z4.b[3]\n"
                         "udot z23.s, z15.b, z5.b[3]\n"
-                        "cbz %[blocks], 6f\n"
+                        "cbz %[blocks], 5f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -1431,15 +1446,15 @@
                         "udot z22.s, z10.b, z1.b[0]\n"
                         "udot z19.s, z11.b, z0.b[0]\n"
                         "udot z23.s, z11.b, z1.b[0]\n"
-                        "b.eq 7f\n"
+                        "b.eq 6f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -1457,13 +1472,13 @@
                         "udot z22.s, z14.b, z1.b[1]\n"
                         "udot z19.s, z15.b, z0.b[1]\n"
                         "udot z23.s, z15.b, z1.b[1]\n"
-                        "b.eq 8f\n"
+                        "b.eq 7f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
@@ -1482,31 +1497,31 @@
                         "udot z22.s, z10.b, z1.b[2]\n"
                         "udot z19.s, z11.b, z0.b[2]\n"
                         "udot z23.s, z11.b, z1.b[2]\n"
-                        "cbz %[odds], 9f\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 9f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 10f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 11f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 12f\n"
-                        "11:\n"
+                        "b 11f\n"
+                        "10:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 12f\n"
-                        "10:\n"
+                        "b 11f\n"
+                        "9:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "12:\n"
+                        "11:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -1524,33 +1539,33 @@
                         "udot z22.s, z14.b, z1.b[3]\n"
                         "udot z19.s, z15.b, z0.b[3]\n"
                         "udot z23.s, z15.b, z1.b[3]\n"
-                        "b 9f\n"
-                        "8:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "7:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 12f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 13f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 14f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 15f\n"
-                        "14:\n"
+                        "b 14f\n"
+                        "13:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 15f\n"
-                        "13:\n"
+                        "b 14f\n"
+                        "12:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "15:\n"
+                        "14:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -1568,33 +1583,33 @@
                         "udot z22.s, z10.b, z1.b[2]\n"
                         "udot z19.s, z11.b, z0.b[2]\n"
                         "udot z23.s, z11.b, z1.b[2]\n"
-                        "b 9f\n"
-                        "7:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "6:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 15f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 16f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 17f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 18f\n"
-                        "17:\n"
+                        "b 17f\n"
+                        "16:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 18f\n"
-                        "16:\n"
+                        "b 17f\n"
+                        "15:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "18:\n"
+                        "17:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -1612,33 +1627,33 @@
                         "udot z22.s, z14.b, z1.b[1]\n"
                         "udot z19.s, z15.b, z0.b[1]\n"
                         "udot z23.s, z15.b, z1.b[1]\n"
-                        "b 9f\n"
-                        "6:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "5:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 18f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 19f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 20f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 21f\n"
-                        "20:\n"
+                        "b 20f\n"
+                        "19:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 21f\n"
-                        "19:\n"
+                        "b 20f\n"
+                        "18:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "21:\n"
+                        "20:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -1656,8 +1671,8 @@
                         "udot z22.s, z10.b, z1.b[0]\n"
                         "udot z19.s, z11.b, z0.b[0]\n"
                         "udot z23.s, z11.b, z1.b[0]\n"
-                        "b 9f\n"
-                        "5:\n"
+                        "b 8f\n"
+                        "4:\n"
                         "udot z16.s, z8.b, z0.b[0]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "udot z20.s, z8.b, z1.b[0]\n"
@@ -1669,13 +1684,13 @@
                         "zip2 z8.b, z14.b, z12.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z14.b, z15.b, z8.b\n"
@@ -1685,44 +1700,44 @@
                         "udot z19.s, z11.b, z0.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z23.s, z11.b, z1.b[0]\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
                         "udot z17.s, z13.b, z0.b[1]\n"
                         "udot z21.s, z13.b, z1.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
                         "udot z18.s, z14.b, z0.b[1]\n"
-                        "udot z22.s, z14.b, z1.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "udot z23.s, z15.b, z1.b[1]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "udot z20.s, z8.b, z1.b[2]\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
                         "udot z21.s, z9.b, z1.b[2]\n"
                         "udot z18.s, z10.b, z0.b[2]\n"
                         "udot z22.s, z10.b, z1.b[2]\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "udot z23.s, z11.b, z1.b[2]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
                         "udot z16.s, z12.b, z0.b[3]\n"
                         "udot z20.s, z12.b, z1.b[3]\n"
                         "udot z17.s, z13.b, z0.b[3]\n"
@@ -1731,15 +1746,15 @@
                         "udot z22.s, z14.b, z1.b[3]\n"
                         "udot z19.s, z15.b, z0.b[3]\n"
                         "udot z23.s, z15.b, z1.b[3]\n"
-                        "cbz %[blocks], 22f\n"
+                        "cbz %[blocks], 21f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -1757,15 +1772,15 @@
                         "udot z22.s, z10.b, z5.b[0]\n"
                         "udot z19.s, z11.b, z4.b[0]\n"
                         "udot z23.s, z11.b, z5.b[0]\n"
-                        "b.eq 23f\n"
+                        "b.eq 22f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -1783,13 +1798,13 @@
                         "udot z22.s, z14.b, z5.b[1]\n"
                         "udot z19.s, z15.b, z4.b[1]\n"
                         "udot z23.s, z15.b, z5.b[1]\n"
-                        "b.eq 24f\n"
+                        "b.eq 23f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
@@ -1808,31 +1823,31 @@
                         "udot z22.s, z10.b, z5.b[2]\n"
                         "udot z19.s, z11.b, z4.b[2]\n"
                         "udot z23.s, z11.b, z5.b[2]\n"
-                        "cbz %[odds], 9f\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 24f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 25f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 26f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 27f\n"
-                        "26:\n"
+                        "b 26f\n"
+                        "25:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 27f\n"
-                        "25:\n"
+                        "b 26f\n"
+                        "24:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "27:\n"
+                        "26:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -1850,33 +1865,33 @@
                         "udot z22.s, z14.b, z5.b[3]\n"
                         "udot z19.s, z15.b, z4.b[3]\n"
                         "udot z23.s, z15.b, z5.b[3]\n"
-                        "b 9f\n"
-                        "24:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "23:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 27f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 28f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 29f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 30f\n"
-                        "29:\n"
+                        "b 29f\n"
+                        "28:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 30f\n"
-                        "28:\n"
+                        "b 29f\n"
+                        "27:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "30:\n"
+                        "29:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -1894,33 +1909,33 @@
                         "udot z22.s, z10.b, z5.b[2]\n"
                         "udot z19.s, z11.b, z4.b[2]\n"
                         "udot z23.s, z11.b, z5.b[2]\n"
-                        "b 9f\n"
-                        "23:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "22:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 30f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 31f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 32f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 33f\n"
-                        "32:\n"
+                        "b 32f\n"
+                        "31:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 33f\n"
-                        "31:\n"
+                        "b 32f\n"
+                        "30:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "33:\n"
+                        "32:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -1938,33 +1953,33 @@
                         "udot z22.s, z14.b, z5.b[1]\n"
                         "udot z19.s, z15.b, z4.b[1]\n"
                         "udot z23.s, z15.b, z5.b[1]\n"
-                        "b 9f\n"
-                        "22:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "21:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 33f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 34f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 35f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 36f\n"
-                        "35:\n"
+                        "b 35f\n"
+                        "34:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 36f\n"
-                        "34:\n"
+                        "b 35f\n"
+                        "33:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "36:\n"
+                        "35:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -1982,7 +1997,7 @@
                         "udot z22.s, z10.b, z5.b[0]\n"
                         "udot z19.s, z11.b, z4.b[0]\n"
                         "udot z23.s, z11.b, z5.b[0]\n"
-                        "9:\n"
+                        "8:\n"
                         "st1w z16.s, p0, [%[c_ptr0]]\n"
                         "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
                         "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
@@ -1995,7 +2010,7 @@
                         ".unreq a_ptr1\n"
                         ".unreq c_ptr1\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
                     );
                     break;
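                     /*
                      * For readers tracing the rescheduled zip1/zip2 and udot
                      * sequences in the hunks above: this kernel interleaves four
                      * K-consecutive rows of B on the fly so that every 32-bit lane
                      * holds one byte from each row, which is the operand layout
                      * UDOT expects for its 4-way u8 dot product. A minimal sketch
                      * in ACLE SVE intrinsics follows (illustrative only, not part
                      * of this patch; dot_block, a_quad and the pointer names are
                      * hypothetical):
                      *
                      *   #include <arm_sve.h>
                      *   #include <stdint.h>
                      *
                      *   // Interleave one vector's worth of four B rows and fold the
                      *   // first quarter into the accumulator against four broadcast
                      *   // A bytes (a_quad lane 0 holds a[k..k+3]).
                      *   static inline svuint32_t dot_block(svuint32_t acc,
                      *                                      const uint8_t *b0, const uint8_t *b1,
                      *                                      const uint8_t *b2, const uint8_t *b3,
                      *                                      svuint8_t a_quad, svbool_t pg)
                      *   {
                      *       svuint8_t r0 = svld1_u8(pg, b0);   // like "ld1b z8.b, p4/z, [b_ptr0]"
                      *       svuint8_t r1 = svld1_u8(pg, b1);
                      *       svuint8_t r2 = svld1_u8(pg, b2);
                      *       svuint8_t r3 = svld1_u8(pg, b3);
                      *       // zip rows 0/2 and 1/3, then zip the results: bytes land in
                      *       // the order {r0,r1,r2,r3} within every 32-bit lane, exactly
                      *       // the grouping UDOT consumes.
                      *       svuint8_t lo02 = svzip1_u8(r0, r2);
                      *       svuint8_t lo13 = svzip1_u8(r1, r3);
                      *       svuint8_t q0   = svzip1_u8(lo02, lo13);  // first quarter of the columns
                      *       // (remaining quarters come from svzip2_u8 combinations, handled
                      *       //  by the other accumulators in the asm)
                      *       // "udot z16.s, z8.b, z0.b[0]": per lane, acc += dot4(B bytes, A bytes)
                      *       return svdot_lane_u32(acc, q0, a_quad, 0);
                      *   }
                      *
                      * The hunks above do not change this dataflow; they hoist pointer
                      * increments and loads between the zips and dots so the loads issue
                      * earlier, and renumber the local branch labels accordingly.
                      */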
@@ -2007,11 +2022,11 @@
                         "c_ptr2 .req X3\n"
                         "add a_ptr1, %[a_ptr0], %[lda]\n"
                         "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
                         "whilelt p6.b, %[temp], %[leftovers]\n"
                         "whilelt p0.s, %[temp], %[width]\n"
                         "whilelt p4.b, %[temp], %[width]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
                         "incw %[temp], all, mul #1\n"
                         "ptrue p7.b\n"
                         "whilelt p1.s, %[temp], %[width]\n"
@@ -2034,116 +2049,122 @@
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "mov z22.s, #0\n"
                         "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "mov z23.s, #0\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
                         "zip2 z11.b, z8.b, z9.b\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "mov z23.s, #0\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
                         "mov z24.s, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z25.s, #0\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z10.b, z10.b, z8.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "mov z26.s, #0\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "mov z25.s, #0\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "mov z26.s, #0\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "mov z27.s, #0\n"
-                        "b 2f\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
                         "1:\n"
                         "ld1rw z15.s, p7/z, [%[betaptr]]\n"
                         "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
                         "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
                         "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
                         "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1w z20.s, p0/z, [c_ptr1]\n"
                         "mul z16.s, p7/m, z16.s, z15.s\n"
-                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "ld1w z20.s, p0/z, [c_ptr1]\n"
                         "mul z17.s, p7/m, z17.s, z15.s\n"
-                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
                         "mul z18.s, p7/m, z18.s, z15.s\n"
-                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
                         "mul z19.s, p7/m, z19.s, z15.s\n"
-                        "ld1w z24.s, p0/z, [c_ptr2]\n"
+                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
                         "mul z20.s, p7/m, z20.s, z15.s\n"
-                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "ld1w z24.s, p0/z, [c_ptr2]\n"
                         "mul z21.s, p7/m, z21.s, z15.s\n"
-                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
                         "mul z22.s, p7/m, z22.s, z15.s\n"
-                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
                         "mul z23.s, p7/m, z23.s, z15.s\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
                         "mul z24.s, p7/m, z24.s, z15.s\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                         "mul z25.s, p7/m, z25.s, z15.s\n"
-                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                         "mul z26.s, p7/m, z26.s, z15.s\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
                         "mul z27.s, p7/m, z27.s, z15.s\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "add a_ptr1, a_ptr1, #0x10\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
                         "zip2 z11.b, z8.b, z9.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z10.b, z10.b, z8.b\n"
-                        "2:\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "cbz %[loops], 3f\n"
-                        "4:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
                         "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "udot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "udot z24.s, z8.b, z2.b[0]\n"
-                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
                         "ld1rqb z6.b, p7/z, [a_ptr2]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z24.s, z8.b, z2.b[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
                         "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z25.s, z9.b, z2.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "add %[a_ptr0], %[a_ptr0], #0x20\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "add a_ptr1, a_ptr1, #0x20\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
                         "udot z26.s, z10.b, z2.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z11.b, z0.b[0]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "udot z23.s, z11.b, z1.b[0]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
                         "udot z27.s, z11.b, z2.b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -2157,15 +2178,15 @@
                         "udot z21.s, z13.b, z1.b[1]\n"
                         "udot z25.s, z13.b, z2.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
                         "udot z26.s, z14.b, z2.b[1]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z15.b, z0.b[1]\n"
@@ -2185,15 +2206,15 @@
                         "udot z21.s, z9.b, z1.b[2]\n"
                         "udot z25.s, z9.b, z2.b[2]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
                         "udot z26.s, z10.b, z2.b[2]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z11.b, z0.b[2]\n"
@@ -2213,15 +2234,15 @@
                         "udot z21.s, z13.b, z1.b[3]\n"
                         "udot z25.s, z13.b, z2.b[3]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
                         "udot z26.s, z14.b, z2.b[3]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z15.b, z0.b[3]\n"
@@ -2244,15 +2265,15 @@
                         "udot z21.s, z9.b, z5.b[0]\n"
                         "udot z25.s, z9.b, z6.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z4.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
                         "udot z26.s, z10.b, z6.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z11.b, z4.b[0]\n"
@@ -2272,15 +2293,15 @@
                         "udot z21.s, z13.b, z5.b[1]\n"
                         "udot z25.s, z13.b, z6.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z4.b[1]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z14.b, z5.b[1]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z22.s, z14.b, z5.b[1]\n"
                         "udot z26.s, z14.b, z6.b[1]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z15.b, z4.b[1]\n"
@@ -2300,15 +2321,15 @@
                         "udot z21.s, z9.b, z5.b[2]\n"
                         "udot z25.s, z9.b, z6.b[2]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z4.b[2]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z10.b, z5.b[2]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z22.s, z10.b, z5.b[2]\n"
                         "udot z26.s, z10.b, z6.b[2]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z11.b, z4.b[2]\n"
@@ -2328,23 +2349,23 @@
                         "udot z21.s, z13.b, z5.b[3]\n"
                         "udot z25.s, z13.b, z6.b[3]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z4.b[3]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
                         "udot z22.s, z14.b, z5.b[3]\n"
                         "udot z26.s, z14.b, z6.b[3]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z15.b, z4.b[3]\n"
                         "udot z23.s, z15.b, z5.b[3]\n"
                         "udot z27.s, z15.b, z6.b[3]\n"
-                        "b.ne 4b\n"
-                        "3:\n"
+                        "b.ne 3b\n"
+                        "2:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
-                        "cbz %[regs], 5f\n"
+                        "cbz %[regs], 4f\n"
                         "udot z16.s, z8.b, z0.b[0]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "udot z20.s, z8.b, z1.b[0]\n"
@@ -2353,24 +2374,24 @@
                         "ld1rqb z5.b, p7/z, [a_ptr1]\n"
                         "udot z17.s, z9.b, z0.b[0]\n"
                         "ld1rqb z6.b, p7/z, [a_ptr2]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z25.s, z9.b, z2.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "udot z26.s, z10.b, z2.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z11.b, z0.b[0]\n"
@@ -2390,15 +2411,15 @@
                         "udot z21.s, z13.b, z1.b[1]\n"
                         "udot z25.s, z13.b, z2.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
                         "udot z26.s, z14.b, z2.b[1]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z15.b, z0.b[1]\n"
@@ -2418,15 +2439,15 @@
                         "udot z21.s, z9.b, z1.b[2]\n"
                         "udot z25.s, z9.b, z2.b[2]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
                         "udot z26.s, z10.b, z2.b[2]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z11.b, z0.b[2]\n"
@@ -2446,15 +2467,15 @@
                         "udot z21.s, z13.b, z1.b[3]\n"
                         "udot z25.s, z13.b, z2.b[3]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
                         "udot z26.s, z14.b, z2.b[3]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z15.b, z0.b[3]\n"
@@ -2477,15 +2498,15 @@
                         "udot z21.s, z9.b, z5.b[0]\n"
                         "udot z25.s, z9.b, z6.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z4.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
                         "udot z26.s, z10.b, z6.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z11.b, z4.b[0]\n"
@@ -2505,12 +2526,12 @@
                         "udot z21.s, z13.b, z5.b[1]\n"
                         "udot z25.s, z13.b, z6.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z4.b[1]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
                         "udot z22.s, z14.b, z5.b[1]\n"
                         "udot z26.s, z14.b, z6.b[1]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -2523,17 +2544,17 @@
                         "udot z16.s, z8.b, z4.b[2]\n"
                         "udot z20.s, z8.b, z5.b[2]\n"
                         "udot z24.s, z8.b, z6.b[2]\n"
-                        "udot z17.s, z9.b, z4.b[2]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
                         "zip1 z14.b, z14.b, z12.b\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
                         "udot z21.s, z9.b, z5.b[2]\n"
                         "udot z25.s, z9.b, z6.b[2]\n"
-                        "udot z18.s, z10.b, z4.b[2]\n"
-                        "udot z22.s, z10.b, z5.b[2]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "udot z22.s, z10.b, z5.b[2]\n"
                         "udot z26.s, z10.b, z6.b[2]\n"
                         "udot z19.s, z11.b, z4.b[2]\n"
                         "udot z23.s, z11.b, z5.b[2]\n"
@@ -2550,15 +2571,15 @@
                         "udot z19.s, z15.b, z4.b[3]\n"
                         "udot z23.s, z15.b, z5.b[3]\n"
                         "udot z27.s, z15.b, z6.b[3]\n"
-                        "cbz %[blocks], 6f\n"
+                        "cbz %[blocks], 5f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -2580,15 +2601,15 @@
                         "udot z19.s, z11.b, z0.b[0]\n"
                         "udot z23.s, z11.b, z1.b[0]\n"
                         "udot z27.s, z11.b, z2.b[0]\n"
-                        "b.eq 7f\n"
+                        "b.eq 6f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -2610,13 +2631,13 @@
                         "udot z19.s, z15.b, z0.b[1]\n"
                         "udot z23.s, z15.b, z1.b[1]\n"
                         "udot z27.s, z15.b, z2.b[1]\n"
-                        "b.eq 8f\n"
+                        "b.eq 7f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
@@ -2639,31 +2660,31 @@
                         "udot z19.s, z11.b, z0.b[2]\n"
                         "udot z23.s, z11.b, z1.b[2]\n"
                         "udot z27.s, z11.b, z2.b[2]\n"
-                        "cbz %[odds], 9f\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 9f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 10f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 11f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 12f\n"
-                        "11:\n"
+                        "b 11f\n"
+                        "10:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 12f\n"
-                        "10:\n"
+                        "b 11f\n"
+                        "9:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "12:\n"
+                        "11:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -2685,33 +2706,33 @@
                         "udot z19.s, z15.b, z0.b[3]\n"
                         "udot z23.s, z15.b, z1.b[3]\n"
                         "udot z27.s, z15.b, z2.b[3]\n"
-                        "b 9f\n"
-                        "8:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "7:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 12f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 13f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 14f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 15f\n"
-                        "14:\n"
+                        "b 14f\n"
+                        "13:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 15f\n"
-                        "13:\n"
+                        "b 14f\n"
+                        "12:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "15:\n"
+                        "14:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -2733,33 +2754,33 @@
                         "udot z19.s, z11.b, z0.b[2]\n"
                         "udot z23.s, z11.b, z1.b[2]\n"
                         "udot z27.s, z11.b, z2.b[2]\n"
-                        "b 9f\n"
-                        "7:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "6:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 15f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 16f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 17f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 18f\n"
-                        "17:\n"
+                        "b 17f\n"
+                        "16:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 18f\n"
-                        "16:\n"
+                        "b 17f\n"
+                        "15:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "18:\n"
+                        "17:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -2781,33 +2802,33 @@
                         "udot z19.s, z15.b, z0.b[1]\n"
                         "udot z23.s, z15.b, z1.b[1]\n"
                         "udot z27.s, z15.b, z2.b[1]\n"
-                        "b 9f\n"
-                        "6:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "5:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 18f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 19f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 20f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 21f\n"
-                        "20:\n"
+                        "b 20f\n"
+                        "19:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 21f\n"
-                        "19:\n"
+                        "b 20f\n"
+                        "18:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "21:\n"
+                        "20:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -2829,8 +2850,8 @@
                         "udot z19.s, z11.b, z0.b[0]\n"
                         "udot z23.s, z11.b, z1.b[0]\n"
                         "udot z27.s, z11.b, z2.b[0]\n"
-                        "b 9f\n"
-                        "5:\n"
+                        "b 8f\n"
+                        "4:\n"
                         "udot z16.s, z8.b, z0.b[0]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "udot z20.s, z8.b, z1.b[0]\n"
@@ -2839,24 +2860,24 @@
                         "ld1rqb z5.b, p6/z, [a_ptr1]\n"
                         "udot z17.s, z9.b, z0.b[0]\n"
                         "ld1rqb z6.b, p6/z, [a_ptr2]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z25.s, z9.b, z2.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "udot z26.s, z10.b, z2.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z11.b, z0.b[0]\n"
@@ -2876,12 +2897,12 @@
                         "udot z21.s, z13.b, z1.b[1]\n"
                         "udot z25.s, z13.b, z2.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
                         "udot z22.s, z14.b, z1.b[1]\n"
                         "udot z26.s, z14.b, z2.b[1]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -2894,17 +2915,17 @@
                         "udot z16.s, z8.b, z0.b[2]\n"
                         "udot z20.s, z8.b, z1.b[2]\n"
                         "udot z24.s, z8.b, z2.b[2]\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
                         "zip1 z14.b, z14.b, z12.b\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
                         "udot z21.s, z9.b, z1.b[2]\n"
                         "udot z25.s, z9.b, z2.b[2]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "udot z22.s, z10.b, z1.b[2]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
                         "udot z26.s, z10.b, z2.b[2]\n"
                         "udot z19.s, z11.b, z0.b[2]\n"
                         "udot z23.s, z11.b, z1.b[2]\n"
@@ -2921,15 +2942,15 @@
                         "udot z19.s, z15.b, z0.b[3]\n"
                         "udot z23.s, z15.b, z1.b[3]\n"
                         "udot z27.s, z15.b, z2.b[3]\n"
-                        "cbz %[blocks], 22f\n"
+                        "cbz %[blocks], 21f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -2951,15 +2972,15 @@
                         "udot z19.s, z11.b, z4.b[0]\n"
                         "udot z23.s, z11.b, z5.b[0]\n"
                         "udot z27.s, z11.b, z6.b[0]\n"
-                        "b.eq 23f\n"
+                        "b.eq 22f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -2981,13 +3002,13 @@
                         "udot z19.s, z15.b, z4.b[1]\n"
                         "udot z23.s, z15.b, z5.b[1]\n"
                         "udot z27.s, z15.b, z6.b[1]\n"
-                        "b.eq 24f\n"
+                        "b.eq 23f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
@@ -3010,31 +3031,31 @@
                         "udot z19.s, z11.b, z4.b[2]\n"
                         "udot z23.s, z11.b, z5.b[2]\n"
                         "udot z27.s, z11.b, z6.b[2]\n"
-                        "cbz %[odds], 9f\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 24f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 25f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 26f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 27f\n"
-                        "26:\n"
+                        "b 26f\n"
+                        "25:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 27f\n"
-                        "25:\n"
+                        "b 26f\n"
+                        "24:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "27:\n"
+                        "26:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -3056,33 +3077,33 @@
                         "udot z19.s, z15.b, z4.b[3]\n"
                         "udot z23.s, z15.b, z5.b[3]\n"
                         "udot z27.s, z15.b, z6.b[3]\n"
-                        "b 9f\n"
-                        "24:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "23:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 27f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 28f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 29f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 30f\n"
-                        "29:\n"
+                        "b 29f\n"
+                        "28:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 30f\n"
-                        "28:\n"
+                        "b 29f\n"
+                        "27:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "30:\n"
+                        "29:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -3104,33 +3125,33 @@
                         "udot z19.s, z11.b, z4.b[2]\n"
                         "udot z23.s, z11.b, z5.b[2]\n"
                         "udot z27.s, z11.b, z6.b[2]\n"
-                        "b 9f\n"
-                        "23:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "22:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 30f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 31f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 32f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 33f\n"
-                        "32:\n"
+                        "b 32f\n"
+                        "31:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 33f\n"
-                        "31:\n"
+                        "b 32f\n"
+                        "30:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "33:\n"
+                        "32:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -3152,33 +3173,33 @@
                         "udot z19.s, z15.b, z4.b[1]\n"
                         "udot z23.s, z15.b, z5.b[1]\n"
                         "udot z27.s, z15.b, z6.b[1]\n"
-                        "b 9f\n"
-                        "22:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "21:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 33f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 34f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 35f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 36f\n"
-                        "35:\n"
+                        "b 35f\n"
+                        "34:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 36f\n"
-                        "34:\n"
+                        "b 35f\n"
+                        "33:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "36:\n"
+                        "35:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -3200,7 +3221,7 @@
                         "udot z19.s, z11.b, z4.b[0]\n"
                         "udot z23.s, z11.b, z5.b[0]\n"
                         "udot z27.s, z11.b, z6.b[0]\n"
-                        "9:\n"
+                        "8:\n"
                         "st1w z16.s, p0, [%[c_ptr0]]\n"
                         "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
                         "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
@@ -3219,7 +3240,7 @@
                         ".unreq c_ptr1\n"
                         ".unreq c_ptr2\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
                     );
                     break;
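
The wholesale renumbering above is forced by how GNU numeric local labels resolve. A minimal sketch, assuming an AArch64 GCC or Clang toolchain (the loop is illustrative, not taken from the kernel): a numeric label such as "1:" may be defined many times in one assembly stream, "b.ne 1b" binds to the nearest "1:" searching backwards, and "cbz xN, 2f" binds to the nearest "2:" searching forwards, so freeing or adding a single label forces every matching forward/backward reference in the same asm block to be renamed with it.

#include <cstdint>

// Minimal sketch, assuming an AArch64 GCC/Clang toolchain; the loop is
// illustrative and not taken from the kernel above.
uint64_t sum_bytes(const uint8_t *p, uint64_t n)
{
    uint64_t acc = 0;
    __asm__ volatile(
        "cbz %[n], 2f\n"            // n == 0: branch forwards to the nearest "2:"
        "1:\n"                      // numeric local label; may be redefined later
        "ldrb w3, [%[p]], #1\n"     // load one byte, post-increment the pointer
        "subs %[n], %[n], #1\n"     // decrement the trip count, setting flags
        "add  %[acc], %[acc], x3\n" // accumulate (ldrb zero-extends into x3)
        "b.ne 1b\n"                 // branch backwards to the nearest "1:"
        "2:\n"                      // fall-through exit
        : [p] "+r"(p), [n] "+r"(n), [acc] "+r"(acc)
        :
        : "x3", "cc", "memory");
    return acc;
}

Because the references are positional rather than symbolic, a one-label change at the top of a multi-thousand-line asm block cascades into exactly the kind of lockstep "5..36 becomes 4..35" renaming seen in these hunks.
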
@@ -3234,15 +3255,15 @@
                         "c_ptr3 .req X5\n"
                         "add a_ptr1, %[a_ptr0], %[lda]\n"
                         "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "add a_ptr3, a_ptr2, %[lda]\n"
+                        "add c_ptr3, c_ptr2, %[ldc]\n"
                         "whilelt p6.b, %[temp], %[leftovers]\n"
                         "whilelt p0.s, %[temp], %[width]\n"
                         "whilelt p4.b, %[temp], %[width]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
                         "incw %[temp], all, mul #1\n"
                         "ptrue p7.b\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
                         "whilelt p1.s, %[temp], %[width]\n"
                         "incw %[temp], all, mul #1\n"
                         "whilelt p2.s, %[temp], %[width]\n"
@@ -3265,77 +3286,80 @@
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "mov z23.s, #0\n"
                         "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "mov z24.s, #0\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
                         "zip2 z11.b, z8.b, z9.b\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "mov z24.s, #0\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
                         "mov z25.s, #0\n"
                         "add a_ptr3, a_ptr3, #0x10\n"
-                        "mov z26.s, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip1 z10.b, z10.b, z8.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "mov z27.s, #0\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "mov z28.s, #0\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "mov z26.s, #0\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "mov z27.s, #0\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "mov z28.s, #0\n"
                         "mov z29.s, #0\n"
                         "mov z30.s, #0\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
                         "mov z31.s, #0\n"
-                        "b 2f\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
                         "1:\n"
                         "ld1rw z15.s, p7/z, [%[betaptr]]\n"
                         "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
                         "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
                         "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
                         "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1w z20.s, p0/z, [c_ptr1]\n"
                         "mul z16.s, p7/m, z16.s, z15.s\n"
-                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "ld1w z20.s, p0/z, [c_ptr1]\n"
                         "mul z17.s, p7/m, z17.s, z15.s\n"
-                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
                         "mul z18.s, p7/m, z18.s, z15.s\n"
-                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
                         "mul z19.s, p7/m, z19.s, z15.s\n"
-                        "ld1w z24.s, p0/z, [c_ptr2]\n"
+                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
                         "mul z20.s, p7/m, z20.s, z15.s\n"
-                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "ld1w z24.s, p0/z, [c_ptr2]\n"
                         "mul z21.s, p7/m, z21.s, z15.s\n"
-                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
                         "mul z22.s, p7/m, z22.s, z15.s\n"
-                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
                         "mul z23.s, p7/m, z23.s, z15.s\n"
-                        "ld1w z28.s, p0/z, [c_ptr3]\n"
+                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
                         "mul z24.s, p7/m, z24.s, z15.s\n"
-                        "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+                        "ld1w z28.s, p0/z, [c_ptr3]\n"
                         "mul z25.s, p7/m, z25.s, z15.s\n"
-                        "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
+                        "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
                         "mul z26.s, p7/m, z26.s, z15.s\n"
-                        "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
+                        "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
                         "mul z27.s, p7/m, z27.s, z15.s\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
                         "mul z28.s, p7/m, z28.s, z15.s\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                         "mul z29.s, p7/m, z29.s, z15.s\n"
-                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                         "mul z30.s, p7/m, z30.s, z15.s\n"
-                        "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
                         "mul z31.s, p7/m, z31.s, z15.s\n"
+                        "ld1rqb z3.b, p7/z, [a_ptr3]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "add a_ptr1, a_ptr1, #0x10\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "add a_ptr2, a_ptr2, #0x10\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "add a_ptr3, a_ptr3, #0x10\n"
@@ -3344,21 +3368,20 @@
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "2:\n"
-                        "cbz %[loops], 3f\n"
-                        "4:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
                         "udot z16.s, z8.b, z0.b[0]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "udot z20.s, z8.b, z1.b[0]\n"
@@ -3367,38 +3390,38 @@
                         "ld1rqb z5.b, p7/z, [a_ptr1]\n"
                         "udot z28.s, z8.b, z3.b[0]\n"
                         "ld1rqb z6.b, p7/z, [a_ptr2]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z7.b, p7/z, [a_ptr3]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1rqb z7.b, p7/z, [a_ptr3]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z25.s, z9.b, z2.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z14.b, z15.b, z8.b\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
                         "udot z29.s, z9.b, z3.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
                         "udot z26.s, z10.b, z2.b[0]\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
                         "udot z30.s, z10.b, z3.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z11.b, z0.b[0]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "udot z23.s, z11.b, z1.b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
+                        "add a_ptr3, a_ptr3, #0x20\n"
                         "udot z27.s, z11.b, z2.b[0]\n"
                         "udot z31.s, z11.b, z3.b[0]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
@@ -3414,17 +3437,17 @@
                         "udot z17.s, z13.b, z0.b[1]\n"
                         "udot z21.s, z13.b, z1.b[1]\n"
                         "udot z25.s, z13.b, z2.b[1]\n"
-                        "udot z29.s, z13.b, z3.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z29.s, z13.b, z3.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z14.b, z0.b[1]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z22.s, z14.b, z1.b[1]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "udot z26.s, z14.b, z2.b[1]\n"
                         "udot z30.s, z14.b, z3.b[1]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3446,17 +3469,17 @@
                         "udot z17.s, z9.b, z0.b[2]\n"
                         "udot z21.s, z9.b, z1.b[2]\n"
                         "udot z25.s, z9.b, z2.b[2]\n"
-                        "udot z29.s, z9.b, z3.b[2]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z29.s, z9.b, z3.b[2]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z10.b, z0.b[2]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z22.s, z10.b, z1.b[2]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "udot z26.s, z10.b, z2.b[2]\n"
                         "udot z30.s, z10.b, z3.b[2]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3478,17 +3501,17 @@
                         "udot z17.s, z13.b, z0.b[3]\n"
                         "udot z21.s, z13.b, z1.b[3]\n"
                         "udot z25.s, z13.b, z2.b[3]\n"
-                        "udot z29.s, z13.b, z3.b[3]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z29.s, z13.b, z3.b[3]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z14.b, z0.b[3]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z22.s, z14.b, z1.b[3]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "udot z26.s, z14.b, z2.b[3]\n"
                         "udot z30.s, z14.b, z3.b[3]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3514,17 +3537,17 @@
                         "udot z17.s, z9.b, z4.b[0]\n"
                         "udot z21.s, z9.b, z5.b[0]\n"
                         "udot z25.s, z9.b, z6.b[0]\n"
-                        "udot z29.s, z9.b, z7.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z29.s, z9.b, z7.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z10.b, z4.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z22.s, z10.b, z5.b[0]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "udot z26.s, z10.b, z6.b[0]\n"
                         "udot z30.s, z10.b, z7.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3546,17 +3569,17 @@
                         "udot z17.s, z13.b, z4.b[1]\n"
                         "udot z21.s, z13.b, z5.b[1]\n"
                         "udot z25.s, z13.b, z6.b[1]\n"
-                        "udot z29.s, z13.b, z7.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z29.s, z13.b, z7.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z14.b, z4.b[1]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z22.s, z14.b, z5.b[1]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "udot z26.s, z14.b, z6.b[1]\n"
                         "udot z30.s, z14.b, z7.b[1]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3578,17 +3601,17 @@
                         "udot z17.s, z9.b, z4.b[2]\n"
                         "udot z21.s, z9.b, z5.b[2]\n"
                         "udot z25.s, z9.b, z6.b[2]\n"
-                        "udot z29.s, z9.b, z7.b[2]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z29.s, z9.b, z7.b[2]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z10.b, z4.b[2]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z22.s, z10.b, z5.b[2]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "udot z26.s, z10.b, z6.b[2]\n"
                         "udot z30.s, z10.b, z7.b[2]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3610,13 +3633,13 @@
                         "udot z17.s, z13.b, z4.b[3]\n"
                         "udot z21.s, z13.b, z5.b[3]\n"
                         "udot z25.s, z13.b, z6.b[3]\n"
-                        "udot z29.s, z13.b, z7.b[3]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z29.s, z13.b, z7.b[3]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z14.b, z4.b[3]\n"
                         "udot z22.s, z14.b, z5.b[3]\n"
                         "udot z26.s, z14.b, z6.b[3]\n"
@@ -3626,11 +3649,11 @@
                         "udot z23.s, z15.b, z5.b[3]\n"
                         "udot z27.s, z15.b, z6.b[3]\n"
                         "udot z31.s, z15.b, z7.b[3]\n"
-                        "b.ne 4b\n"
-                        "3:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
-                        "cbz %[regs], 5f\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
                         "udot z16.s, z8.b, z0.b[0]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "udot z20.s, z8.b, z1.b[0]\n"
@@ -3639,27 +3662,27 @@
                         "ld1rqb z5.b, p7/z, [a_ptr1]\n"
                         "udot z28.s, z8.b, z3.b[0]\n"
                         "ld1rqb z6.b, p7/z, [a_ptr2]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z7.b, p7/z, [a_ptr3]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1rqb z7.b, p7/z, [a_ptr3]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z25.s, z9.b, z2.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
                         "udot z29.s, z9.b, z3.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
                         "udot z26.s, z10.b, z2.b[0]\n"
                         "udot z30.s, z10.b, z3.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3681,17 +3704,17 @@
                         "udot z17.s, z13.b, z0.b[1]\n"
                         "udot z21.s, z13.b, z1.b[1]\n"
                         "udot z25.s, z13.b, z2.b[1]\n"
-                        "udot z29.s, z13.b, z3.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z29.s, z13.b, z3.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z14.b, z0.b[1]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z22.s, z14.b, z1.b[1]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "udot z26.s, z14.b, z2.b[1]\n"
                         "udot z30.s, z14.b, z3.b[1]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3713,17 +3736,17 @@
                         "udot z17.s, z9.b, z0.b[2]\n"
                         "udot z21.s, z9.b, z1.b[2]\n"
                         "udot z25.s, z9.b, z2.b[2]\n"
-                        "udot z29.s, z9.b, z3.b[2]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z29.s, z9.b, z3.b[2]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z10.b, z0.b[2]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z22.s, z10.b, z1.b[2]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "udot z26.s, z10.b, z2.b[2]\n"
                         "udot z30.s, z10.b, z3.b[2]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3745,17 +3768,17 @@
                         "udot z17.s, z13.b, z0.b[3]\n"
                         "udot z21.s, z13.b, z1.b[3]\n"
                         "udot z25.s, z13.b, z2.b[3]\n"
-                        "udot z29.s, z13.b, z3.b[3]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z29.s, z13.b, z3.b[3]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z14.b, z0.b[3]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z22.s, z14.b, z1.b[3]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "udot z26.s, z14.b, z2.b[3]\n"
                         "udot z30.s, z14.b, z3.b[3]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3781,17 +3804,17 @@
                         "udot z17.s, z9.b, z4.b[0]\n"
                         "udot z21.s, z9.b, z5.b[0]\n"
                         "udot z25.s, z9.b, z6.b[0]\n"
-                        "udot z29.s, z9.b, z7.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z29.s, z9.b, z7.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z10.b, z4.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z22.s, z10.b, z5.b[0]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "udot z26.s, z10.b, z6.b[0]\n"
                         "udot z30.s, z10.b, z7.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3813,13 +3836,13 @@
                         "udot z17.s, z13.b, z4.b[1]\n"
                         "udot z21.s, z13.b, z5.b[1]\n"
                         "udot z25.s, z13.b, z6.b[1]\n"
-                        "udot z29.s, z13.b, z7.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z29.s, z13.b, z7.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z14.b, z4.b[1]\n"
                         "udot z22.s, z14.b, z5.b[1]\n"
                         "udot z26.s, z14.b, z6.b[1]\n"
@@ -3841,11 +3864,11 @@
                         "udot z17.s, z9.b, z4.b[2]\n"
                         "udot z21.s, z9.b, z5.b[2]\n"
                         "udot z25.s, z9.b, z6.b[2]\n"
-                        "udot z29.s, z9.b, z7.b[2]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z29.s, z9.b, z7.b[2]\n"
                         "udot z18.s, z10.b, z4.b[2]\n"
                         "udot z22.s, z10.b, z5.b[2]\n"
                         "udot z26.s, z10.b, z6.b[2]\n"
@@ -3870,15 +3893,15 @@
                         "udot z23.s, z15.b, z5.b[3]\n"
                         "udot z27.s, z15.b, z6.b[3]\n"
                         "udot z31.s, z15.b, z7.b[3]\n"
-                        "cbz %[blocks], 6f\n"
+                        "cbz %[blocks], 5f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -3904,15 +3927,15 @@
                         "udot z23.s, z11.b, z1.b[0]\n"
                         "udot z27.s, z11.b, z2.b[0]\n"
                         "udot z31.s, z11.b, z3.b[0]\n"
-                        "b.eq 7f\n"
+                        "b.eq 6f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -3938,13 +3961,13 @@
                         "udot z23.s, z15.b, z1.b[1]\n"
                         "udot z27.s, z15.b, z2.b[1]\n"
                         "udot z31.s, z15.b, z3.b[1]\n"
-                        "b.eq 8f\n"
+                        "b.eq 7f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
@@ -3971,31 +3994,31 @@
                         "udot z23.s, z11.b, z1.b[2]\n"
                         "udot z27.s, z11.b, z2.b[2]\n"
                         "udot z31.s, z11.b, z3.b[2]\n"
-                        "cbz %[odds], 9f\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 9f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 10f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 11f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 12f\n"
-                        "11:\n"
+                        "b 11f\n"
+                        "10:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 12f\n"
-                        "10:\n"
+                        "b 11f\n"
+                        "9:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "12:\n"
+                        "11:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -4021,33 +4044,33 @@
                         "udot z23.s, z15.b, z1.b[3]\n"
                         "udot z27.s, z15.b, z2.b[3]\n"
                         "udot z31.s, z15.b, z3.b[3]\n"
-                        "b 9f\n"
-                        "8:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "7:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 12f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 13f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 14f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 15f\n"
-                        "14:\n"
+                        "b 14f\n"
+                        "13:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 15f\n"
-                        "13:\n"
+                        "b 14f\n"
+                        "12:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "15:\n"
+                        "14:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -4073,33 +4096,33 @@
                         "udot z23.s, z11.b, z1.b[2]\n"
                         "udot z27.s, z11.b, z2.b[2]\n"
                         "udot z31.s, z11.b, z3.b[2]\n"
-                        "b 9f\n"
-                        "7:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "6:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 15f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 16f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 17f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 18f\n"
-                        "17:\n"
+                        "b 17f\n"
+                        "16:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 18f\n"
-                        "16:\n"
+                        "b 17f\n"
+                        "15:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "18:\n"
+                        "17:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -4125,33 +4148,33 @@
                         "udot z23.s, z15.b, z1.b[1]\n"
                         "udot z27.s, z15.b, z2.b[1]\n"
                         "udot z31.s, z15.b, z3.b[1]\n"
-                        "b 9f\n"
-                        "6:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "5:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 18f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 19f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 20f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 21f\n"
-                        "20:\n"
+                        "b 20f\n"
+                        "19:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 21f\n"
-                        "19:\n"
+                        "b 20f\n"
+                        "18:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "21:\n"
+                        "20:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -4177,8 +4200,8 @@
                         "udot z23.s, z11.b, z1.b[0]\n"
                         "udot z27.s, z11.b, z2.b[0]\n"
                         "udot z31.s, z11.b, z3.b[0]\n"
-                        "b 9f\n"
-                        "5:\n"
+                        "b 8f\n"
+                        "4:\n"
                         "udot z16.s, z8.b, z0.b[0]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "udot z20.s, z8.b, z1.b[0]\n"
@@ -4187,27 +4210,27 @@
                         "ld1rqb z5.b, p6/z, [a_ptr1]\n"
                         "udot z28.s, z8.b, z3.b[0]\n"
                         "ld1rqb z6.b, p6/z, [a_ptr2]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z7.b, p6/z, [a_ptr3]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1rqb z7.b, p6/z, [a_ptr3]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z25.s, z9.b, z2.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
                         "udot z29.s, z9.b, z3.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
                         "udot z26.s, z10.b, z2.b[0]\n"
                         "udot z30.s, z10.b, z3.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -4229,13 +4252,13 @@
                         "udot z17.s, z13.b, z0.b[1]\n"
                         "udot z21.s, z13.b, z1.b[1]\n"
                         "udot z25.s, z13.b, z2.b[1]\n"
-                        "udot z29.s, z13.b, z3.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z29.s, z13.b, z3.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z14.b, z0.b[1]\n"
                         "udot z22.s, z14.b, z1.b[1]\n"
                         "udot z26.s, z14.b, z2.b[1]\n"
@@ -4257,11 +4280,11 @@
                         "udot z17.s, z9.b, z0.b[2]\n"
                         "udot z21.s, z9.b, z1.b[2]\n"
                         "udot z25.s, z9.b, z2.b[2]\n"
-                        "udot z29.s, z9.b, z3.b[2]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z29.s, z9.b, z3.b[2]\n"
                         "udot z18.s, z10.b, z0.b[2]\n"
                         "udot z22.s, z10.b, z1.b[2]\n"
                         "udot z26.s, z10.b, z2.b[2]\n"
@@ -4286,15 +4309,15 @@
                         "udot z23.s, z15.b, z1.b[3]\n"
                         "udot z27.s, z15.b, z2.b[3]\n"
                         "udot z31.s, z15.b, z3.b[3]\n"
-                        "cbz %[blocks], 22f\n"
+                        "cbz %[blocks], 21f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -4320,15 +4343,15 @@
                         "udot z23.s, z11.b, z5.b[0]\n"
                         "udot z27.s, z11.b, z6.b[0]\n"
                         "udot z31.s, z11.b, z7.b[0]\n"
-                        "b.eq 23f\n"
+                        "b.eq 22f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -4354,13 +4377,13 @@
                         "udot z23.s, z15.b, z5.b[1]\n"
                         "udot z27.s, z15.b, z6.b[1]\n"
                         "udot z31.s, z15.b, z7.b[1]\n"
-                        "b.eq 24f\n"
+                        "b.eq 23f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
@@ -4387,31 +4410,31 @@
                         "udot z23.s, z11.b, z5.b[2]\n"
                         "udot z27.s, z11.b, z6.b[2]\n"
                         "udot z31.s, z11.b, z7.b[2]\n"
-                        "cbz %[odds], 9f\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 24f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 25f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 26f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 27f\n"
-                        "26:\n"
+                        "b 26f\n"
+                        "25:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 27f\n"
-                        "25:\n"
+                        "b 26f\n"
+                        "24:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "27:\n"
+                        "26:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -4437,33 +4460,33 @@
                         "udot z23.s, z15.b, z5.b[3]\n"
                         "udot z27.s, z15.b, z6.b[3]\n"
                         "udot z31.s, z15.b, z7.b[3]\n"
-                        "b 9f\n"
-                        "24:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "23:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 27f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 28f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 29f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 30f\n"
-                        "29:\n"
+                        "b 29f\n"
+                        "28:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 30f\n"
-                        "28:\n"
+                        "b 29f\n"
+                        "27:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "30:\n"
+                        "29:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -4489,33 +4512,33 @@
                         "udot z23.s, z11.b, z5.b[2]\n"
                         "udot z27.s, z11.b, z6.b[2]\n"
                         "udot z31.s, z11.b, z7.b[2]\n"
-                        "b 9f\n"
-                        "23:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "22:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 30f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 31f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 32f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 33f\n"
-                        "32:\n"
+                        "b 32f\n"
+                        "31:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 33f\n"
-                        "31:\n"
+                        "b 32f\n"
+                        "30:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "33:\n"
+                        "32:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -4541,33 +4564,33 @@
                         "udot z23.s, z15.b, z5.b[1]\n"
                         "udot z27.s, z15.b, z6.b[1]\n"
                         "udot z31.s, z15.b, z7.b[1]\n"
-                        "b 9f\n"
-                        "22:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "21:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 33f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 34f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 35f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 36f\n"
-                        "35:\n"
+                        "b 35f\n"
+                        "34:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 36f\n"
-                        "34:\n"
+                        "b 35f\n"
+                        "33:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "36:\n"
+                        "35:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -4593,7 +4616,7 @@
                         "udot z23.s, z11.b, z5.b[0]\n"
                         "udot z27.s, z11.b, z6.b[0]\n"
                         "udot z31.s, z11.b, z7.b[0]\n"
-                        "9:\n"
+                        "8:\n"
                         "st1w z16.s, p0, [%[c_ptr0]]\n"
                         "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
                         "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
@@ -4618,7 +4641,7 @@
                         ".unreq c_ptr2\n"
                         ".unreq c_ptr3\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
                     );
                     break;
diff --git a/src/core/NEON/kernels/convolution/common/padding.cpp b/src/core/NEON/kernels/convolution/common/padding.cpp
new file mode 100644
index 0000000..b50067b
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/common/padding.cpp
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <cstring>
+#include <cstdint>
+
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/common/padding.hpp"
+
+namespace padding
+{
+
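+// Copies a tile_rows x tile_cols tile of n_channels-deep elements from inptr
+// to outptr, writing pad_value into every output position that falls inside
+// the requested top/left/bottom/right padding border.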
+template <typename T>
+void copy_and_pad_tile(
+  const unsigned int tile_rows,
+  const unsigned int tile_cols,
+  const unsigned int n_channels,
+  const T* const inptr,
+  const unsigned int in_row_stride,
+  const unsigned int in_col_stride,
+  T* const outptr,
+  const unsigned int out_row_stride,
+  const unsigned int out_col_stride,
+  const unsigned int pad_top,
+  const unsigned int pad_left,
+  const unsigned int pad_bottom,
+  const unsigned int pad_right,
+  const T pad_value
+)
+{
+  for (unsigned int out_i = 0; out_i < tile_rows; out_i++)
+  {
+    for (unsigned int out_j = 0; out_j < tile_cols; out_j++)
+    {
+      T* const output = outptr + out_i*out_row_stride + out_j*out_col_stride;
+
+      if (out_i < pad_top || tile_rows - pad_bottom <= out_i ||
+          out_j < pad_left || tile_cols - pad_right <= out_j)
+      {
+        for (unsigned int n = 0; n < n_channels; n++)
+        {
+          output[n] = pad_value;
+        }
+      }
+      else
+      {
+        const auto in_i = out_i - pad_top, in_j = out_j - pad_left;
+        const T* const input = inptr + in_i*in_row_stride + in_j*in_col_stride;
+        std::memcpy(output, input, n_channels * sizeof(T));
+      }
+    }
+  }
+}
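+
+// Example call (values hypothetical): copying a 4x4 tile whose top row and
+// left column are padding would be
+//   copy_and_pad_tile(4u, 4u, n_channels,
+//                     inptr, in_row_stride, in_col_stride,
+//                     outptr, out_row_stride, out_col_stride,
+//                     1u, 1u, 0u, 0u, 0.0f);
+// so output row 0 and column 0 are filled with 0.0f and the remaining 3x3
+// positions are memcpy'd channel-by-channel from the input.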
+
+template void copy_and_pad_tile(
+  unsigned int, unsigned int, unsigned int,
+  const uint8_t *, unsigned int, unsigned int,
+  uint8_t *, unsigned int, unsigned int,
+  unsigned int, unsigned int, unsigned int, unsigned int, uint8_t
+);
+
+template void copy_and_pad_tile(
+  unsigned int, unsigned int, unsigned int,
+  const float *, unsigned int, unsigned int,
+  float *, unsigned int, unsigned int,
+  unsigned int, unsigned int, unsigned int, unsigned int, float
+);
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template void copy_and_pad_tile(
+  unsigned int, unsigned int, unsigned int,
+  const float16_t *, unsigned int, unsigned int,
+  float16_t *, unsigned int, unsigned int,
+  unsigned int, unsigned int, unsigned int, unsigned int, float16_t
+);
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
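+// Inverse operation to padding: copies only the unpadded interior of a
+// TileRows x TileCols tile, cropping away the padded border rows and columns.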
+template <unsigned int TileRows, unsigned int TileCols>
+void CopyCropped<TileRows, TileCols>::execute(
+  const size_t size,
+  const void * const inptr,
+  const size_t in_row_stride,
+  const size_t in_col_stride,
+  void * const outptr,
+  const size_t out_row_stride,
+  const size_t out_col_stride,
+  const unsigned int pad_top,
+  const unsigned int pad_left,
+  const unsigned int pad_bottom,
+  const unsigned int pad_right
+)
+{
+  for (unsigned int out_i = 0, in_i = pad_top; in_i < TileRows - pad_bottom; out_i++, in_i++)
+  {
+    for (unsigned int out_j = 0, in_j = pad_left; in_j < TileCols - pad_right; out_j++, in_j++)
+    {
+      std::memcpy(
+        static_cast<uint8_t *>(outptr) + out_i*out_row_stride + out_j*out_col_stride,
+        static_cast<const uint8_t *>(inptr) + in_i*in_row_stride + in_j*in_col_stride,
+        size
+      );
+    }
+  }
+}
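+
+// Illustrative behaviour: with TileRows = TileCols = 4, pad_top = pad_left = 1
+// and pad_bottom = pad_right = 0, execute() copies the bottom-right 3x3
+// sub-tile of the input to the top-left corner of the output, discarding the
+// padded row and column instead of writing them out.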
+
+template class CopyCropped<2, 2>;
+template class CopyCropped<3, 3>;
+template class CopyCropped<4, 4>;
+
+}  // namespace padding
diff --git a/src/core/NEON/kernels/convolution/common/qasymm8.cpp b/src/core/NEON/kernels/convolution/common/qasymm8.cpp
new file mode 100644
index 0000000..1de9ebf
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/common/qasymm8.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cmath>
+#include <limits>
+
+#include "arm_compute/core/NEON/kernels/convolution/common/qasymm8.hpp"
+
+namespace qasymm8
+{
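+// Presumably chosen because some Android and bare-metal toolchains ship C
+// libraries without the std:: overloads of these functions; fall back to the
+// global C versions there.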
+#if(__ANDROID__ || BARE_METAL)
+template <typename T> T round(T val) {  return ::round(val); }
+template <typename T> T exp2(T val) { return ::exp2(val); }
+template <typename T> T log2(T val) { return ::log2(val); }
+#else  /* (__ANDROID__ || BARE_METAL) */
+template <typename T> T round(T val) { return std::round(val); }
+template <typename T> T exp2(T val) { return std::exp2(val); }
+template <typename T> T log2(T val) { return std::log2(val); }
+#endif  /* (__ANDROID__ || BARE_METAL) */
+
+uint8_t QAsymm8Params::quantize(const float value) const
+{
+  const float transformed = value / scale + offset;
+  return static_cast<uint8_t>(round(std::max(0.0f, std::min(255.0f, transformed))));
+}
+
+float QAsymm8Params::dequantize(const uint8_t value) const
+{
+  return scale * (static_cast<float>(value) - offset);
+}
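+
+// Worked example (parameter values illustrative only): with scale = 0.5f and
+// offset = 10, quantize(2.0f) evaluates 2.0f / 0.5f + 10 = 14.0f, which is
+// already inside [0, 255], so the result is 14; dequantize(14) then recovers
+// 0.5f * (14 - 10) = 2.0f.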
+
+QAsymm8RescaleParams QAsymm8RescaleParams::make_rescale_params(
+  const QAsymm8Params& weight_quant,
+  const QAsymm8Params& input_quant,
+  const QAsymm8Params& output_quant
+)
+{
+  // Based on the gemmlowp approach: https://github.com/google/gemmlowp/blob/master/doc/quantization_example.cc
+  const float rescale = weight_quant.scale * input_quant.scale / output_quant.scale;
+  const float shiftf = round(log2(0.5f / rescale));
+  const float multf = exp2(31.0f + shiftf)*rescale;
+
+  int64_t shift = static_cast<int64_t>(shiftf);
+  int64_t mult = static_cast<int64_t>(multf);
+
+  if (mult == (1ll << 31))
+  {
+    mult /= 2;
+    shift--;
+  }
+
+  assert(shift >= 0);
+  assert(mult <= std::numeric_limits<int32_t>::max());
+
+  return QAsymm8RescaleParams(
+    static_cast<int32_t>(shift),
+    static_cast<int32_t>(mult),
+    rescale
+  );
+}
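+
+// The pair (multiplier, shift) encodes rescale as a Q0.31 fixed-point multiply
+// followed by a right shift, i.e. x * rescale ~= (x * multiplier) >> (31 + shift).
+// Illustrative numbers: rescale = 0.125f gives shiftf = log2(0.5f / 0.125f) = 2,
+// multf = 2^(31 + 2) * 0.125f = 2^30, so multiplier = 0x40000000 and shift = 2,
+// and (x * 2^30) >> 33 == x / 8 as required.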
+
+QAsymm8RescaleParams::QAsymm8RescaleParams(int32_t shift, int32_t multi, float rescale)
+  : shift(shift), multiplier(multi), rescale(rescale)
+{
+}
+}  // namespace qasymm8
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
index ca1de26..1272754 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,546 +25,1144 @@
 
 namespace depthwise
 {
-using Conv = DepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float>;
-using ConvImpl = DepthwiseConvolutionImpl<2, 2, 3, 3, 1, 1, float, float>;
+
+using namespace neon_convolution_kernels;
+using Conv = DepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float, float>;
 
 #ifdef __aarch64__
-
 template <>
 template <>
-void ConvImpl::process_tile<true, 0, 0, 0, 0, 0, 0>(
-  const int n_channels,
-  const float* const weights,
-  const int weight_row_stride,
-  const int weight_col_stride,
-  const float* const inptr,
-  const int in_row_stride,
-  const int in_col_stride,
-  float* const outptr,
-  const int out_row_stride,
-  const int out_col_stride,
-  const int, const int, const int, const int, const int, const int, const int, const int
+void Conv::execute_tile<ActivationFunction::None>(
+  int n_channels,
+  const void *weight_bias_ptr,
+  const float *input,
+  const unsigned int input_row_stride,
+  const unsigned int input_col_stride,
+  float *output,
+  const unsigned int output_row_stride,
+  const unsigned int output_col_stride
 )
 {
-  // Copy pointers
-  const float *uptr0 = inptr;
-  const float *wptr0 = weights;
-  float *vptr0 = outptr;
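+  // Reader-oriented summary (not part of the generated kernel): x26/x27/x28
+  // point at input rows 1-3 (row 0 stays in %[inptr0]), x21/x22 hold 2x/3x the
+  // input column stride, and x23 is output row 1. x25 counts 4-channel vector
+  // iterations (labels 1-3, q registers); x24 counts leftover channels
+  // processed one at a time (labels 4-7, s registers).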
+  __asm __volatile(
+    "add x26, %[inptr0], %[input_row_stride]\n"
+    "add x21, %[input_col_stride1], %[input_col_stride1]\n"
+    "add x23, %[outptr0], %[output_row_stride]\n"
+    "add x27, x26, %[input_row_stride]\n"
+    "add x22, x21, %[input_col_stride1]\n"
+    "and x24, %[n_channels], #3\n"
+    "add x28, x27, %[input_row_stride]\n"
+    "lsr x25, %[n_channels], #2\n"
+    "cbz x25, 4f\n"
+    "1:\n"
+    "ldr q15, [%[wbptr]]\n"
+    "subs x25, x25, #1\n"
+    "mov v3.16b, v15.16b\n"
+    "ldr q14, [%[wbptr], #16]\n"
+    "mov v1.16b, v15.16b\n"
+    "ldr q13, [%[wbptr], #32]\n"
+    "mov v2.16b, v15.16b\n"
+    "ldr q12, [%[wbptr], #48]\n"
+    "mov v0.16b, v15.16b\n"
+    "ldr q11, [%[wbptr], #64]\n"
+    "ldr q10, [%[wbptr], #80]\n"
+    "ldr q9, [%[wbptr], #96]\n"
+    "ldr q8, [%[wbptr], #112]\n"
+    "ldr q7, [%[wbptr], #128]\n"
+    "ldr q6, [%[wbptr], #144]\n"
+    "ldr q24, [%[inptr0]]\n"
+    "fmla v3.4s, v24.4s, v14.4s\n"
+    "ldr q22, [x26]\n"
+    "fmla v1.4s, v22.4s, v14.4s\n"
+    "ldr q19, [%[inptr0], %[input_col_stride1]]\n"
+    "fmla v2.4s, v19.4s, v14.4s\n"
+    "ldr q18, [x27]\n"
+    "fmla v3.4s, v22.4s, v11.4s\n"
+    "ldr q21, [x26, %[input_col_stride1]]\n"
+    "fmla v1.4s, v18.4s, v11.4s\n"
+    "ldr q17, [%[inptr0], x21]\n"
+    "ldr q20, [x28]\n"
+    "ldr q5, [x27, %[input_col_stride1]]\n"
+    "fmla v3.4s, v19.4s, v13.4s\n"
+    "fmla v3.4s, v18.4s, v8.4s\n"
+    "beq 3f\n"
+    "2:\n"
+    "fmla v3.4s, v21.4s, v10.4s\n"
+    "ldr q19, [x26, x21]\n"
+    "fmla v1.4s, v21.4s, v13.4s\n"
+    "ldr q23, [%[inptr0], x22]\n"
+    "fmla v2.4s, v21.4s, v11.4s\n"
+    "ldr q22, [x28, %[input_col_stride1]]\n"
+    "fmla v0.4s, v21.4s, v14.4s\n"
+    "ldr q21, [x27, x21]\n"
+    "fmla v3.4s, v17.4s, v12.4s\n"
+    "ldr q18, [x26, x22]\n"
+    "fmla v2.4s, v17.4s, v13.4s\n"
+    "ldr q16, [x28, x21]\n"
+    "fmla v1.4s, v20.4s, v8.4s\n"
+    "ldr q20, [x27, x22]\n"
+    "fmla v3.4s, v5.4s, v7.4s\n"
+    "ldr q4, [x28, x22]\n"
+    "fmla v2.4s, v5.4s, v8.4s\n"
+    "add %[wbptr], %[wbptr], #160\n"
+    "fmla v1.4s, v5.4s, v10.4s\n"
+    "ldr q15, [%[wbptr]]\n"
+    "fmla v0.4s, v5.4s, v11.4s\n"
+    "ldr q14, [%[wbptr], #16]\n"
+    "fmla v3.4s, v19.4s, v9.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v1.4s, v19.4s, v12.4s\n"
+    "add %[inptr0], %[inptr0], #16\n"
+    "fmla v2.4s, v19.4s, v10.4s\n"
+    "ldr q11, [%[wbptr], #64]\n"
+    "fmla v0.4s, v19.4s, v13.4s\n"
+    "ldr q24, [%[inptr0]]\n"
+    "fmla v1.4s, v22.4s, v7.4s\n"
+    "ldr q19, [%[inptr0], %[input_col_stride1]]\n"
+    "fmla v2.4s, v23.4s, v12.4s\n"
+    "ldr q17, [%[inptr0], x21]\n"
+    "fmla v0.4s, v22.4s, v8.4s\n"
+    "ldr q13, [%[wbptr], #32]\n"
+    "fmla v3.4s, v21.4s, v6.4s\n"
+    "add x26, x26, #16\n"
+    "fmla v1.4s, v21.4s, v9.4s\n"
+    "ldr q22, [x26]\n"
+    "fmla v2.4s, v21.4s, v7.4s\n"
+    "ldr q8, [%[wbptr], #112]\n"
+    "str q3, [%[outptr0]]\n"
+    "fmla v0.4s, v21.4s, v10.4s\n"
+    "fmla v1.4s, v16.4s, v6.4s\n"
+    "ldr q21, [x26, %[input_col_stride1]]\n"
+    "fmla v2.4s, v18.4s, v9.4s\n"
+    "add x27, x27, #16\n"
+    "fmla v0.4s, v18.4s, v12.4s\n"
+    "ldr q10, [%[wbptr], #80]\n"
+    "str q1, [x23]\n"
+    "mov v3.16b, v15.16b\n"
+    "fmla v2.4s, v20.4s, v6.4s\n"
+    "ldr q18, [x27]\n"
+    "fmla v0.4s, v16.4s, v7.4s\n"
+    "ldr q12, [%[wbptr], #48]\n"
+    "mov v1.16b, v15.16b\n"
+    "ldr q5, [x27, %[input_col_stride1]]\n"
+    "str q2, [%[outptr0], %[output_col_stride1]]\n"
+    "fmla v3.4s, v24.4s, v14.4s\n"
+    "fmla v0.4s, v20.4s, v9.4s\n"
+    "ldr q7, [%[wbptr], #128]\n"
+    "mov v2.16b, v15.16b\n"
+    "add x28, x28, #16\n"
+    "fmla v3.4s, v22.4s, v11.4s\n"
+    "ldr q20, [x28]\n"
+    "fmla v0.4s, v4.4s, v6.4s\n"
+    "ldr q9, [%[wbptr], #96]\n"
+    "fmla v1.4s, v22.4s, v14.4s\n"
+    "add %[outptr0], %[outptr0], #16\n"
+    "fmla v3.4s, v19.4s, v13.4s\n"
+    "subs x25, x25, #1\n"
+    "str q0, [x23, %[output_col_stride1]]\n"
+    "fmla v2.4s, v19.4s, v14.4s\n"
+    "ldr q6, [%[wbptr], #144]\n"
+    "add x23, x23, #16\n"
+    "fmla v3.4s, v18.4s, v8.4s\n"
+    "fmla v1.4s, v18.4s, v11.4s\n"
+    "mov v0.16b, v15.16b\n"
+    "bne 2b\n"
+    "3:\n"
+    "fmla v3.4s, v21.4s, v10.4s\n"
+    "ldr q19, [x26, x21]\n"
+    "fmla v1.4s, v21.4s, v13.4s\n"
+    "ldr q23, [%[inptr0], x22]\n"
+    "fmla v2.4s, v21.4s, v11.4s\n"
+    "ldr q22, [x28, %[input_col_stride1]]\n"
+    "fmla v0.4s, v21.4s, v14.4s\n"
+    "ldr q21, [x27, x21]\n"
+    "fmla v3.4s, v17.4s, v12.4s\n"
+    "ldr q18, [x26, x22]\n"
+    "fmla v2.4s, v17.4s, v13.4s\n"
+    "ldr q16, [x28, x21]\n"
+    "fmla v1.4s, v20.4s, v8.4s\n"
+    "ldr q20, [x27, x22]\n"
+    "fmla v3.4s, v5.4s, v7.4s\n"
+    "ldr q4, [x28, x22]\n"
+    "fmla v2.4s, v5.4s, v8.4s\n"
+    "add %[wbptr], %[wbptr], #160\n"
+    "fmla v1.4s, v5.4s, v10.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v0.4s, v5.4s, v11.4s\n"
+    "add %[inptr0], %[inptr0], #16\n"
+    "fmla v3.4s, v19.4s, v9.4s\n"
+    "add x26, x26, #16\n"
+    "fmla v1.4s, v19.4s, v12.4s\n"
+    "add x27, x27, #16\n"
+    "fmla v2.4s, v19.4s, v10.4s\n"
+    "add x28, x28, #16\n"
+    "fmla v0.4s, v19.4s, v13.4s\n"
+    "fmla v3.4s, v21.4s, v6.4s\n"
+    "fmla v1.4s, v22.4s, v7.4s\n"
+    "fmla v2.4s, v23.4s, v12.4s\n"
+    "str q3, [%[outptr0]]\n"
+    "fmla v0.4s, v22.4s, v8.4s\n"
+    "fmla v1.4s, v21.4s, v9.4s\n"
+    "fmla v2.4s, v21.4s, v7.4s\n"
+    "fmla v0.4s, v21.4s, v10.4s\n"
+    "fmla v1.4s, v16.4s, v6.4s\n"
+    "fmla v2.4s, v18.4s, v9.4s\n"
+    "fmla v0.4s, v18.4s, v12.4s\n"
+    "str q1, [x23]\n"
+    "fmla v2.4s, v20.4s, v6.4s\n"
+    "fmla v0.4s, v16.4s, v7.4s\n"
+    "str q2, [%[outptr0], %[output_col_stride1]]\n"
+    "fmla v0.4s, v20.4s, v9.4s\n"
+    "add %[outptr0], %[outptr0], #16\n"
+    "fmla v0.4s, v4.4s, v6.4s\n"
+    "str q0, [x23, %[output_col_stride1]]\n"
+    "add x23, x23, #16\n"
+    "4:\n"
+    "cbz x24, 7f\n"
+    "ldr s15, [%[wbptr]]\n"
+    "mov v3.16b, v15.16b\n"
+    "ldr s14, [%[wbptr], #4]\n"
+    "mov v1.16b, v15.16b\n"
+    "ldr s13, [%[wbptr], #8]\n"
+    "mov v2.16b, v15.16b\n"
+    "ldr s12, [%[wbptr], #12]\n"
+    "mov v0.16b, v15.16b\n"
+    "ldr s11, [%[wbptr], #16]\n"
+    "ldr s10, [%[wbptr], #20]\n"
+    "subs x24, x24, #1\n"
+    "ldr s9, [%[wbptr], #24]\n"
+    "ldr s8, [%[wbptr], #28]\n"
+    "ldr s7, [%[wbptr], #32]\n"
+    "ldr s6, [%[wbptr], #36]\n"
+    "ldr s24, [%[inptr0]]\n"
+    "ldr s22, [x26]\n"
+    "fmla v3.4s, v24.4s, v14.4s\n"
+    "ldr s19, [%[inptr0], %[input_col_stride1]]\n"
+    "fmla v1.4s, v22.4s, v14.4s\n"
+    "ldr s18, [x27]\n"
+    "fmla v2.4s, v19.4s, v14.4s\n"
+    "ldr s21, [x26, %[input_col_stride1]]\n"
+    "fmla v3.4s, v22.4s, v11.4s\n"
+    "ldr s17, [%[inptr0], x21]\n"
+    "fmla v1.4s, v18.4s, v11.4s\n"
+    "ldr s20, [x28]\n"
+    "ldr s5, [x27, %[input_col_stride1]]\n"
+    "fmla v3.4s, v19.4s, v13.4s\n"
+    "fmla v3.4s, v18.4s, v8.4s\n"
+    "beq 6f\n"
+    "5:\n"
+    "fmla v3.4s, v21.4s, v10.4s\n"
+    "ldr s19, [x26, x21]\n"
+    "fmla v1.4s, v21.4s, v13.4s\n"
+    "ldr s23, [%[inptr0], x22]\n"
+    "fmla v2.4s, v21.4s, v11.4s\n"
+    "ldr s22, [x28, %[input_col_stride1]]\n"
+    "fmla v0.4s, v21.4s, v14.4s\n"
+    "ldr s21, [x27, x21]\n"
+    "fmla v3.4s, v17.4s, v12.4s\n"
+    "ldr s18, [x26, x22]\n"
+    "fmla v2.4s, v17.4s, v13.4s\n"
+    "ldr s16, [x28, x21]\n"
+    "fmla v1.4s, v20.4s, v8.4s\n"
+    "ldr s20, [x27, x22]\n"
+    "fmla v3.4s, v5.4s, v7.4s\n"
+    "ldr s4, [x28, x22]\n"
+    "fmla v2.4s, v5.4s, v8.4s\n"
+    "add %[wbptr], %[wbptr], #40\n"
+    "fmla v1.4s, v5.4s, v10.4s\n"
+    "ldr s15, [%[wbptr]]\n"
+    "fmla v0.4s, v5.4s, v11.4s\n"
+    "ldr s14, [%[wbptr], #4]\n"
+    "fmla v3.4s, v19.4s, v9.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v1.4s, v19.4s, v12.4s\n"
+    "add %[inptr0], %[inptr0], #4\n"
+    "fmla v2.4s, v19.4s, v10.4s\n"
+    "ldr s11, [%[wbptr], #16]\n"
+    "fmla v0.4s, v19.4s, v13.4s\n"
+    "ldr s24, [%[inptr0]]\n"
+    "fmla v1.4s, v22.4s, v7.4s\n"
+    "ldr s19, [%[inptr0], %[input_col_stride1]]\n"
+    "fmla v2.4s, v23.4s, v12.4s\n"
+    "ldr s17, [%[inptr0], x21]\n"
+    "fmla v0.4s, v22.4s, v8.4s\n"
+    "ldr s13, [%[wbptr], #8]\n"
+    "fmla v3.4s, v21.4s, v6.4s\n"
+    "add x26, x26, #4\n"
+    "fmla v1.4s, v21.4s, v9.4s\n"
+    "ldr s22, [x26]\n"
+    "fmla v2.4s, v21.4s, v7.4s\n"
+    "ldr s8, [%[wbptr], #28]\n"
+    "str s3, [%[outptr0]]\n"
+    "fmla v0.4s, v21.4s, v10.4s\n"
+    "fmla v1.4s, v16.4s, v6.4s\n"
+    "ldr s21, [x26, %[input_col_stride1]]\n"
+    "fmla v2.4s, v18.4s, v9.4s\n"
+    "add x27, x27, #4\n"
+    "fmla v0.4s, v18.4s, v12.4s\n"
+    "ldr s10, [%[wbptr], #20]\n"
+    "str s1, [x23]\n"
+    "mov v3.16b, v15.16b\n"
+    "fmla v2.4s, v20.4s, v6.4s\n"
+    "ldr s18, [x27]\n"
+    "fmla v0.4s, v16.4s, v7.4s\n"
+    "ldr s12, [%[wbptr], #12]\n"
+    "mov v1.16b, v15.16b\n"
+    "ldr s5, [x27, %[input_col_stride1]]\n"
+    "str s2, [%[outptr0], %[output_col_stride1]]\n"
+    "fmla v3.4s, v24.4s, v14.4s\n"
+    "fmla v0.4s, v20.4s, v9.4s\n"
+    "ldr s7, [%[wbptr], #32]\n"
+    "mov v2.16b, v15.16b\n"
+    "add x28, x28, #4\n"
+    "fmla v3.4s, v22.4s, v11.4s\n"
+    "ldr s20, [x28]\n"
+    "fmla v0.4s, v4.4s, v6.4s\n"
+    "ldr s9, [%[wbptr], #24]\n"
+    "fmla v1.4s, v22.4s, v14.4s\n"
+    "add %[outptr0], %[outptr0], #4\n"
+    "fmla v3.4s, v19.4s, v13.4s\n"
+    "subs x24, x24, #1\n"
+    "str s0, [x23, %[output_col_stride1]]\n"
+    "fmla v2.4s, v19.4s, v14.4s\n"
+    "ldr s6, [%[wbptr], #36]\n"
+    "add x23, x23, #4\n"
+    "fmla v3.4s, v18.4s, v8.4s\n"
+    "fmla v1.4s, v18.4s, v11.4s\n"
+    "mov v0.16b, v15.16b\n"
+    "bne 5b\n"
+    "6:\n"
+    "fmla v3.4s, v21.4s, v10.4s\n"
+    "ldr s19, [x26, x21]\n"
+    "fmla v1.4s, v21.4s, v13.4s\n"
+    "ldr s23, [%[inptr0], x22]\n"
+    "fmla v2.4s, v21.4s, v11.4s\n"
+    "ldr s22, [x28, %[input_col_stride1]]\n"
+    "fmla v0.4s, v21.4s, v14.4s\n"
+    "ldr s21, [x27, x21]\n"
+    "fmla v3.4s, v17.4s, v12.4s\n"
+    "ldr s18, [x26, x22]\n"
+    "fmla v2.4s, v17.4s, v13.4s\n"
+    "ldr s16, [x28, x21]\n"
+    "fmla v1.4s, v20.4s, v8.4s\n"
+    "ldr s20, [x27, x22]\n"
+    "fmla v3.4s, v5.4s, v7.4s\n"
+    "ldr s4, [x28, x22]\n"
+    "fmla v2.4s, v5.4s, v8.4s\n"
+    "add %[wbptr], %[wbptr], #40\n"
+    "fmla v1.4s, v5.4s, v10.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v0.4s, v5.4s, v11.4s\n"
+    "add %[inptr0], %[inptr0], #4\n"
+    "fmla v3.4s, v19.4s, v9.4s\n"
+    "add x26, x26, #4\n"
+    "fmla v1.4s, v19.4s, v12.4s\n"
+    "add x27, x27, #4\n"
+    "fmla v2.4s, v19.4s, v10.4s\n"
+    "add x28, x28, #4\n"
+    "fmla v0.4s, v19.4s, v13.4s\n"
+    "fmla v3.4s, v21.4s, v6.4s\n"
+    "fmla v1.4s, v22.4s, v7.4s\n"
+    "fmla v2.4s, v23.4s, v12.4s\n"
+    "str s3, [%[outptr0]]\n"
+    "fmla v0.4s, v22.4s, v8.4s\n"
+    "fmla v1.4s, v21.4s, v9.4s\n"
+    "fmla v2.4s, v21.4s, v7.4s\n"
+    "fmla v0.4s, v21.4s, v10.4s\n"
+    "fmla v1.4s, v16.4s, v6.4s\n"
+    "fmla v2.4s, v18.4s, v9.4s\n"
+    "fmla v0.4s, v18.4s, v12.4s\n"
+    "str s1, [x23]\n"
+    "fmla v2.4s, v20.4s, v6.4s\n"
+    "fmla v0.4s, v16.4s, v7.4s\n"
+    "str s2, [%[outptr0], %[output_col_stride1]]\n"
+    "fmla v0.4s, v20.4s, v9.4s\n"
+    "add %[outptr0], %[outptr0], #4\n"
+    "fmla v0.4s, v4.4s, v6.4s\n"
+    "str s0, [x23, %[output_col_stride1]]\n"
+    "add x23, x23, #4\n"
+    "7:\n"
+    : [inptr0] "+r" (input), [outptr0] "+r" (output), [wbptr] "+r" (weight_bias_ptr)
+    : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float))
+    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+  );
+}
 
-  int channels_remaining = n_channels;
-  if (channels_remaining >= 4)
-  {
-    // Process blocks of 4 channels at a time
-    int n_iters = ((channels_remaining / 4) + 1)/2 - 1;
-    const bool odd_tail = (channels_remaining / 4) & 1;
-    channels_remaining %= 4;
+template <>
+template <>
+void Conv::execute_tile<ActivationFunction::ReLU>(
+  int n_channels,
+  const void *weight_bias_ptr,
+  const float *input,
+  const unsigned int input_row_stride,
+  const unsigned int input_col_stride,
+  float *output,
+  const unsigned int output_row_stride,
+  const unsigned int output_col_stride
+)
+{
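+  // Same structure as the ActivationFunction::None tile above; the only
+  // difference is the trailing "movi v26.16b, #0" / "fmax ..., v26.4s" pair,
+  // which clamps each accumulator at zero to apply the ReLU before storing.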
+  __asm __volatile(
+    "add x21, %[inptr0], %[input_row_stride]\n"
+    "add x24, %[input_col_stride1], %[input_col_stride1]\n"
+    "add x22, %[outptr0], %[output_row_stride]\n"
+    "add x23, x21, %[input_row_stride]\n"
+    "add x27, x24, %[input_col_stride1]\n"
+    "and x25, %[n_channels], #3\n"
+    "add x28, x23, %[input_row_stride]\n"
+    "lsr x26, %[n_channels], #2\n"
+    "cbz x26, 4f\n"
+    "1:\n"
+    "ldr q11, [%[wbptr]]\n"
+    "subs x26, x26, #1\n"
+    "mov v17.16b, v11.16b\n"
+    "ldr q13, [%[wbptr], #16]\n"
+    "mov v15.16b, v11.16b\n"
+    "ldr q4, [%[wbptr], #32]\n"
+    "mov v16.16b, v11.16b\n"
+    "ldr q2, [%[wbptr], #48]\n"
+    "mov v14.16b, v11.16b\n"
+    "ldr q5, [%[wbptr], #64]\n"
+    "ldr q10, [%[wbptr], #80]\n"
+    "ldr q1, [%[wbptr], #96]\n"
+    "ldr q12, [%[wbptr], #112]\n"
+    "ldr q0, [%[wbptr], #128]\n"
+    "ldr q3, [%[wbptr], #144]\n"
+    "ldr q6, [%[inptr0]]\n"
+    "fmla v17.4s, v6.4s, v13.4s\n"
+    "ldr q27, [x21]\n"
+    "fmla v15.4s, v27.4s, v13.4s\n"
+    "ldr q23, [%[inptr0], %[input_col_stride1]]\n"
+    "fmla v16.4s, v23.4s, v13.4s\n"
+    "ldr q24, [x23]\n"
+    "fmla v17.4s, v27.4s, v5.4s\n"
+    "ldr q22, [x21, %[input_col_stride1]]\n"
+    "ldr q9, [%[inptr0], x24]\n"
+    "ldr q8, [x28]\n"
+    "ldr q20, [x23, %[input_col_stride1]]\n"
+    "fmla v17.4s, v23.4s, v4.4s\n"
+    "beq 3f\n"
+    "2:\n"
+    "fmla v17.4s, v24.4s, v12.4s\n"
+    "ldr q26, [x21, x24]\n"
+    "fmla v15.4s, v24.4s, v5.4s\n"
+    "ldr q27, [%[inptr0], x27]\n"
+    "fmla v16.4s, v22.4s, v5.4s\n"
+    "ldr q25, [x28, %[input_col_stride1]]\n"
+    "fmla v17.4s, v22.4s, v10.4s\n"
+    "ldr q24, [x23, x24]\n"
+    "fmla v15.4s, v22.4s, v4.4s\n"
+    "ldr q21, [x21, x27]\n"
+    "fmla v14.4s, v22.4s, v13.4s\n"
+    "ldr q7, [x28, x24]\n"
+    "fmla v17.4s, v9.4s, v2.4s\n"
+    "ldr q19, [x23, x27]\n"
+    "fmla v16.4s, v9.4s, v4.4s\n"
+    "ldr q18, [x28, x27]\n"
+    "fmla v15.4s, v8.4s, v12.4s\n"
+    "add %[wbptr], %[wbptr], #160\n"
+    "fmla v17.4s, v20.4s, v0.4s\n"
+    "ldr q11, [%[wbptr]]\n"
+    "fmla v16.4s, v20.4s, v12.4s\n"
+    "ldr q13, [%[wbptr], #16]\n"
+    "fmla v15.4s, v20.4s, v10.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v14.4s, v20.4s, v5.4s\n"
+    "add %[inptr0], %[inptr0], #16\n"
+    "fmla v17.4s, v26.4s, v1.4s\n"
+    "ldr q6, [%[inptr0]]\n"
+    "fmla v15.4s, v26.4s, v2.4s\n"
+    "ldr q23, [%[inptr0], %[input_col_stride1]]\n"
+    "fmla v16.4s, v26.4s, v10.4s\n"
+    "ldr q5, [%[wbptr], #64]\n"
+    "fmla v14.4s, v26.4s, v4.4s\n"
+    "ldr q9, [%[inptr0], x24]\n"
+    "fmla v15.4s, v25.4s, v0.4s\n"
+    "add x21, x21, #16\n"
+    "fmla v16.4s, v27.4s, v2.4s\n"
+    "ldr q27, [x21]\n"
+    "fmla v14.4s, v25.4s, v12.4s\n"
+    "ldr q4, [%[wbptr], #32]\n"
+    "fmla v17.4s, v24.4s, v3.4s\n"
+    "ldr q22, [x21, %[input_col_stride1]]\n"
+    "fmla v15.4s, v24.4s, v1.4s\n"
+    "add x23, x23, #16\n"
+    "fmla v16.4s, v24.4s, v0.4s\n"
+    "ldr q12, [%[wbptr], #112]\n"
+    "fmla v14.4s, v24.4s, v10.4s\n"
+    "ldr q24, [x23]\n"
+    "fmla v15.4s, v7.4s, v3.4s\n"
+    "ldr q20, [x23, %[input_col_stride1]]\n"
+    "fmla v16.4s, v21.4s, v1.4s\n"
+    "add x28, x28, #16\n"
+    "fmla v14.4s, v21.4s, v2.4s\n"
+    "ldr q10, [%[wbptr], #80]\n"
+    "movi v26.16b, #0\n"
+    "ldr q8, [x28]\n"
+    "fmla v16.4s, v19.4s, v3.4s\n"
+    "subs x26, x26, #1\n"
+    "fmla v14.4s, v7.4s, v0.4s\n"
+    "ldr q2, [%[wbptr], #48]\n"
+    "fmax v17.4s, v17.4s, v26.4s\n"
+    "fmax v15.4s, v15.4s, v26.4s\n"
+    "fmax v16.4s, v16.4s, v26.4s\n"
+    "str q17, [%[outptr0]]\n"
+    "str q16, [%[outptr0], %[output_col_stride1]]\n"
+    "fmla v14.4s, v19.4s, v1.4s\n"
+    "str q15, [x22]\n"
+    "mov v17.16b, v11.16b\n"
+    "mov v15.16b, v11.16b\n"
+    "ldr q0, [%[wbptr], #128]\n"
+    "fmla v14.4s, v18.4s, v3.4s\n"
+    "ldr q1, [%[wbptr], #96]\n"
+    "mov v16.16b, v11.16b\n"
+    "add %[outptr0], %[outptr0], #16\n"
+    "fmla v17.4s, v6.4s, v13.4s\n"
+    "fmla v15.4s, v27.4s, v13.4s\n"
+    "fmax v14.4s, v14.4s, v26.4s\n"
+    "ldr q3, [%[wbptr], #144]\n"
+    "fmla v16.4s, v23.4s, v13.4s\n"
+    "str q14, [x22, %[output_col_stride1]]\n"
+    "mov v14.16b, v11.16b\n"
+    "add x22, x22, #16\n"
+    "fmla v17.4s, v27.4s, v5.4s\n"
+    "fmla v17.4s, v23.4s, v4.4s\n"
+    "bne 2b\n"
+    "3:\n"
+    "fmla v17.4s, v24.4s, v12.4s\n"
+    "ldr q26, [x21, x24]\n"
+    "fmla v15.4s, v24.4s, v5.4s\n"
+    "ldr q27, [%[inptr0], x27]\n"
+    "fmla v16.4s, v22.4s, v5.4s\n"
+    "ldr q25, [x28, %[input_col_stride1]]\n"
+    "fmla v17.4s, v22.4s, v10.4s\n"
+    "ldr q24, [x23, x24]\n"
+    "fmla v15.4s, v22.4s, v4.4s\n"
+    "ldr q21, [x21, x27]\n"
+    "fmla v14.4s, v22.4s, v13.4s\n"
+    "ldr q7, [x28, x24]\n"
+    "fmla v17.4s, v9.4s, v2.4s\n"
+    "ldr q19, [x23, x27]\n"
+    "fmla v16.4s, v9.4s, v4.4s\n"
+    "ldr q18, [x28, x27]\n"
+    "fmla v15.4s, v8.4s, v12.4s\n"
+    "add %[wbptr], %[wbptr], #160\n"
+    "fmla v17.4s, v20.4s, v0.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v16.4s, v20.4s, v12.4s\n"
+    "add %[inptr0], %[inptr0], #16\n"
+    "fmla v15.4s, v20.4s, v10.4s\n"
+    "add x21, x21, #16\n"
+    "fmla v14.4s, v20.4s, v5.4s\n"
+    "add x23, x23, #16\n"
+    "fmla v17.4s, v26.4s, v1.4s\n"
+    "add x28, x28, #16\n"
+    "fmla v15.4s, v26.4s, v2.4s\n"
+    "fmla v16.4s, v26.4s, v10.4s\n"
+    "fmla v14.4s, v26.4s, v4.4s\n"
+    "movi v26.16b, #0\n"
+    "fmla v17.4s, v24.4s, v3.4s\n"
+    "fmla v16.4s, v27.4s, v2.4s\n"
+    "fmla v15.4s, v25.4s, v0.4s\n"
+    "fmla v14.4s, v25.4s, v12.4s\n"
+    "fmax v17.4s, v17.4s, v26.4s\n"
+    "fmla v16.4s, v24.4s, v0.4s\n"
+    "str q17, [%[outptr0]]\n"
+    "fmla v15.4s, v24.4s, v1.4s\n"
+    "fmla v14.4s, v24.4s, v10.4s\n"
+    "fmla v16.4s, v21.4s, v1.4s\n"
+    "fmla v15.4s, v7.4s, v3.4s\n"
+    "fmla v14.4s, v21.4s, v2.4s\n"
+    "fmla v16.4s, v19.4s, v3.4s\n"
+    "fmax v15.4s, v15.4s, v26.4s\n"
+    "fmla v14.4s, v7.4s, v0.4s\n"
+    "str q15, [x22]\n"
+    "fmax v16.4s, v16.4s, v26.4s\n"
+    "fmla v14.4s, v19.4s, v1.4s\n"
+    "str q16, [%[outptr0], %[output_col_stride1]]\n"
+    "add %[outptr0], %[outptr0], #16\n"
+    "fmla v14.4s, v18.4s, v3.4s\n"
+    "fmax v14.4s, v14.4s, v26.4s\n"
+    "str q14, [x22, %[output_col_stride1]]\n"
+    "add x22, x22, #16\n"
+    "4:\n"
+    "cbz x25, 7f\n"
+    "ldr s11, [%[wbptr]]\n"
+    "mov v17.16b, v11.16b\n"
+    "ldr s13, [%[wbptr], #4]\n"
+    "mov v15.16b, v11.16b\n"
+    "ldr s4, [%[wbptr], #8]\n"
+    "mov v16.16b, v11.16b\n"
+    "ldr s2, [%[wbptr], #12]\n"
+    "mov v14.16b, v11.16b\n"
+    "ldr s5, [%[wbptr], #16]\n"
+    "ldr s10, [%[wbptr], #20]\n"
+    "subs x25, x25, #1\n"
+    "ldr s1, [%[wbptr], #24]\n"
+    "ldr s12, [%[wbptr], #28]\n"
+    "ldr s0, [%[wbptr], #32]\n"
+    "ldr s3, [%[wbptr], #36]\n"
+    "ldr s6, [%[inptr0]]\n"
+    "ldr s27, [x21]\n"
+    "fmla v17.4s, v6.4s, v13.4s\n"
+    "ldr s23, [%[inptr0], %[input_col_stride1]]\n"
+    "fmla v15.4s, v27.4s, v13.4s\n"
+    "ldr s24, [x23]\n"
+    "fmla v16.4s, v23.4s, v13.4s\n"
+    "ldr s22, [x21, %[input_col_stride1]]\n"
+    "fmla v17.4s, v27.4s, v5.4s\n"
+    "ldr s9, [%[inptr0], x24]\n"
+    "ldr s8, [x28]\n"
+    "ldr s20, [x23, %[input_col_stride1]]\n"
+    "fmla v17.4s, v23.4s, v4.4s\n"
+    "beq 6f\n"
+    "5:\n"
+    "fmla v17.4s, v24.4s, v12.4s\n"
+    "ldr s26, [x21, x24]\n"
+    "fmla v15.4s, v24.4s, v5.4s\n"
+    "ldr s27, [%[inptr0], x27]\n"
+    "fmla v16.4s, v22.4s, v5.4s\n"
+    "ldr s25, [x28, %[input_col_stride1]]\n"
+    "fmla v17.4s, v22.4s, v10.4s\n"
+    "ldr s24, [x23, x24]\n"
+    "fmla v15.4s, v22.4s, v4.4s\n"
+    "ldr s21, [x21, x27]\n"
+    "fmla v14.4s, v22.4s, v13.4s\n"
+    "ldr s7, [x28, x24]\n"
+    "fmla v17.4s, v9.4s, v2.4s\n"
+    "ldr s19, [x23, x27]\n"
+    "fmla v16.4s, v9.4s, v4.4s\n"
+    "ldr s18, [x28, x27]\n"
+    "fmla v15.4s, v8.4s, v12.4s\n"
+    "add %[wbptr], %[wbptr], #40\n"
+    "fmla v17.4s, v20.4s, v0.4s\n"
+    "ldr s11, [%[wbptr]]\n"
+    "fmla v16.4s, v20.4s, v12.4s\n"
+    "ldr s13, [%[wbptr], #4]\n"
+    "fmla v15.4s, v20.4s, v10.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v14.4s, v20.4s, v5.4s\n"
+    "add %[inptr0], %[inptr0], #4\n"
+    "fmla v17.4s, v26.4s, v1.4s\n"
+    "ldr s6, [%[inptr0]]\n"
+    "fmla v15.4s, v26.4s, v2.4s\n"
+    "ldr s23, [%[inptr0], %[input_col_stride1]]\n"
+    "fmla v16.4s, v26.4s, v10.4s\n"
+    "ldr s5, [%[wbptr], #16]\n"
+    "fmla v14.4s, v26.4s, v4.4s\n"
+    "ldr s9, [%[inptr0], x24]\n"
+    "fmla v15.4s, v25.4s, v0.4s\n"
+    "add x21, x21, #4\n"
+    "fmla v16.4s, v27.4s, v2.4s\n"
+    "ldr s27, [x21]\n"
+    "fmla v14.4s, v25.4s, v12.4s\n"
+    "ldr s4, [%[wbptr], #8]\n"
+    "fmla v17.4s, v24.4s, v3.4s\n"
+    "ldr s22, [x21, %[input_col_stride1]]\n"
+    "fmla v15.4s, v24.4s, v1.4s\n"
+    "add x23, x23, #4\n"
+    "fmla v16.4s, v24.4s, v0.4s\n"
+    "ldr s12, [%[wbptr], #28]\n"
+    "fmla v14.4s, v24.4s, v10.4s\n"
+    "ldr s24, [x23]\n"
+    "fmla v15.4s, v7.4s, v3.4s\n"
+    "ldr s20, [x23, %[input_col_stride1]]\n"
+    "fmla v16.4s, v21.4s, v1.4s\n"
+    "add x28, x28, #4\n"
+    "fmla v14.4s, v21.4s, v2.4s\n"
+    "ldr s10, [%[wbptr], #20]\n"
+    "movi v26.16b, #0\n"
+    "ldr s8, [x28]\n"
+    "fmla v16.4s, v19.4s, v3.4s\n"
+    "subs x25, x25, #1\n"
+    "fmla v14.4s, v7.4s, v0.4s\n"
+    "ldr s2, [%[wbptr], #12]\n"
+    "fmax v17.4s, v17.4s, v26.4s\n"
+    "fmax v15.4s, v15.4s, v26.4s\n"
+    "fmax v16.4s, v16.4s, v26.4s\n"
+    "str s17, [%[outptr0]]\n"
+    "str s16, [%[outptr0], %[output_col_stride1]]\n"
+    "fmla v14.4s, v19.4s, v1.4s\n"
+    "str s15, [x22]\n"
+    "mov v17.16b, v11.16b\n"
+    "mov v15.16b, v11.16b\n"
+    "ldr s0, [%[wbptr], #32]\n"
+    "fmla v14.4s, v18.4s, v3.4s\n"
+    "ldr s1, [%[wbptr], #24]\n"
+    "mov v16.16b, v11.16b\n"
+    "add %[outptr0], %[outptr0], #4\n"
+    "fmla v17.4s, v6.4s, v13.4s\n"
+    "fmla v15.4s, v27.4s, v13.4s\n"
+    "fmax v14.4s, v14.4s, v26.4s\n"
+    "ldr s3, [%[wbptr], #36]\n"
+    "fmla v16.4s, v23.4s, v13.4s\n"
+    "str s14, [x22, %[output_col_stride1]]\n"
+    "mov v14.16b, v11.16b\n"
+    "add x22, x22, #4\n"
+    "fmla v17.4s, v27.4s, v5.4s\n"
+    "fmla v17.4s, v23.4s, v4.4s\n"
+    "bne 5b\n"
+    "6:\n"
+    "fmla v17.4s, v24.4s, v12.4s\n"
+    "ldr s26, [x21, x24]\n"
+    "fmla v15.4s, v24.4s, v5.4s\n"
+    "ldr s27, [%[inptr0], x27]\n"
+    "fmla v16.4s, v22.4s, v5.4s\n"
+    "ldr s25, [x28, %[input_col_stride1]]\n"
+    "fmla v17.4s, v22.4s, v10.4s\n"
+    "ldr s24, [x23, x24]\n"
+    "fmla v15.4s, v22.4s, v4.4s\n"
+    "ldr s21, [x21, x27]\n"
+    "fmla v14.4s, v22.4s, v13.4s\n"
+    "ldr s7, [x28, x24]\n"
+    "fmla v17.4s, v9.4s, v2.4s\n"
+    "ldr s19, [x23, x27]\n"
+    "fmla v16.4s, v9.4s, v4.4s\n"
+    "ldr s18, [x28, x27]\n"
+    "fmla v15.4s, v8.4s, v12.4s\n"
+    "add %[wbptr], %[wbptr], #40\n"
+    "fmla v17.4s, v20.4s, v0.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v16.4s, v20.4s, v12.4s\n"
+    "add %[inptr0], %[inptr0], #4\n"
+    "fmla v15.4s, v20.4s, v10.4s\n"
+    "add x21, x21, #4\n"
+    "fmla v14.4s, v20.4s, v5.4s\n"
+    "add x23, x23, #4\n"
+    "fmla v17.4s, v26.4s, v1.4s\n"
+    "add x28, x28, #4\n"
+    "fmla v15.4s, v26.4s, v2.4s\n"
+    "fmla v16.4s, v26.4s, v10.4s\n"
+    "fmla v14.4s, v26.4s, v4.4s\n"
+    "movi v26.16b, #0\n"
+    "fmla v17.4s, v24.4s, v3.4s\n"
+    "fmla v16.4s, v27.4s, v2.4s\n"
+    "fmla v15.4s, v25.4s, v0.4s\n"
+    "fmla v14.4s, v25.4s, v12.4s\n"
+    "fmax v17.4s, v17.4s, v26.4s\n"
+    "fmla v16.4s, v24.4s, v0.4s\n"
+    "str s17, [%[outptr0]]\n"
+    "fmla v15.4s, v24.4s, v1.4s\n"
+    "fmla v14.4s, v24.4s, v10.4s\n"
+    "fmla v16.4s, v21.4s, v1.4s\n"
+    "fmla v15.4s, v7.4s, v3.4s\n"
+    "fmla v14.4s, v21.4s, v2.4s\n"
+    "fmla v16.4s, v19.4s, v3.4s\n"
+    "fmax v15.4s, v15.4s, v26.4s\n"
+    "fmla v14.4s, v7.4s, v0.4s\n"
+    "str s15, [x22]\n"
+    "fmax v16.4s, v16.4s, v26.4s\n"
+    "fmla v14.4s, v19.4s, v1.4s\n"
+    "str s16, [%[outptr0], %[output_col_stride1]]\n"
+    "add %[outptr0], %[outptr0], #4\n"
+    "fmla v14.4s, v18.4s, v3.4s\n"
+    "fmax v14.4s, v14.4s, v26.4s\n"
+    "str s14, [x22, %[output_col_stride1]]\n"
+    "add x22, x22, #4\n"
+    "7:\n"
+    : [inptr0] "+r" (input), [outptr0] "+r" (output), [wbptr] "+r" (weight_bias_ptr)
+    : [n_channels] "r" ((long) n_channels), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float))
+    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+  );
+}
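// Editorial sketch (not part of the patch): the control structure every
// kernel in these hunks implements in assembly. The channel count is split
// into a four-wide vector loop (q-register loads, pointers advance 16 bytes)
// and a scalar tail (s-register loads, pointers advance 4 bytes), exactly as
// "lsr xN, %[n_channels], #2" / "and xM, %[n_channels], #3" compute above.
// The two helpers below are hypothetical stand-ins for the loop bodies.
static void process_four_channels() { /* q-register loads, fmla chain, stores */ }
static void process_one_channel()   { /* s-register loads, fmla chain, stores */ }

static void execute_tile_sketch(int n_channels)
{
    const int vector_iters = n_channels >> 2;  // lsr xN, %[n_channels], #2
    const int remainder    = n_channels & 3;   // and xM, %[n_channels], #3
    for (int i = 0; i < vector_iters; ++i)
    {
        process_four_channels();
    }
    for (int i = 0; i < remainder; ++i)
    {
        process_one_channel();
    }
}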
 
-    asm volatile (
-      "qW11B .req q0\n" "vW11B .req v0\n" "qW33A .req q1\n" "qU32B .req q1\n"
-      "vW33A .req v1\n" "vU32B .req v1\n" "qU44B .req q2\n" "qW21A .req q2\n"
-      "vU44B .req v2\n" "vW21A .req v2\n" "qU21B .req q3\n" "qU32A .req q3\n"
-      "vU21B .req v3\n" "vU32A .req v3\n" "qU43A .req q4\n" "qV21B .req q4\n"
-      "vU43A .req v4\n" "vV21B .req v4\n" "qU24A .req q5\n" "qU44A .req q5\n"
-      "qU33B .req q5\n" "vU24A .req v5\n" "vU44A .req v5\n" "vU33B .req v5\n"
-      "qU31A .req q6\n" "qV12B .req q6\n" "qU23A .req q6\n" "vU31A .req v6\n"
-      "vV12B .req v6\n" "vU23A .req v6\n" "qW31B .req q7\n" "qV22A .req q7\n"
-      "vW31B .req v7\n" "vV22A .req v7\n" "qV12A .req q8\n" "qW21B .req q8\n"
-      "vV12A .req v8\n" "vW21B .req v8\n" "qU22B .req q9\n" "qU34A .req q9\n"
-      "vU22B .req v9\n" "vU34A .req v9\n" "qU13B .req q10\n" "qU13A .req q10\n"
-      "vU13B .req v10\n" "vU13A .req v10\n" "qU34B .req q11\n" "qU22A .req q11\n"
-      "vU34B .req v11\n" "vU22A .req v11\n" "qU24B .req q12\n" "qU31B .req q12\n"
-      "vU24B .req v12\n" "vU31B .req v12\n" "qW12B .req q13\n" "qW13A .req q13\n"
-      "vW12B .req v13\n" "vW13A .req v13\n" "qV21A .req q14\n" "qV11B .req q14\n"
-      "vV21A .req v14\n" "vV11B .req v14\n" "qW32A .req q15\n" "qW32B .req q15\n"
-      "vW32A .req v15\n" "vW32B .req v15\n" "qW31A .req q16\n" "qV22B .req q16\n"
-      "vW31A .req v16\n" "vV22B .req v16\n"
-      "qW11A .req q17\n" "vW11A .req v17\n" "qW13B .req q18\n" "qU14A .req q18\n"
-      "vW13B .req v18\n" "vU14A .req v18\n" "qU33A .req q19\n" "qW33B .req q19\n"
-      "vU33A .req v19\n" "vW33B .req v19\n" "qW22A .req q20\n" "qU23B .req q20\n"
-      "vW22A .req v20\n" "vU23B .req v20\n" "qU12A .req q21\n" "qU42A .req q21\n"
-      "vU12A .req v21\n" "vU42A .req v21\n" "qU41A .req q22\n" "qU42B .req q22\n"
-      "vU41A .req v22\n" "vU42B .req v22\n" "qW23A .req q23\n" "qW23B .req q23\n"
-      "vW23A .req v23\n" "vW23B .req v23\n" "qU43B .req q24\n" "qU11A .req q24\n"
-      "vU43B .req v24\n" "vU11A .req v24\n" "qU12B .req q25\n" "qW12A .req q25\n"
-      "vU12B .req v25\n" "vW12A .req v25\n" "qU41B .req q26\n" "qV11A .req q26\n"
-      "vU41B .req v26\n" "vV11A .req v26\n" "qW22B .req q27\n" "vW22B .req v27\n"
-      "qU11B .req q28\n" "qU14B .req q28\n" "vU11B .req v28\n" "vU14B .req v28\n"
-      "qU21A .req q29\n" "vU21A .req v29\n"
-
-      "u_col_stride1 .req %x[u_col_stride]\n"
-      "u_col_stride2 .req x0\n"
-      "u_col_stride3 .req x1\n"
-      "uptr1 .req x2\n"
-      "uptr2 .req x3\n"
-      "uptr3 .req x4\n"
-      "wptr1 .req x5\n"
-      "wptr2 .req x6\n"
-      "vptr1 .req x7\n"
-      "w_col_stride1 .req %x[w_col_stride]\n"
-      "w_col_stride2 .req x8\n"
-
-      // Prepare strides and pointers
-      "add uptr1, %x[uptr0], %x[u_row_stride]\n"
-      "add uptr2,    uptr1 , %x[u_row_stride]\n"
-      "add uptr3,    uptr2 , %x[u_row_stride]\n"
-      "add wptr1, %x[wptr0], %x[w_row_stride]\n"
-      "add wptr2,    wptr1 , %x[w_row_stride]\n"
-      "add vptr1, %x[vptr0], %x[v_row_stride]\n"
-      "add u_col_stride2, %x[u_col_stride], %x[u_col_stride]\n"
-      "add u_col_stride3,    u_col_stride2 , %x[u_col_stride]\n"
-      "add w_col_stride2, %x[w_col_stride], %x[w_col_stride]\n"
-
-      // Load in preparation for execution
-      "ldr qU14A, [%x[uptr0], u_col_stride3]\n"
-      "ldr qW13A, [%x[wptr0], w_col_stride2]\n"
-      "ldr qU13A, [%x[uptr0], u_col_stride2]\n"
-      "ldr qW12A, [%x[wptr0], w_col_stride1]\n"
-      "ldr qU12A, [%x[uptr0], u_col_stride1]\n"
-      "ldr qW11A, [%x[wptr0]], #0x10\n"
-      "ldr qU24A, [uptr1, u_col_stride3]\n"
-      "ldr qW23A, [wptr1, w_col_stride2]\n"
-      "ldr qU23A, [uptr1, u_col_stride2]\n"
-      "ldr qW22A, [wptr1, w_col_stride1]\n"
-      "ldr qU22A, [uptr1, u_col_stride1]\n"
-      "ldr qW21A, [wptr1], #0x10\n"
-      "ldr qU34A, [uptr2, u_col_stride3]\n"
-      "ldr qW33A, [wptr2, w_col_stride2]\n"
-      "ldr qU33A, [uptr2, u_col_stride2]\n"
-      "ldr qW32A, [wptr2, w_col_stride1]\n"
-      "ldr qU32A, [uptr2, u_col_stride1]\n"
-      "ldr qW31A, [wptr2], #0x10\n"
-      "fmul vV12A.4s, vU14A.4s, vW13A.4s\n"
-      "cbz %x[iters], 2f\n"  // Jump to tail if doing zero iterations of loop
-
-      "1:"  // Main loop body
-        // A part
-        "fmul vV11A.4s, vU13A.4s, vW13A.4s\n"
-        "fmla vV12A.4s, vU13A.4s, vW12A.4s\n"
-        "fmla vV11A.4s, vU12A.4s, vW12A.4s\n"
-        "fmla vV12A.4s, vU12A.4s, vW11A.4s\n"
-        "fmla vV12A.4s, vU24A.4s, vW23A.4s\n"
-        "fmul vV22A.4s, vU24A.4s, vW13A.4s\n"
-        "fmla vV11A.4s, vU23A.4s, vW23A.4s\n"
-        "ldr qU44A, [uptr3, u_col_stride3]\n"
-        "fmla vV12A.4s, vU23A.4s, vW22A.4s\n"
-        "ldr qU43A, [uptr3, u_col_stride2]\n"
-        "fmul vV21A.4s, vU23A.4s, vW13A.4s\n"
-        "ldr qU42A, [uptr3, u_col_stride1]\n"
-        "fmla vV22A.4s, vU23A.4s, vW12A.4s\n"
-        "ldr qU11A, [%x[uptr0]], #0x10\n"
-        "fmla vV11A.4s, vU22A.4s, vW22A.4s\n"
-        "ldr qU21A, [uptr1], #0x10\n"
-        "fmla vV12A.4s, vU22A.4s, vW21A.4s\n"
-        "ldr qU31A, [uptr2], #0x10\n"
-        "fmla vV21A.4s, vU22A.4s, vW12A.4s\n"
-        "ldr qU41A, [uptr3], #0x10\n"
-        "fmla vV22A.4s, vU22A.4s, vW11A.4s\n"
-        "ldr qU14B, [%x[uptr0], u_col_stride3]\n"
-        "fmla vV12A.4s, vU34A.4s, vW33A.4s\n"
-        "ldr qW13B, [%x[wptr0], w_col_stride2]\n"
-        "fmla vV22A.4s, vU34A.4s, vW23A.4s\n"
-        "ldr qU13B, [%x[uptr0], u_col_stride2]\n"
-        "fmla vV11A.4s, vU33A.4s, vW33A.4s\n"
-        "ldr qW12B, [%x[wptr0], w_col_stride1]\n"
-        "fmla vV12A.4s, vU33A.4s, vW32A.4s\n"
-        "ldr qU12B, [%x[uptr0], u_col_stride1]\n"
-        "fmla vV21A.4s, vU33A.4s, vW23A.4s\n"
-        "ldr qW11B, [%x[wptr0]], #0x10\n"
-        "fmla vV22A.4s, vU33A.4s, vW22A.4s\n"
-        "ldr qU24B, [uptr1, u_col_stride3]\n"
-        "fmla vV11A.4s, vU32A.4s, vW32A.4s\n"
-        "ldr qW23B, [wptr1, w_col_stride2]\n"
-        "fmla vV12A.4s, vU32A.4s, vW31A.4s\n"
-        "str qV12A, [%x[vptr0], %x[v_col_stride]]\n"
-        "fmla vV21A.4s, vU32A.4s, vW22A.4s\n"
-        "ldr qU23B, [uptr1, u_col_stride2]\n"
-        "fmla vV22A.4s, vU32A.4s, vW21A.4s\n"
-        "ldr qW22B, [wptr1, w_col_stride1]\n"
-        "fmla vV22A.4s, vU44A.4s, vW33A.4s\n"
-        "ldr qU22B, [uptr1, u_col_stride1]\n"
-        "fmla vV21A.4s, vU43A.4s, vW33A.4s\n"
-        "ldr qW21B, [wptr1], #0x10\n"
-        "fmla vV22A.4s, vU43A.4s, vW32A.4s\n"
-        "ldr qU34B, [uptr2, u_col_stride3]\n"
-        "fmla vV21A.4s, vU42A.4s, vW32A.4s\n"
-        "ldr qW33B, [wptr2, w_col_stride2]\n"
-        "fmla vV22A.4s, vU42A.4s, vW31A.4s\n"
-        "str qV22A, [vptr1, %x[v_col_stride]]\n"
-        "fmla vV11A.4s, vU11A.4s, vW11A.4s\n"
-        "ldr qU33B, [uptr2, u_col_stride2]\n"
-        "fmla vV11A.4s, vU21A.4s, vW21A.4s\n"
-        "ldr qW32B, [wptr2, w_col_stride1]\n"
-        "fmla vV21A.4s, vU21A.4s, vW11A.4s\n"
-        "ldr qU32B, [uptr2, u_col_stride1]\n"
-        "fmla vV11A.4s, vU31A.4s, vW31A.4s\n"
-        "str qV11A, [%x[vptr0]], #0x10\n"
-        "fmla vV21A.4s, vU31A.4s, vW21A.4s\n"
-        "ldr qW31B, [wptr2], #0x10\n"
-        "fmla vV21A.4s, vU41A.4s, vW31A.4s\n"
-        "str qV21A, [vptr1], #0x10\n"
-
-        // B part
-        "fmul vV12B.4s, vU14B.4s, vW13B.4s\n"
-        "fmul vV11B.4s, vU13B.4s, vW13B.4s\n"
-        "fmla vV12B.4s, vU13B.4s, vW12B.4s\n"
-        "fmla vV11B.4s, vU12B.4s, vW12B.4s\n"
-        "fmla vV12B.4s, vU12B.4s, vW11B.4s\n"
-        "fmla vV12B.4s, vU24B.4s, vW23B.4s\n"
-        "fmul vV22B.4s, vU24B.4s, vW13B.4s\n"
-        "subs %x[iters], %x[iters], #1\n"
-        "fmla vV11B.4s, vU23B.4s, vW23B.4s\n"
-        "ldr qU44B, [uptr3, u_col_stride3]\n"
-        "fmla vV12B.4s, vU23B.4s, vW22B.4s\n"
-        "ldr qU43B, [uptr3, u_col_stride2]\n"
-        "fmul vV21B.4s, vU23B.4s, vW13B.4s\n"
-        "ldr qU42B, [uptr3, u_col_stride1]\n"
-        "fmla vV22B.4s, vU23B.4s, vW12B.4s\n"
-        "ldr qU11B, [%x[uptr0]], #0x10\n"
-        "fmla vV11B.4s, vU22B.4s, vW22B.4s\n"
-        "ldr qU21B, [uptr1], #0x10\n"
-        "fmla vV12B.4s, vU22B.4s, vW21B.4s\n"
-        "ldr qU31B, [uptr2], #0x10\n"
-        "fmla vV21B.4s, vU22B.4s, vW12B.4s\n"
-        "ldr qU41B, [uptr3], #0x10\n"
-        "fmla vV22B.4s, vU22B.4s, vW11B.4s\n"
-        "ldr qU14A, [%x[uptr0], u_col_stride3]\n"
-        "fmla vV12B.4s, vU34B.4s, vW33B.4s\n"
-        "ldr qW13A, [%x[wptr0], w_col_stride2]\n"
-        "fmla vV22B.4s, vU34B.4s, vW23B.4s\n"
-        "ldr qU13A, [%x[uptr0], u_col_stride2]\n"
-        "fmla vV11B.4s, vU33B.4s, vW33B.4s\n"
-        "ldr qW12A, [%x[wptr0], w_col_stride1]\n"
-        "fmla vV12B.4s, vU33B.4s, vW32B.4s\n"
-        "ldr qU12A, [%x[uptr0], u_col_stride1]\n"
-        "fmla vV21B.4s, vU33B.4s, vW23B.4s\n"
-        "ldr qW11A, [%x[wptr0]], #0x10\n"
-        "fmla vV22B.4s, vU33B.4s, vW22B.4s\n"
-        "ldr qU24A, [uptr1, u_col_stride3]\n"
-        "fmla vV11B.4s, vU32B.4s, vW32B.4s\n"
-        "ldr qW23A, [wptr1, w_col_stride2]\n"
-        "fmla vV12B.4s, vU32B.4s, vW31B.4s\n"
-        "str qV12B, [%x[vptr0], %x[v_col_stride]]\n"
-        "fmla vV21B.4s, vU32B.4s, vW22B.4s\n"
-        "ldr qU23A, [uptr1, u_col_stride2]\n"
-        "fmla vV22B.4s, vU32B.4s, vW21B.4s\n"
-        "ldr qW22A, [wptr1, w_col_stride1]\n"
-        "fmla vV22B.4s, vU44B.4s, vW33B.4s\n"
-        "ldr qU22A, [uptr1, u_col_stride1]\n"
-        "fmla vV21B.4s, vU43B.4s, vW33B.4s\n"
-        "ldr qW21A, [wptr1], #0x10\n"
-        "fmla vV22B.4s, vU43B.4s, vW32B.4s\n"
-        "ldr qU34A, [uptr2, u_col_stride3]\n"
-        "fmla vV21B.4s, vU42B.4s, vW32B.4s\n"
-        "ldr qW33A, [wptr2, w_col_stride2]\n"
-        "fmla vV22B.4s, vU42B.4s, vW31B.4s\n"
-        "str qV22B, [vptr1, %x[v_col_stride]]\n"
-        "fmla vV11B.4s, vU11B.4s, vW11B.4s\n"
-        "ldr qU33A, [uptr2, u_col_stride2]\n"
-        "fmla vV11B.4s, vU21B.4s, vW21B.4s\n"
-        "ldr qW32A, [wptr2, w_col_stride1]\n"
-        "fmla vV21B.4s, vU21B.4s, vW11B.4s\n"
-        "ldr qU32A, [uptr2, u_col_stride1]\n"
-        "fmla vV11B.4s, vU31B.4s, vW31B.4s\n"
-        "str qV11B, [%x[vptr0]], #0x10\n"
-        "fmla vV21B.4s, vU31B.4s, vW21B.4s\n"
-        "ldr qW31A, [wptr2], #0x10\n"
-        "fmla vV21B.4s, vU41B.4s, vW31B.4s\n"
-        "str qV21B, [vptr1], #0x10\n"
-        "fmul vV12A.4s, vU14A.4s, vW13A.4s\n"
-        "bne 1b\n"  // Loop
-
-      "2:"  // Branch destination for zero loops
-        "cbnz %w[odd_tail], 4f\n"
-
-      "3:"  // Even number of iterations
-        "fmul vV11A.4s, vU13A.4s, vW13A.4s\n"
-        "fmla vV12A.4s, vU13A.4s, vW12A.4s\n"
-        "fmla vV11A.4s, vU12A.4s, vW12A.4s\n"
-        "fmla vV12A.4s, vU12A.4s, vW11A.4s\n"
-        "fmla vV12A.4s, vU24A.4s, vW23A.4s\n"
-        "fmul vV22A.4s, vU24A.4s, vW13A.4s\n"
-        "fmla vV11A.4s, vU23A.4s, vW23A.4s\n"
-        "ldr qU44A, [uptr3, u_col_stride3]\n"
-        "fmla vV12A.4s, vU23A.4s, vW22A.4s\n"
-        "ldr qU43A, [uptr3, u_col_stride2]\n"
-        "fmul vV21A.4s, vU23A.4s, vW13A.4s\n"
-        "ldr qU42A, [uptr3, u_col_stride1]\n"
-        "fmla vV22A.4s, vU23A.4s, vW12A.4s\n"
-        "ldr qU11A, [%x[uptr0]], #0x10\n"
-        "fmla vV11A.4s, vU22A.4s, vW22A.4s\n"
-        "ldr qU21A, [uptr1], #0x10\n"
-        "fmla vV12A.4s, vU22A.4s, vW21A.4s\n"
-        "ldr qU31A, [uptr2], #0x10\n"
-        "fmla vV21A.4s, vU22A.4s, vW12A.4s\n"
-        "ldr qU41A, [uptr3], #0x10\n"
-        "fmla vV22A.4s, vU22A.4s, vW11A.4s\n"
-        "ldr qU14B, [%x[uptr0], u_col_stride3]\n"
-        "fmla vV12A.4s, vU34A.4s, vW33A.4s\n"
-        "ldr qW13B, [%x[wptr0], w_col_stride2]\n"
-        "fmla vV22A.4s, vU34A.4s, vW23A.4s\n"
-        "ldr qU13B, [%x[uptr0], u_col_stride2]\n"
-        "fmla vV11A.4s, vU33A.4s, vW33A.4s\n"
-        "ldr qW12B, [%x[wptr0], w_col_stride1]\n"
-        "fmla vV12A.4s, vU33A.4s, vW32A.4s\n"
-        "ldr qU12B, [%x[uptr0], u_col_stride1]\n"
-        "fmla vV21A.4s, vU33A.4s, vW23A.4s\n"
-        "ldr qW11B, [%x[wptr0]], #0x10\n"
-        "fmla vV22A.4s, vU33A.4s, vW22A.4s\n"
-        "ldr qU24B, [uptr1, u_col_stride3]\n"
-        "fmla vV11A.4s, vU32A.4s, vW32A.4s\n"
-        "ldr qW23B, [wptr1, w_col_stride2]\n"
-        "fmla vV12A.4s, vU32A.4s, vW31A.4s\n"
-        "str qV12A, [%x[vptr0], %x[v_col_stride]]\n"
-        "fmla vV21A.4s, vU32A.4s, vW22A.4s\n"
-        "ldr qU23B, [uptr1, u_col_stride2]\n"
-        "fmla vV22A.4s, vU32A.4s, vW21A.4s\n"
-        "ldr qW22B, [wptr1, w_col_stride1]\n"
-        "fmla vV22A.4s, vU44A.4s, vW33A.4s\n"
-        "ldr qU22B, [uptr1, u_col_stride1]\n"
-        "fmla vV21A.4s, vU43A.4s, vW33A.4s\n"
-        "ldr qW21B, [wptr1], #0x10\n"
-        "fmla vV22A.4s, vU43A.4s, vW32A.4s\n"
-        "ldr qU34B, [uptr2, u_col_stride3]\n"
-        "fmla vV21A.4s, vU42A.4s, vW32A.4s\n"
-        "ldr qW33B, [wptr2, w_col_stride2]\n"
-        "fmla vV22A.4s, vU42A.4s, vW31A.4s\n"
-        "str qV22A, [vptr1, %x[v_col_stride]]\n"
-        "fmla vV11A.4s, vU11A.4s, vW11A.4s\n"
-        "ldr qU33B, [uptr2, u_col_stride2]\n"
-        "fmla vV11A.4s, vU21A.4s, vW21A.4s\n"
-        "ldr qW32B, [wptr2, w_col_stride1]\n"
-        "fmla vV21A.4s, vU21A.4s, vW11A.4s\n"
-        "ldr qU32B, [uptr2, u_col_stride1]\n"
-        "fmla vV11A.4s, vU31A.4s, vW31A.4s\n"
-        "str qV11A, [%x[vptr0]], #0x10\n"
-        "fmla vV21A.4s, vU31A.4s, vW21A.4s\n"
-        "ldr qW31B, [wptr2], #0x10\n"
-        "fmla vV21A.4s, vU41A.4s, vW31A.4s\n"
-        "str qV21A, [vptr1], #0x10\n"
-
-        "fmul vV12B.4s, vU14B.4s, vW13B.4s\n"
-        "fmul vV11B.4s, vU13B.4s, vW13B.4s\n"
-        "fmla vV12B.4s, vU13B.4s, vW12B.4s\n"
-        "fmla vV11B.4s, vU12B.4s, vW12B.4s\n"
-        "fmla vV12B.4s, vU12B.4s, vW11B.4s\n"
-        "fmla vV12B.4s, vU24B.4s, vW23B.4s\n"
-        "fmul vV22B.4s, vU24B.4s, vW13B.4s\n"
-        "fmla vV11B.4s, vU23B.4s, vW23B.4s\n"
-        "ldr qU44B, [uptr3, u_col_stride3]\n"
-        "fmla vV12B.4s, vU23B.4s, vW22B.4s\n"
-        "ldr qU43B, [uptr3, u_col_stride2]\n"
-        "fmul vV21B.4s, vU23B.4s, vW13B.4s\n"
-        "ldr qU42B, [uptr3, u_col_stride1]\n"
-        "fmla vV22B.4s, vU23B.4s, vW12B.4s\n"
-        "ldr qU11B, [%x[uptr0]], #0x10\n"
-        "fmla vV11B.4s, vU22B.4s, vW22B.4s\n"
-        "ldr qU21B, [uptr1], #0x10\n"
-        "fmla vV12B.4s, vU22B.4s, vW21B.4s\n"
-        "ldr qU31B, [uptr2], #0x10\n"
-        "fmla vV21B.4s, vU22B.4s, vW12B.4s\n"
-        "ldr qU41B, [uptr3], #0x10\n"
-        "fmla vV22B.4s, vU22B.4s, vW11B.4s\n"
-        "fmla vV12B.4s, vU34B.4s, vW33B.4s\n"
-        "fmla vV22B.4s, vU34B.4s, vW23B.4s\n"
-        "fmla vV11B.4s, vU33B.4s, vW33B.4s\n"
-        "fmla vV12B.4s, vU33B.4s, vW32B.4s\n"
-        "fmla vV21B.4s, vU33B.4s, vW23B.4s\n"
-        "fmla vV22B.4s, vU33B.4s, vW22B.4s\n"
-        "fmla vV11B.4s, vU32B.4s, vW32B.4s\n"
-        "fmla vV12B.4s, vU32B.4s, vW31B.4s\n"
-        "str qV12B, [%x[vptr0], %x[v_col_stride]]\n"
-        "fmla vV21B.4s, vU32B.4s, vW22B.4s\n"
-        "fmla vV22B.4s, vU32B.4s, vW21B.4s\n"
-        "fmla vV22B.4s, vU44B.4s, vW33B.4s\n"
-        "fmla vV21B.4s, vU43B.4s, vW33B.4s\n"
-        "fmla vV22B.4s, vU43B.4s, vW32B.4s\n"
-        "fmla vV21B.4s, vU42B.4s, vW32B.4s\n"
-        "fmla vV22B.4s, vU42B.4s, vW31B.4s\n"
-        "str qV22B, [vptr1, %x[v_col_stride]]\n"
-        "fmla vV11B.4s, vU11B.4s, vW11B.4s\n"
-        "fmla vV11B.4s, vU21B.4s, vW21B.4s\n"
-        "fmla vV21B.4s, vU21B.4s, vW11B.4s\n"
-        "fmla vV11B.4s, vU31B.4s, vW31B.4s\n"
-        "str qV11B, [%x[vptr0]], #0x10\n"
-        "fmla vV21B.4s, vU31B.4s, vW21B.4s\n"
-        "fmla vV21B.4s, vU41B.4s, vW31B.4s\n"
-        "str qV21B, [vptr1], #0x10\n"
-        "b 5f\n"
-
-      "4:"  // Odd number of iterations
-        "fmul vV11A.4s, vU13A.4s, vW13A.4s\n"
-        "fmla vV12A.4s, vU13A.4s, vW12A.4s\n"
-        "fmla vV11A.4s, vU12A.4s, vW12A.4s\n"
-        "fmla vV12A.4s, vU12A.4s, vW11A.4s\n"
-        "fmla vV12A.4s, vU24A.4s, vW23A.4s\n"
-        "fmul vV22A.4s, vU24A.4s, vW13A.4s\n"
-        "fmla vV11A.4s, vU23A.4s, vW23A.4s\n"
-        "ldr qU44A, [uptr3, u_col_stride3]\n"
-        "fmla vV12A.4s, vU23A.4s, vW22A.4s\n"
-        "ldr qU43A, [uptr3, u_col_stride2]\n"
-        "fmul vV21A.4s, vU23A.4s, vW13A.4s\n"
-        "ldr qU42A, [uptr3, u_col_stride1]\n"
-        "fmla vV22A.4s, vU23A.4s, vW12A.4s\n"
-        "ldr qU11A, [%x[uptr0]], #0x10\n"
-        "fmla vV11A.4s, vU22A.4s, vW22A.4s\n"
-        "ldr qU21A, [uptr1], #0x10\n"
-        "fmla vV12A.4s, vU22A.4s, vW21A.4s\n"
-        "ldr qU31A, [uptr2], #0x10\n"
-        "fmla vV21A.4s, vU22A.4s, vW12A.4s\n"
-        "ldr qU41A, [uptr3], #0x10\n"
-        "fmla vV22A.4s, vU22A.4s, vW11A.4s\n"
-        "fmla vV12A.4s, vU34A.4s, vW33A.4s\n"
-        "fmla vV22A.4s, vU34A.4s, vW23A.4s\n"
-        "fmla vV11A.4s, vU33A.4s, vW33A.4s\n"
-        "fmla vV12A.4s, vU33A.4s, vW32A.4s\n"
-        "fmla vV21A.4s, vU33A.4s, vW23A.4s\n"
-        "fmla vV22A.4s, vU33A.4s, vW22A.4s\n"
-        "fmla vV11A.4s, vU32A.4s, vW32A.4s\n"
-        "fmla vV12A.4s, vU32A.4s, vW31A.4s\n"
-        "str qV12A, [%x[vptr0], %x[v_col_stride]]\n"
-        "fmla vV21A.4s, vU32A.4s, vW22A.4s\n"
-        "fmla vV22A.4s, vU32A.4s, vW21A.4s\n"
-        "fmla vV22A.4s, vU44A.4s, vW33A.4s\n"
-        "fmla vV21A.4s, vU43A.4s, vW33A.4s\n"
-        "fmla vV22A.4s, vU43A.4s, vW32A.4s\n"
-        "fmla vV21A.4s, vU42A.4s, vW32A.4s\n"
-        "fmla vV22A.4s, vU42A.4s, vW31A.4s\n"
-        "str qV22A, [vptr1, %x[v_col_stride]]\n"
-        "fmla vV11A.4s, vU11A.4s, vW11A.4s\n"
-        "fmla vV11A.4s, vU21A.4s, vW21A.4s\n"
-        "fmla vV21A.4s, vU21A.4s, vW11A.4s\n"
-        "fmla vV11A.4s, vU31A.4s, vW31A.4s\n"
-        "str qV11A, [%x[vptr0]], #0x10\n"
-        "fmla vV21A.4s, vU31A.4s, vW21A.4s\n"
-        "fmla vV21A.4s, vU41A.4s, vW31A.4s\n"
-        "str qV21A, [vptr1], #0x10\n"
-
-      "5:"  // End of method
-
-      ".unreq qW11B\n" ".unreq qW33A\n" ".unreq qU32B\n"
-      ".unreq qU44B\n" ".unreq qW21A\n" ".unreq qU21B\n" ".unreq qU32A\n"
-      ".unreq qU43A\n" ".unreq qV21B\n"
-      ".unreq qU24A\n" ".unreq qU44A\n" ".unreq qU33B\n"
-      ".unreq qU31A\n" ".unreq qV12B\n" ".unreq qU23A\n"
-      ".unreq qW31B\n" ".unreq qV22A\n" ".unreq qV12A\n" ".unreq qW21B\n"
-      ".unreq qU22B\n" ".unreq qU34A\n" ".unreq qU13B\n" ".unreq qU13A\n"
-      ".unreq qU34B\n" ".unreq qU22A\n" ".unreq qU24B\n" ".unreq qU31B\n"
-      ".unreq qW12B\n" ".unreq qW13A\n" ".unreq qV21A\n" ".unreq qV11B\n"
-      ".unreq qW32A\n" ".unreq qW32B\n" ".unreq qW31A\n" ".unreq qV22B\n"
-      ".unreq qW11A\n" ".unreq qW13B\n" ".unreq qU14A\n"
-      ".unreq qU33A\n" ".unreq qW33B\n" ".unreq qW22A\n" ".unreq qU23B\n"
-      ".unreq qU12A\n" ".unreq qU42A\n" ".unreq qU41A\n" ".unreq qU42B\n"
-      ".unreq qW23A\n" ".unreq qW23B\n" ".unreq qU43B\n" ".unreq qU11A\n"
-      ".unreq qU12B\n" ".unreq qW12A\n" ".unreq qU41B\n" ".unreq qV11A\n"
-      ".unreq qW22B\n" ".unreq qU11B\n" ".unreq qU14B\n" ".unreq qU21A\n"
-      ".unreq vW11B\n" ".unreq vW33A\n" ".unreq vU32B\n"
-      ".unreq vU44B\n" ".unreq vW21A\n" ".unreq vU21B\n" ".unreq vU32A\n"
-      ".unreq vU43A\n" ".unreq vV21B\n"
-      ".unreq vU24A\n" ".unreq vU44A\n" ".unreq vU33B\n"
-      ".unreq vU31A\n" ".unreq vV12B\n" ".unreq vU23A\n"
-      ".unreq vW31B\n" ".unreq vV22A\n" ".unreq vV12A\n" ".unreq vW21B\n"
-      ".unreq vU22B\n" ".unreq vU34A\n" ".unreq vU13B\n" ".unreq vU13A\n"
-      ".unreq vU34B\n" ".unreq vU22A\n" ".unreq vU24B\n" ".unreq vU31B\n"
-      ".unreq vW12B\n" ".unreq vW13A\n" ".unreq vV21A\n" ".unreq vV11B\n"
-      ".unreq vW32A\n" ".unreq vW32B\n" ".unreq vW31A\n" ".unreq vV22B\n"
-      ".unreq vW11A\n" ".unreq vW13B\n" ".unreq vU14A\n"
-      ".unreq vU33A\n" ".unreq vW33B\n" ".unreq vW22A\n" ".unreq vU23B\n"
-      ".unreq vU12A\n" ".unreq vU42A\n" ".unreq vU41A\n" ".unreq vU42B\n"
-      ".unreq vW23A\n" ".unreq vW23B\n" ".unreq vU43B\n" ".unreq vU11A\n"
-      ".unreq vU12B\n" ".unreq vW12A\n" ".unreq vU41B\n" ".unreq vV11A\n"
-      ".unreq vW22B\n" ".unreq vU11B\n" ".unreq vU14B\n" ".unreq vU21A\n"
-      ".unreq u_col_stride1\n" ".unreq u_col_stride2\n"
-      ".unreq u_col_stride3\n"
-      ".unreq uptr1\n" ".unreq uptr2\n" ".unreq uptr3\n"
-      ".unreq wptr1\n" ".unreq wptr2\n" ".unreq vptr1\n"
-      ".unreq w_col_stride1\n" ".unreq w_col_stride2\n"
-
-      : [uptr0] "+r" (uptr0), [vptr0] "+r" (vptr0), [wptr0] "+r" (wptr0),
-        [iters] "+r" (n_iters)
-      : [u_row_stride] "r" (in_row_stride * sizeof(float)),
-        [u_col_stride] "r" (in_col_stride * sizeof(float)),
-        [v_row_stride] "r" (out_row_stride * sizeof(float)),
-        [v_col_stride] "r" (out_col_stride * sizeof(float)),
-        [w_row_stride] "r" (weight_row_stride * sizeof(float)),
-        [w_col_stride] "r" (weight_col_stride * sizeof(float)),
-        [odd_tail] "r" (odd_tail)
-      : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "cc",
-        "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
-        "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
-        "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "memory"
-    );
-  }
-
-  if (channels_remaining)
-  {
-    // Fall back on the unoptimised version to clean up the tail
-    ConvImpl::process_tile<false>(
-        channels_remaining,
-        wptr0, weight_row_stride, weight_col_stride,
-        uptr0, in_row_stride, in_col_stride,
-        vptr0, out_row_stride, out_col_stride,
-        0, 0, 0, 0, 0, 0
-    );
-  }
+template <>
+template <>
+void Conv::execute_tile<ActivationFunction::ReLU6>(
+  int n_channels,
+  const void *weight_bias_ptr,
+  const float *input,
+  const unsigned int input_row_stride,
+  const unsigned int input_col_stride,
+  float *output,
+  const unsigned int output_row_stride,
+  const unsigned int output_col_stride
+)
+{
+  __asm __volatile(
+    "add x21, %[inptr0], %[input_row_stride]\n"
+    "add x23, %[input_col_stride1], %[input_col_stride1]\n"
+    "add x24, %[outptr0], %[output_row_stride]\n"
+    "add x27, x21, %[input_row_stride]\n"
+    "add x22, x23, %[input_col_stride1]\n"
+    "and x25, %[n_channels], #3\n"
+    "add x28, x27, %[input_row_stride]\n"
+    "lsr x26, %[n_channels], #2\n"
+    "cbz x26, 4f\n"
+    "1:\n"
+    "ldr q19, [%[wbptr]]\n"
+    "subs x26, x26, #1\n"
+    "mov v3.16b, v19.16b\n"
+    "ldr q12, [%[wbptr], #16]\n"
+    "mov v1.16b, v19.16b\n"
+    "ldr q11, [%[wbptr], #32]\n"
+    "mov v2.16b, v19.16b\n"
+    "ldr q10, [%[wbptr], #48]\n"
+    "mov v0.16b, v19.16b\n"
+    "ldr q13, [%[wbptr], #64]\n"
+    "ldr q23, [%[wbptr], #80]\n"
+    "ldr q15, [%[wbptr], #96]\n"
+    "ldr q20, [%[wbptr], #112]\n"
+    "ldr q21, [%[wbptr], #128]\n"
+    "ldr q14, [%[wbptr], #144]\n"
+    "ldr q16, [%[inptr0]]\n"
+    "fmla v3.4s, v16.4s, v12.4s\n"
+    "ldr q28, [x21]\n"
+    "fmla v1.4s, v28.4s, v12.4s\n"
+    "ldr q22, [%[inptr0], %[input_col_stride1]]\n"
+    "fmla v2.4s, v22.4s, v12.4s\n"
+    "ldr q24, [x27]\n"
+    "fmla v3.4s, v28.4s, v13.4s\n"
+    "ldr q8, [x21, %[input_col_stride1]]\n"
+    "ldr q9, [%[inptr0], x23]\n"
+    "ldr q18, [x28]\n"
+    "ldr q6, [x27, %[input_col_stride1]]\n"
+    "fmla v3.4s, v22.4s, v11.4s\n"
+    "beq 3f\n"
+    "2:\n"
+    "fmla v3.4s, v24.4s, v20.4s\n"
+    "ldr q25, [x21, x23]\n"
+    "fmla v1.4s, v24.4s, v13.4s\n"
+    "ldr q28, [%[inptr0], x22]\n"
+    "fmla v2.4s, v8.4s, v13.4s\n"
+    "ldr q24, [x28, %[input_col_stride1]]\n"
+    "fmla v3.4s, v8.4s, v23.4s\n"
+    "ldr q27, [x27, x23]\n"
+    "fmla v1.4s, v8.4s, v11.4s\n"
+    "ldr q7, [x21, x22]\n"
+    "fmla v0.4s, v8.4s, v12.4s\n"
+    "ldr q17, [x28, x23]\n"
+    "fmla v3.4s, v9.4s, v10.4s\n"
+    "ldr q5, [x27, x22]\n"
+    "fmla v2.4s, v9.4s, v11.4s\n"
+    "ldr q4, [x28, x22]\n"
+    "fmla v1.4s, v18.4s, v20.4s\n"
+    "add %[wbptr], %[wbptr], #160\n"
+    "fmla v3.4s, v6.4s, v21.4s\n"
+    "ldr q19, [%[wbptr]]\n"
+    "fmla v2.4s, v6.4s, v20.4s\n"
+    "ldr q12, [%[wbptr], #16]\n"
+    "fmla v1.4s, v6.4s, v23.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v0.4s, v6.4s, v13.4s\n"
+    "add %[inptr0], %[inptr0], #16\n"
+    "fmla v3.4s, v25.4s, v15.4s\n"
+    "ldr q16, [%[inptr0]]\n"
+    "fmla v1.4s, v25.4s, v10.4s\n"
+    "ldr q22, [%[inptr0], %[input_col_stride1]]\n"
+    "fmla v2.4s, v25.4s, v23.4s\n"
+    "ldr q13, [%[wbptr], #64]\n"
+    "fmla v0.4s, v25.4s, v11.4s\n"
+    "ldr q9, [%[inptr0], x23]\n"
+    "fmla v1.4s, v24.4s, v21.4s\n"
+    "add x21, x21, #16\n"
+    "fmla v2.4s, v28.4s, v10.4s\n"
+    "ldr q28, [x21]\n"
+    "fmla v0.4s, v24.4s, v20.4s\n"
+    "ldr q11, [%[wbptr], #32]\n"
+    "fmla v3.4s, v27.4s, v14.4s\n"
+    "ldr q8, [x21, %[input_col_stride1]]\n"
+    "fmla v1.4s, v27.4s, v15.4s\n"
+    "add x27, x27, #16\n"
+    "fmla v2.4s, v27.4s, v21.4s\n"
+    "ldr q20, [%[wbptr], #112]\n"
+    "fmla v0.4s, v27.4s, v23.4s\n"
+    "ldr q24, [x27]\n"
+    "fmla v1.4s, v17.4s, v14.4s\n"
+    "ldr q6, [x27, %[input_col_stride1]]\n"
+    "fmla v2.4s, v7.4s, v15.4s\n"
+    "add x28, x28, #16\n"
+    "fmla v0.4s, v7.4s, v10.4s\n"
+    "ldr q23, [%[wbptr], #80]\n"
+    "movi v25.16b, #0\n"
+    "ldr q18, [x28]\n"
+    "fmla v2.4s, v5.4s, v14.4s\n"
+    "subs x26, x26, #1\n"
+    "fmla v0.4s, v17.4s, v21.4s\n"
+    "ldr q10, [%[wbptr], #48]\n"
+    "fmov v26.4s, #6.0\n"
+    "fmax v3.4s, v3.4s, v25.4s\n"
+    "fmax v2.4s, v2.4s, v25.4s\n"
+    "fmax v1.4s, v1.4s, v25.4s\n"
+    "fmla v0.4s, v5.4s, v15.4s\n"
+    "ldr q21, [%[wbptr], #128]\n"
+    "fmin v3.4s, v3.4s, v26.4s\n"
+    "fmin v2.4s, v2.4s, v26.4s\n"
+    "fmin v1.4s, v1.4s, v26.4s\n"
+    "str q3, [%[outptr0]]\n"
+    "str q2, [%[outptr0], %[output_col_stride1]]\n"
+    "fmla v0.4s, v4.4s, v14.4s\n"
+    "str q1, [x24]\n"
+    "mov v3.16b, v19.16b\n"
+    "mov v1.16b, v19.16b\n"
+    "ldr q15, [%[wbptr], #96]\n"
+    "fmax v0.4s, v0.4s, v25.4s\n"
+    "ldr q14, [%[wbptr], #144]\n"
+    "mov v2.16b, v19.16b\n"
+    "add %[outptr0], %[outptr0], #16\n"
+    "fmin v0.4s, v0.4s, v26.4s\n"
+    "fmla v3.4s, v16.4s, v12.4s\n"
+    "fmla v1.4s, v28.4s, v12.4s\n"
+    "fmla v2.4s, v22.4s, v12.4s\n"
+    "str q0, [x24, %[output_col_stride1]]\n"
+    "mov v0.16b, v19.16b\n"
+    "fmla v3.4s, v28.4s, v13.4s\n"
+    "add x24, x24, #16\n"
+    "fmla v3.4s, v22.4s, v11.4s\n"
+    "bne 2b\n"
+    "3:\n"
+    "fmla v3.4s, v24.4s, v20.4s\n"
+    "ldr q25, [x21, x23]\n"
+    "fmla v1.4s, v24.4s, v13.4s\n"
+    "ldr q28, [%[inptr0], x22]\n"
+    "fmla v2.4s, v8.4s, v13.4s\n"
+    "ldr q24, [x28, %[input_col_stride1]]\n"
+    "fmla v3.4s, v8.4s, v23.4s\n"
+    "ldr q27, [x27, x23]\n"
+    "fmla v1.4s, v8.4s, v11.4s\n"
+    "ldr q7, [x21, x22]\n"
+    "fmla v0.4s, v8.4s, v12.4s\n"
+    "ldr q17, [x28, x23]\n"
+    "fmla v3.4s, v9.4s, v10.4s\n"
+    "ldr q5, [x27, x22]\n"
+    "fmla v2.4s, v9.4s, v11.4s\n"
+    "ldr q4, [x28, x22]\n"
+    "fmla v1.4s, v18.4s, v20.4s\n"
+    "add %[wbptr], %[wbptr], #160\n"
+    "fmla v3.4s, v6.4s, v21.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v2.4s, v6.4s, v20.4s\n"
+    "add %[inptr0], %[inptr0], #16\n"
+    "fmla v1.4s, v6.4s, v23.4s\n"
+    "add x21, x21, #16\n"
+    "fmla v0.4s, v6.4s, v13.4s\n"
+    "add x27, x27, #16\n"
+    "fmla v3.4s, v25.4s, v15.4s\n"
+    "add x28, x28, #16\n"
+    "fmla v1.4s, v25.4s, v10.4s\n"
+    "fmla v2.4s, v25.4s, v23.4s\n"
+    "fmla v0.4s, v25.4s, v11.4s\n"
+    "movi v25.16b, #0\n"
+    "fmla v3.4s, v27.4s, v14.4s\n"
+    "fmov v26.4s, #6.0\n"
+    "fmla v2.4s, v28.4s, v10.4s\n"
+    "fmla v1.4s, v24.4s, v21.4s\n"
+    "fmla v0.4s, v24.4s, v20.4s\n"
+    "fmax v3.4s, v3.4s, v25.4s\n"
+    "fmla v1.4s, v27.4s, v15.4s\n"
+    "fmla v2.4s, v27.4s, v21.4s\n"
+    "fmla v0.4s, v27.4s, v23.4s\n"
+    "fmin v3.4s, v3.4s, v26.4s\n"
+    "str q3, [%[outptr0]]\n"
+    "fmla v2.4s, v7.4s, v15.4s\n"
+    "fmla v0.4s, v7.4s, v10.4s\n"
+    "fmla v1.4s, v17.4s, v14.4s\n"
+    "fmla v2.4s, v5.4s, v14.4s\n"
+    "fmla v0.4s, v17.4s, v21.4s\n"
+    "fmax v1.4s, v1.4s, v25.4s\n"
+    "fmax v2.4s, v2.4s, v25.4s\n"
+    "fmla v0.4s, v5.4s, v15.4s\n"
+    "fmin v1.4s, v1.4s, v26.4s\n"
+    "fmin v2.4s, v2.4s, v26.4s\n"
+    "str q1, [x24]\n"
+    "str q2, [%[outptr0], %[output_col_stride1]]\n"
+    "fmla v0.4s, v4.4s, v14.4s\n"
+    "add %[outptr0], %[outptr0], #16\n"
+    "fmax v0.4s, v0.4s, v25.4s\n"
+    "fmin v0.4s, v0.4s, v26.4s\n"
+    "str q0, [x24, %[output_col_stride1]]\n"
+    "add x24, x24, #16\n"
+    "4:\n"
+    "cbz x25, 7f\n"
+    "ldr s19, [%[wbptr]]\n"
+    "mov v3.16b, v19.16b\n"
+    "ldr s12, [%[wbptr], #4]\n"
+    "mov v1.16b, v19.16b\n"
+    "ldr s11, [%[wbptr], #8]\n"
+    "mov v2.16b, v19.16b\n"
+    "ldr s10, [%[wbptr], #12]\n"
+    "mov v0.16b, v19.16b\n"
+    "ldr s13, [%[wbptr], #16]\n"
+    "ldr s23, [%[wbptr], #20]\n"
+    "subs x25, x25, #1\n"
+    "ldr s15, [%[wbptr], #24]\n"
+    "ldr s20, [%[wbptr], #28]\n"
+    "ldr s21, [%[wbptr], #32]\n"
+    "ldr s14, [%[wbptr], #36]\n"
+    "ldr s16, [%[inptr0]]\n"
+    "ldr s28, [x21]\n"
+    "fmla v3.4s, v16.4s, v12.4s\n"
+    "ldr s22, [%[inptr0], %[input_col_stride1]]\n"
+    "fmla v1.4s, v28.4s, v12.4s\n"
+    "ldr s24, [x27]\n"
+    "fmla v2.4s, v22.4s, v12.4s\n"
+    "ldr s8, [x21, %[input_col_stride1]]\n"
+    "fmla v3.4s, v28.4s, v13.4s\n"
+    "ldr s9, [%[inptr0], x23]\n"
+    "ldr s18, [x28]\n"
+    "ldr s6, [x27, %[input_col_stride1]]\n"
+    "fmla v3.4s, v22.4s, v11.4s\n"
+    "beq 6f\n"
+    "5:\n"
+    "fmla v3.4s, v24.4s, v20.4s\n"
+    "ldr s25, [x21, x23]\n"
+    "fmla v1.4s, v24.4s, v13.4s\n"
+    "ldr s28, [%[inptr0], x22]\n"
+    "fmla v2.4s, v8.4s, v13.4s\n"
+    "ldr s24, [x28, %[input_col_stride1]]\n"
+    "fmla v3.4s, v8.4s, v23.4s\n"
+    "ldr s27, [x27, x23]\n"
+    "fmla v1.4s, v8.4s, v11.4s\n"
+    "ldr s7, [x21, x22]\n"
+    "fmla v0.4s, v8.4s, v12.4s\n"
+    "ldr s17, [x28, x23]\n"
+    "fmla v3.4s, v9.4s, v10.4s\n"
+    "ldr s5, [x27, x22]\n"
+    "fmla v2.4s, v9.4s, v11.4s\n"
+    "ldr s4, [x28, x22]\n"
+    "fmla v1.4s, v18.4s, v20.4s\n"
+    "add %[wbptr], %[wbptr], #40\n"
+    "fmla v3.4s, v6.4s, v21.4s\n"
+    "ldr s19, [%[wbptr]]\n"
+    "fmla v2.4s, v6.4s, v20.4s\n"
+    "ldr s12, [%[wbptr], #4]\n"
+    "fmla v1.4s, v6.4s, v23.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v0.4s, v6.4s, v13.4s\n"
+    "add %[inptr0], %[inptr0], #4\n"
+    "fmla v3.4s, v25.4s, v15.4s\n"
+    "ldr s16, [%[inptr0]]\n"
+    "fmla v1.4s, v25.4s, v10.4s\n"
+    "ldr s22, [%[inptr0], %[input_col_stride1]]\n"
+    "fmla v2.4s, v25.4s, v23.4s\n"
+    "ldr s13, [%[wbptr], #16]\n"
+    "fmla v0.4s, v25.4s, v11.4s\n"
+    "ldr s9, [%[inptr0], x23]\n"
+    "fmla v1.4s, v24.4s, v21.4s\n"
+    "add x21, x21, #4\n"
+    "fmla v2.4s, v28.4s, v10.4s\n"
+    "ldr s28, [x21]\n"
+    "fmla v0.4s, v24.4s, v20.4s\n"
+    "ldr s11, [%[wbptr], #8]\n"
+    "fmla v3.4s, v27.4s, v14.4s\n"
+    "ldr s8, [x21, %[input_col_stride1]]\n"
+    "fmla v1.4s, v27.4s, v15.4s\n"
+    "add x27, x27, #4\n"
+    "fmla v2.4s, v27.4s, v21.4s\n"
+    "ldr s20, [%[wbptr], #28]\n"
+    "fmla v0.4s, v27.4s, v23.4s\n"
+    "ldr s24, [x27]\n"
+    "fmla v1.4s, v17.4s, v14.4s\n"
+    "ldr s6, [x27, %[input_col_stride1]]\n"
+    "fmla v2.4s, v7.4s, v15.4s\n"
+    "add x28, x28, #4\n"
+    "fmla v0.4s, v7.4s, v10.4s\n"
+    "ldr s23, [%[wbptr], #20]\n"
+    "movi v25.16b, #0\n"
+    "ldr s18, [x28]\n"
+    "fmla v2.4s, v5.4s, v14.4s\n"
+    "subs x25, x25, #1\n"
+    "fmla v0.4s, v17.4s, v21.4s\n"
+    "ldr s10, [%[wbptr], #12]\n"
+    "fmov v26.4s, #6.0\n"
+    "fmax v3.4s, v3.4s, v25.4s\n"
+    "fmax v2.4s, v2.4s, v25.4s\n"
+    "fmax v1.4s, v1.4s, v25.4s\n"
+    "fmla v0.4s, v5.4s, v15.4s\n"
+    "ldr s21, [%[wbptr], #32]\n"
+    "fmin v3.4s, v3.4s, v26.4s\n"
+    "fmin v2.4s, v2.4s, v26.4s\n"
+    "fmin v1.4s, v1.4s, v26.4s\n"
+    "str s3, [%[outptr0]]\n"
+    "str s2, [%[outptr0], %[output_col_stride1]]\n"
+    "fmla v0.4s, v4.4s, v14.4s\n"
+    "str s1, [x24]\n"
+    "mov v3.16b, v19.16b\n"
+    "mov v1.16b, v19.16b\n"
+    "ldr s15, [%[wbptr], #24]\n"
+    "fmax v0.4s, v0.4s, v25.4s\n"
+    "ldr s14, [%[wbptr], #36]\n"
+    "mov v2.16b, v19.16b\n"
+    "add %[outptr0], %[outptr0], #4\n"
+    "fmin v0.4s, v0.4s, v26.4s\n"
+    "fmla v3.4s, v16.4s, v12.4s\n"
+    "fmla v1.4s, v28.4s, v12.4s\n"
+    "fmla v2.4s, v22.4s, v12.4s\n"
+    "str s0, [x24, %[output_col_stride1]]\n"
+    "mov v0.16b, v19.16b\n"
+    "fmla v3.4s, v28.4s, v13.4s\n"
+    "add x24, x24, #4\n"
+    "fmla v3.4s, v22.4s, v11.4s\n"
+    "bne 5b\n"
+    "6:\n"
+    "fmla v3.4s, v24.4s, v20.4s\n"
+    "ldr s25, [x21, x23]\n"
+    "fmla v1.4s, v24.4s, v13.4s\n"
+    "ldr s28, [%[inptr0], x22]\n"
+    "fmla v2.4s, v8.4s, v13.4s\n"
+    "ldr s24, [x28, %[input_col_stride1]]\n"
+    "fmla v3.4s, v8.4s, v23.4s\n"
+    "ldr s27, [x27, x23]\n"
+    "fmla v1.4s, v8.4s, v11.4s\n"
+    "ldr s7, [x21, x22]\n"
+    "fmla v0.4s, v8.4s, v12.4s\n"
+    "ldr s17, [x28, x23]\n"
+    "fmla v3.4s, v9.4s, v10.4s\n"
+    "ldr s5, [x27, x22]\n"
+    "fmla v2.4s, v9.4s, v11.4s\n"
+    "ldr s4, [x28, x22]\n"
+    "fmla v1.4s, v18.4s, v20.4s\n"
+    "add %[wbptr], %[wbptr], #40\n"
+    "fmla v3.4s, v6.4s, v21.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v2.4s, v6.4s, v20.4s\n"
+    "add %[inptr0], %[inptr0], #4\n"
+    "fmla v1.4s, v6.4s, v23.4s\n"
+    "add x21, x21, #4\n"
+    "fmla v0.4s, v6.4s, v13.4s\n"
+    "add x27, x27, #4\n"
+    "fmla v3.4s, v25.4s, v15.4s\n"
+    "add x28, x28, #4\n"
+    "fmla v1.4s, v25.4s, v10.4s\n"
+    "fmla v2.4s, v25.4s, v23.4s\n"
+    "fmla v0.4s, v25.4s, v11.4s\n"
+    "movi v25.16b, #0\n"
+    "fmla v3.4s, v27.4s, v14.4s\n"
+    "fmov v26.4s, #6.0\n"
+    "fmla v2.4s, v28.4s, v10.4s\n"
+    "fmla v1.4s, v24.4s, v21.4s\n"
+    "fmla v0.4s, v24.4s, v20.4s\n"
+    "fmax v3.4s, v3.4s, v25.4s\n"
+    "fmla v1.4s, v27.4s, v15.4s\n"
+    "fmla v2.4s, v27.4s, v21.4s\n"
+    "fmla v0.4s, v27.4s, v23.4s\n"
+    "fmin v3.4s, v3.4s, v26.4s\n"
+    "str s3, [%[outptr0]]\n"
+    "fmla v2.4s, v7.4s, v15.4s\n"
+    "fmla v0.4s, v7.4s, v10.4s\n"
+    "fmla v1.4s, v17.4s, v14.4s\n"
+    "fmla v2.4s, v5.4s, v14.4s\n"
+    "fmla v0.4s, v17.4s, v21.4s\n"
+    "fmax v1.4s, v1.4s, v25.4s\n"
+    "fmax v2.4s, v2.4s, v25.4s\n"
+    "fmla v0.4s, v5.4s, v15.4s\n"
+    "fmin v1.4s, v1.4s, v26.4s\n"
+    "fmin v2.4s, v2.4s, v26.4s\n"
+    "str s1, [x24]\n"
+    "str s2, [%[outptr0], %[output_col_stride1]]\n"
+    "fmla v0.4s, v4.4s, v14.4s\n"
+    "add %[outptr0], %[outptr0], #4\n"
+    "fmax v0.4s, v0.4s, v25.4s\n"
+    "fmin v0.4s, v0.4s, v26.4s\n"
+    "str s0, [x24, %[output_col_stride1]]\n"
+    "add x24, x24, #4\n"
+    "7:\n"
+    : [outptr0] "+r" (output), [inptr0] "+r" (input), [wbptr] "+r" (weight_bias_ptr)
+    : [output_row_stride] "r" (output_row_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float))
+    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+  );
 }
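// Editorial sketch: the ReLU6 clamp the kernel above fuses into its epilogue,
// written with NEON intrinsics instead of raw assembly. The bounds mirror
// "movi vN.16b, #0" and "fmov vM.4s, #6.0"; each accumulator is clamped with
// fmax/fmin before being stored. AArch64-only, like the surrounding code.
#include <arm_neon.h>

static inline float32x4_t relu6_f32x4(float32x4_t acc)
{
    const float32x4_t zero = vdupq_n_f32(0.0f);   // movi: lower bound
    const float32x4_t six  = vdupq_n_f32(6.0f);   // fmov: upper bound
    return vminq_f32(vmaxq_f32(acc, zero), six);  // fmax then fmin
}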
 
 #endif  // __aarch64__
 
-template <>
-const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
+template class DepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float, float>;
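// Editorial note: reading the instantiation above with hypothetical parameter
// names (the real header defines its own) — 2x2 output tile, 3x3 kernel,
// 1x1 stride; the ninth argument, new in this patch, appears to give the
// bias its own type alongside the input and output types.
template <int OutputTileRows, int OutputTileCols,
          int KernelRows, int KernelCols,
          int StrideRows, int StrideCols,
          typename TIn, typename TBias, typename TOut>
class DepthwiseConvolutionSketch;

using Conv_2x2_3x3_s1_sketch =
    DepthwiseConvolutionSketch<2, 2, 3, 3, 1, 1, float, float, float>;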
 
-template <>
-const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
-  ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
-  ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
-  },
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
-  },
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
-
-template class DepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float>;
 }  // namespace depthwise
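// Editorial sketch of the dispatch change visible in this file: the deleted
// code selected among padding-specialised process_tile instantiations through
// lookup tables (tilefn_top/left/bottom/right), while the new kernels are
// specialised on the activation function and absorb the channel tail into the
// assembly itself. select_tile_fn and TileFnSketch are illustrations, not the
// library's actual dispatcher.
enum class ActivationSketch { None, ReLU, ReLU6 };

using TileFnSketch = void (*)(int n_channels, const void *weight_bias_ptr,
                              const float *input,
                              unsigned int input_row_stride,
                              unsigned int input_col_stride,
                              float *output,
                              unsigned int output_row_stride,
                              unsigned int output_col_stride);

static TileFnSketch select_tile_fn(ActivationSketch act, TileFnSketch none,
                                   TileFnSketch relu, TileFnSketch relu6)
{
    switch (act)
    {
        case ActivationSketch::ReLU:  return relu;   // execute_tile<ReLU>
        case ActivationSketch::ReLU6: return relu6;  // execute_tile<ReLU6>
        default:                      return none;   // execute_tile<None>
    }
}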
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp
index 9ce43f9..010dd81 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,74 +25,2785 @@
 
 namespace depthwise
 {
-using Conv = DepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float>;
-using ConvImpl = DepthwiseConvolutionImpl<2, 2, 3, 3, 2, 2, float, float>;
+
+using namespace neon_convolution_kernels;
+using Conv = DepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float, float>;
+
+#ifdef __aarch64__
+template <>
+template <>
+void Conv::execute_tile<ActivationFunction::None>(
+  int n_channels,
+  const void *weight_bias_ptr,
+  const float *input,
+  const unsigned int input_row_stride,
+  const unsigned int input_col_stride,
+  float *output,
+  const unsigned int output_row_stride,
+  const unsigned int output_col_stride
+)
+{
+  __asm __volatile(
+    "add x23, %[inptr0], %[input_row_stride]\n"
+    "add x19, %[input_col_stride1], %[input_col_stride1]\n"
+    "add x22, %[outptr0], %[output_row_stride]\n"
+    "add x24, x23, %[input_row_stride]\n"
+    "add x20, x19, %[input_col_stride1]\n"
+    "and x27, %[n_channels], #3\n"
+    "add x25, x24, %[input_row_stride]\n"
+    "add x21, x20, %[input_col_stride1]\n"
+    "lsr x28, %[n_channels], #2\n"
+    "add x26, x25, %[input_row_stride]\n"
+    "cbz x28, 4f\n"
+    "1:\n"
+    "ldr q14, [%[wbptr]]\n"
+    "subs x28, x28, #1\n"
+    "mov v12.16b, v14.16b\n"
+    "ldr q8, [%[wbptr], #16]\n"
+    "mov v10.16b, v14.16b\n"
+    "ldr q7, [%[wbptr], #32]\n"
+    "mov v11.16b, v14.16b\n"
+    "ldr q6, [%[wbptr], #48]\n"
+    "mov v9.16b, v14.16b\n"
+    "ldr q5, [%[wbptr], #64]\n"
+    "ldr q4, [%[wbptr], #80]\n"
+    "ldr q3, [%[wbptr], #96]\n"
+    "ldr q2, [%[wbptr], #112]\n"
+    "ldr q1, [%[wbptr], #128]\n"
+    "ldr q0, [%[wbptr], #144]\n"
+    "ldr q15, [%[inptr0]]\n"
+    "fmla v12.4s, v15.4s, v8.4s\n"
+    "ldr q20, [x23]\n"
+    "ldr q13, [%[inptr0], %[input_col_stride1]]\n"
+    "ldr q17, [x24]\n"
+    "fmla v10.4s, v17.4s, v8.4s\n"
+    "ldr q16, [x23, %[input_col_stride1]]\n"
+    "fmla v12.4s, v20.4s, v5.4s\n"
+    "ldr q18, [%[inptr0], x19]\n"
+    "ldr q14, [x25]\n"
+    "ldr q15, [x24, %[input_col_stride1]]\n"
+    "fmla v12.4s, v13.4s, v7.4s\n"
+    "fmla v12.4s, v17.4s, v2.4s\n"
+    "fmla v12.4s, v16.4s, v4.4s\n"
+    "fmla v12.4s, v18.4s, v6.4s\n"
+    "beq 3f\n"
+    "2:\n"
+    "fmla v11.4s, v18.4s, v8.4s\n"
+    "ldr q19, [x23, x19]\n"
+    "fmla v10.4s, v14.4s, v5.4s\n"
+    "ldr q20, [%[inptr0], x20]\n"
+    "fmla v12.4s, v15.4s, v1.4s\n"
+    "ldr q14, [x26]\n"
+    "fmla v11.4s, v19.4s, v5.4s\n"
+    "ldr q13, [x25, %[input_col_stride1]]\n"
+    "fmla v10.4s, v15.4s, v7.4s\n"
+    "ldr q17, [x24, x19]\n"
+    "fmla v12.4s, v19.4s, v3.4s\n"
+    "ldr q19, [x23, x20]\n"
+    "fmla v11.4s, v20.4s, v7.4s\n"
+    "ldr q18, [%[inptr0], x21]\n"
+    "fmla v10.4s, v14.4s, v2.4s\n"
+    "ldr q16, [x26, %[input_col_stride1]]\n"
+    "fmla v12.4s, v17.4s, v0.4s\n"
+    "ldr q14, [x25, x19]\n"
+    "fmla v11.4s, v17.4s, v2.4s\n"
+    "ldr q15, [x24, x20]\n"
+    "fmla v10.4s, v13.4s, v4.4s\n"
+    "ldr q13, [x23, x21]\n"
+    "str q12, [%[outptr0]]\n"
+    "fmla v9.4s, v17.4s, v8.4s\n"
+    "fmla v11.4s, v19.4s, v4.4s\n"
+    "ldr q12, [x26, x19]\n"
+    "fmla v10.4s, v17.4s, v6.4s\n"
+    "ldr q20, [x25, x20]\n"
+    "fmla v9.4s, v14.4s, v5.4s\n"
+    "ldr q17, [x24, x21]\n"
+    "fmla v11.4s, v18.4s, v6.4s\n"
+    "ldr q19, [x26, x20]\n"
+    "fmla v10.4s, v16.4s, v1.4s\n"
+    "ldr q18, [x25, x21]\n"
+    "fmla v9.4s, v15.4s, v7.4s\n"
+    "ldr q16, [x26, x21]\n"
+    "fmla v11.4s, v15.4s, v1.4s\n"
+    "add %[wbptr], %[wbptr], #160\n"
+    "fmla v10.4s, v14.4s, v3.4s\n"
+    "ldr q14, [%[wbptr]]\n"
+    "fmla v9.4s, v12.4s, v2.4s\n"
+    "ldr q8, [%[wbptr], #16]\n"
+    "fmla v11.4s, v13.4s, v3.4s\n"
+    "ldr q7, [%[wbptr], #32]\n"
+    "fmla v10.4s, v12.4s, v0.4s\n"
+    "ldr q5, [%[wbptr], #64]\n"
+    "fmla v9.4s, v20.4s, v4.4s\n"
+    "ldr q2, [%[wbptr], #112]\n"
+    "fmla v11.4s, v17.4s, v0.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "str q10, [x22]\n"
+    "mov v12.16b, v14.16b\n"
+    "fmla v9.4s, v17.4s, v6.4s\n"
+    "ldr q4, [%[wbptr], #80]\n"
+    "str q11, [%[outptr0], %[output_col_stride1]]\n"
+    "mov v10.16b, v14.16b\n"
+    "mov v11.16b, v14.16b\n"
+    "add %[inptr0], %[inptr0], #16\n"
+    "fmla v9.4s, v19.4s, v1.4s\n"
+    "ldr q6, [%[wbptr], #48]\n"
+    "ldr q15, [%[inptr0]]\n"
+    "add x23, x23, #16\n"
+    "fmla v12.4s, v15.4s, v8.4s\n"
+    "ldr q20, [x23]\n"
+    "fmla v9.4s, v18.4s, v3.4s\n"
+    "ldr q1, [%[wbptr], #128]\n"
+    "ldr q13, [%[inptr0], %[input_col_stride1]]\n"
+    "add x24, x24, #16\n"
+    "fmla v12.4s, v20.4s, v5.4s\n"
+    "ldr q17, [x24]\n"
+    "fmla v9.4s, v16.4s, v0.4s\n"
+    "ldr q3, [%[wbptr], #96]\n"
+    "fmla v10.4s, v17.4s, v8.4s\n"
+    "ldr q16, [x23, %[input_col_stride1]]\n"
+    "fmla v12.4s, v13.4s, v7.4s\n"
+    "ldr q18, [%[inptr0], x19]\n"
+    "str q9, [x22, %[output_col_stride1]]\n"
+    "add x25, x25, #16\n"
+    "mov v9.16b, v14.16b\n"
+    "ldr q0, [%[wbptr], #144]\n"
+    "fmla v12.4s, v17.4s, v2.4s\n"
+    "ldr q14, [x25]\n"
+    "ldr q15, [x24, %[input_col_stride1]]\n"
+    "add x26, x26, #16\n"
+    "add %[outptr0], %[outptr0], #16\n"
+    "add x22, x22, #16\n"
+    "subs x28, x28, #1\n"
+    "fmla v12.4s, v16.4s, v4.4s\n"
+    "fmla v12.4s, v18.4s, v6.4s\n"
+    "bne 2b\n"
+    "3:\n"
+    "fmla v11.4s, v18.4s, v8.4s\n"
+    "ldr q19, [x23, x19]\n"
+    "fmla v10.4s, v14.4s, v5.4s\n"
+    "ldr q20, [%[inptr0], x20]\n"
+    "fmla v12.4s, v15.4s, v1.4s\n"
+    "ldr q14, [x26]\n"
+    "fmla v11.4s, v19.4s, v5.4s\n"
+    "ldr q13, [x25, %[input_col_stride1]]\n"
+    "fmla v10.4s, v15.4s, v7.4s\n"
+    "ldr q17, [x24, x19]\n"
+    "fmla v12.4s, v19.4s, v3.4s\n"
+    "ldr q19, [x23, x20]\n"
+    "fmla v11.4s, v20.4s, v7.4s\n"
+    "ldr q18, [%[inptr0], x21]\n"
+    "fmla v10.4s, v14.4s, v2.4s\n"
+    "ldr q16, [x26, %[input_col_stride1]]\n"
+    "fmla v12.4s, v17.4s, v0.4s\n"
+    "ldr q14, [x25, x19]\n"
+    "fmla v11.4s, v17.4s, v2.4s\n"
+    "ldr q15, [x24, x20]\n"
+    "fmla v10.4s, v13.4s, v4.4s\n"
+    "ldr q13, [x23, x21]\n"
+    "str q12, [%[outptr0]]\n"
+    "fmla v9.4s, v17.4s, v8.4s\n"
+    "fmla v11.4s, v19.4s, v4.4s\n"
+    "ldr q12, [x26, x19]\n"
+    "fmla v10.4s, v17.4s, v6.4s\n"
+    "ldr q20, [x25, x20]\n"
+    "fmla v9.4s, v14.4s, v5.4s\n"
+    "ldr q17, [x24, x21]\n"
+    "fmla v11.4s, v18.4s, v6.4s\n"
+    "ldr q19, [x26, x20]\n"
+    "fmla v10.4s, v16.4s, v1.4s\n"
+    "ldr q18, [x25, x21]\n"
+    "fmla v9.4s, v15.4s, v7.4s\n"
+    "ldr q16, [x26, x21]\n"
+    "fmla v11.4s, v15.4s, v1.4s\n"
+    "add %[wbptr], %[wbptr], #160\n"
+    "fmla v10.4s, v14.4s, v3.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v9.4s, v12.4s, v2.4s\n"
+    "add %[inptr0], %[inptr0], #16\n"
+    "fmla v11.4s, v13.4s, v3.4s\n"
+    "add x23, x23, #16\n"
+    "fmla v10.4s, v12.4s, v0.4s\n"
+    "add x24, x24, #16\n"
+    "fmla v9.4s, v20.4s, v4.4s\n"
+    "add x25, x25, #16\n"
+    "fmla v11.4s, v17.4s, v0.4s\n"
+    "add x26, x26, #16\n"
+    "str q10, [x22]\n"
+    "fmla v9.4s, v17.4s, v6.4s\n"
+    "str q11, [%[outptr0], %[output_col_stride1]]\n"
+    "add %[outptr0], %[outptr0], #16\n"
+    "fmla v9.4s, v19.4s, v1.4s\n"
+    "fmla v9.4s, v18.4s, v3.4s\n"
+    "fmla v9.4s, v16.4s, v0.4s\n"
+    "str q9, [x22, %[output_col_stride1]]\n"
+    "add x22, x22, #16\n"
+    "4:\n"
+    "cbz x27, 7f\n"
+    "ldr s14, [%[wbptr]]\n"
+    "mov v12.16b, v14.16b\n"
+    "ldr s8, [%[wbptr], #4]\n"
+    "mov v10.16b, v14.16b\n"
+    "ldr s7, [%[wbptr], #8]\n"
+    "mov v11.16b, v14.16b\n"
+    "ldr s6, [%[wbptr], #12]\n"
+    "mov v9.16b, v14.16b\n"
+    "ldr s5, [%[wbptr], #16]\n"
+    "ldr s4, [%[wbptr], #20]\n"
+    "subs x27, x27, #1\n"
+    "ldr s3, [%[wbptr], #24]\n"
+    "ldr s2, [%[wbptr], #28]\n"
+    "ldr s1, [%[wbptr], #32]\n"
+    "ldr s0, [%[wbptr], #36]\n"
+    "ldr s15, [%[inptr0]]\n"
+    "ldr s20, [x23]\n"
+    "fmla v12.4s, v15.4s, v8.4s\n"
+    "ldr s13, [%[inptr0], %[input_col_stride1]]\n"
+    "ldr s17, [x24]\n"
+    "ldr s16, [x23, %[input_col_stride1]]\n"
+    "fmla v10.4s, v17.4s, v8.4s\n"
+    "ldr s18, [%[inptr0], x19]\n"
+    "fmla v12.4s, v20.4s, v5.4s\n"
+    "ldr s14, [x25]\n"
+    "ldr s15, [x24, %[input_col_stride1]]\n"
+    "fmla v12.4s, v13.4s, v7.4s\n"
+    "fmla v12.4s, v17.4s, v2.4s\n"
+    "fmla v12.4s, v16.4s, v4.4s\n"
+    "fmla v12.4s, v18.4s, v6.4s\n"
+    "beq 6f\n"
+    "5:\n"
+    "fmla v11.4s, v18.4s, v8.4s\n"
+    "ldr s19, [x23, x19]\n"
+    "fmla v10.4s, v14.4s, v5.4s\n"
+    "ldr s20, [%[inptr0], x20]\n"
+    "fmla v12.4s, v15.4s, v1.4s\n"
+    "ldr s14, [x26]\n"
+    "fmla v11.4s, v19.4s, v5.4s\n"
+    "ldr s13, [x25, %[input_col_stride1]]\n"
+    "fmla v10.4s, v15.4s, v7.4s\n"
+    "ldr s17, [x24, x19]\n"
+    "fmla v12.4s, v19.4s, v3.4s\n"
+    "ldr s19, [x23, x20]\n"
+    "fmla v11.4s, v20.4s, v7.4s\n"
+    "ldr s18, [%[inptr0], x21]\n"
+    "fmla v10.4s, v14.4s, v2.4s\n"
+    "ldr s16, [x26, %[input_col_stride1]]\n"
+    "fmla v12.4s, v17.4s, v0.4s\n"
+    "ldr s14, [x25, x19]\n"
+    "fmla v11.4s, v17.4s, v2.4s\n"
+    "ldr s15, [x24, x20]\n"
+    "fmla v10.4s, v13.4s, v4.4s\n"
+    "ldr s13, [x23, x21]\n"
+    "str s12, [%[outptr0]]\n"
+    "fmla v9.4s, v17.4s, v8.4s\n"
+    "fmla v11.4s, v19.4s, v4.4s\n"
+    "ldr s12, [x26, x19]\n"
+    "fmla v10.4s, v17.4s, v6.4s\n"
+    "ldr s20, [x25, x20]\n"
+    "fmla v9.4s, v14.4s, v5.4s\n"
+    "ldr s17, [x24, x21]\n"
+    "fmla v11.4s, v18.4s, v6.4s\n"
+    "ldr s19, [x26, x20]\n"
+    "fmla v10.4s, v16.4s, v1.4s\n"
+    "ldr s18, [x25, x21]\n"
+    "fmla v9.4s, v15.4s, v7.4s\n"
+    "ldr s16, [x26, x21]\n"
+    "fmla v11.4s, v15.4s, v1.4s\n"
+    "add %[wbptr], %[wbptr], #40\n"
+    "fmla v10.4s, v14.4s, v3.4s\n"
+    "ldr s14, [%[wbptr]]\n"
+    "fmla v9.4s, v12.4s, v2.4s\n"
+    "ldr s8, [%[wbptr], #4]\n"
+    "fmla v11.4s, v13.4s, v3.4s\n"
+    "ldr s7, [%[wbptr], #8]\n"
+    "fmla v10.4s, v12.4s, v0.4s\n"
+    "ldr s5, [%[wbptr], #16]\n"
+    "fmla v9.4s, v20.4s, v4.4s\n"
+    "ldr s2, [%[wbptr], #28]\n"
+    "fmla v11.4s, v17.4s, v0.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "str s10, [x22]\n"
+    "mov v12.16b, v14.16b\n"
+    "fmla v9.4s, v17.4s, v6.4s\n"
+    "ldr s4, [%[wbptr], #20]\n"
+    "str s11, [%[outptr0], %[output_col_stride1]]\n"
+    "mov v10.16b, v14.16b\n"
+    "mov v11.16b, v14.16b\n"
+    "add %[inptr0], %[inptr0], #4\n"
+    "fmla v9.4s, v19.4s, v1.4s\n"
+    "ldr s6, [%[wbptr], #12]\n"
+    "ldr s15, [%[inptr0]]\n"
+    "add x23, x23, #4\n"
+    "fmla v12.4s, v15.4s, v8.4s\n"
+    "ldr s20, [x23]\n"
+    "fmla v9.4s, v18.4s, v3.4s\n"
+    "ldr s1, [%[wbptr], #32]\n"
+    "ldr s13, [%[inptr0], %[input_col_stride1]]\n"
+    "add x24, x24, #4\n"
+    "fmla v12.4s, v20.4s, v5.4s\n"
+    "ldr s17, [x24]\n"
+    "fmla v9.4s, v16.4s, v0.4s\n"
+    "ldr s3, [%[wbptr], #24]\n"
+    "fmla v10.4s, v17.4s, v8.4s\n"
+    "ldr s16, [x23, %[input_col_stride1]]\n"
+    "fmla v12.4s, v13.4s, v7.4s\n"
+    "ldr s18, [%[inptr0], x19]\n"
+    "str s9, [x22, %[output_col_stride1]]\n"
+    "add x25, x25, #4\n"
+    "mov v9.16b, v14.16b\n"
+    "ldr s0, [%[wbptr], #36]\n"
+    "fmla v12.4s, v17.4s, v2.4s\n"
+    "ldr s14, [x25]\n"
+    "ldr s15, [x24, %[input_col_stride1]]\n"
+    "add x26, x26, #4\n"
+    "add %[outptr0], %[outptr0], #4\n"
+    "add x22, x22, #4\n"
+    "subs x27, x27, #1\n"
+    "fmla v12.4s, v16.4s, v4.4s\n"
+    "fmla v12.4s, v18.4s, v6.4s\n"
+    "bne 5b\n"
+    "6:\n"
+    "fmla v11.4s, v18.4s, v8.4s\n"
+    "ldr s19, [x23, x19]\n"
+    "fmla v10.4s, v14.4s, v5.4s\n"
+    "ldr s20, [%[inptr0], x20]\n"
+    "fmla v12.4s, v15.4s, v1.4s\n"
+    "ldr s14, [x26]\n"
+    "fmla v11.4s, v19.4s, v5.4s\n"
+    "ldr s13, [x25, %[input_col_stride1]]\n"
+    "fmla v10.4s, v15.4s, v7.4s\n"
+    "ldr s17, [x24, x19]\n"
+    "fmla v12.4s, v19.4s, v3.4s\n"
+    "ldr s19, [x23, x20]\n"
+    "fmla v11.4s, v20.4s, v7.4s\n"
+    "ldr s18, [%[inptr0], x21]\n"
+    "fmla v10.4s, v14.4s, v2.4s\n"
+    "ldr s16, [x26, %[input_col_stride1]]\n"
+    "fmla v12.4s, v17.4s, v0.4s\n"
+    "ldr s14, [x25, x19]\n"
+    "fmla v11.4s, v17.4s, v2.4s\n"
+    "ldr s15, [x24, x20]\n"
+    "fmla v10.4s, v13.4s, v4.4s\n"
+    "ldr s13, [x23, x21]\n"
+    "str s12, [%[outptr0]]\n"
+    "fmla v9.4s, v17.4s, v8.4s\n"
+    "fmla v11.4s, v19.4s, v4.4s\n"
+    "ldr s12, [x26, x19]\n"
+    "fmla v10.4s, v17.4s, v6.4s\n"
+    "ldr s20, [x25, x20]\n"
+    "fmla v9.4s, v14.4s, v5.4s\n"
+    "ldr s17, [x24, x21]\n"
+    "fmla v11.4s, v18.4s, v6.4s\n"
+    "ldr s19, [x26, x20]\n"
+    "fmla v10.4s, v16.4s, v1.4s\n"
+    "ldr s18, [x25, x21]\n"
+    "fmla v9.4s, v15.4s, v7.4s\n"
+    "ldr s16, [x26, x21]\n"
+    "fmla v11.4s, v15.4s, v1.4s\n"
+    "add %[wbptr], %[wbptr], #40\n"
+    "fmla v10.4s, v14.4s, v3.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v9.4s, v12.4s, v2.4s\n"
+    "add %[inptr0], %[inptr0], #4\n"
+    "fmla v11.4s, v13.4s, v3.4s\n"
+    "add x23, x23, #4\n"
+    "fmla v10.4s, v12.4s, v0.4s\n"
+    "add x24, x24, #4\n"
+    "fmla v9.4s, v20.4s, v4.4s\n"
+    "add x25, x25, #4\n"
+    "fmla v11.4s, v17.4s, v0.4s\n"
+    "add x26, x26, #4\n"
+    "str s10, [x22]\n"
+    "fmla v9.4s, v17.4s, v6.4s\n"
+    "str s11, [%[outptr0], %[output_col_stride1]]\n"
+    "add %[outptr0], %[outptr0], #4\n"
+    "fmla v9.4s, v19.4s, v1.4s\n"
+    "fmla v9.4s, v18.4s, v3.4s\n"
+    "fmla v9.4s, v16.4s, v0.4s\n"
+    "str s9, [x22, %[output_col_stride1]]\n"
+    "add x22, x22, #4\n"
+    "7:\n"
+    : [outptr0] "+r" (output), [inptr0] "+r" (input), [wbptr] "+r" (weight_bias_ptr)
+    : [n_channels] "r" ((long) n_channels), [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float))
+    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+  );
+}
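// Editorial sketch of the packed buffer the kernels walk through %[wbptr]:
// per group of four channels, one bias vector followed by the nine 3x3 kernel
// taps. The vector loop reads ten q-registers at offsets #0..#144 and steps
// 160 bytes; the scalar tail reads ten s-registers at #0..#36 and steps 40
// bytes. The struct and its field names are illustrative, not the library's.
struct PackedWeightsBias4  // one step of the four-channel loop
{
    float bias[4];        // ldr qN, [%[wbptr]]
    float weights[9][4];  // ldr qN, [%[wbptr], #16] ... [%[wbptr], #144]
};
static_assert(sizeof(PackedWeightsBias4) == 160,
              "matches \"add %[wbptr], %[wbptr], #160\"");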
 
 template <>
-const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
+template <>
+void Conv::execute_tile<ActivationFunction::None>(
+  int n_channels,
+  const void *weight_bias_ptr,
+  const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
+  float *outptrs[Base::output_tile_rows][Base::output_tile_cols]
+)
+{
+  __asm __volatile(
+    "mov x23, xzr\n"
+    "mov x24, xzr\n"
+    "and x25, %[n_channels], #3\n"
+    "lsr x26, %[n_channels], #2\n"
+    "cbz x26, 4f\n"
+    "1:\n"
+    "ldr q13, [%[wbptr]]\n"
+    "ldr x19, [%[inptrs], 0]\n"
+    "mov v10.16b, v13.16b\n"
+    "ldr q12, [%[wbptr], #16]\n"
+    "mov v8.16b, v13.16b\n"
+    "ldr q6, [%[wbptr], #32]\n"
+    "mov v9.16b, v13.16b\n"
+    "ldr q5, [%[wbptr], #48]\n"
+    "mov v7.16b, v13.16b\n"
+    "ldr q11, [%[wbptr], #64]\n"
+    "ldr q4, [%[wbptr], #80]\n"
+    "ldr x20, [%[inptrs], 40]\n"
+    "ldr q3, [%[wbptr], #96]\n"
+    "ldr x21, [%[inptrs], 80]\n"
+    "ldr q2, [%[wbptr], #112]\n"
+    "ldr x27, [%[inptrs], 120]\n"
+    "ldr q1, [%[wbptr], #128]\n"
+    "subs x26, x26, #1\n"
+    "ldr q0, [%[wbptr], #144]\n"
+    "ldr q14, [x19, x23]\n"
+    "fmla v10.4s, v14.4s, v12.4s\n"
+    "ldr q18, [x20, x23]\n"
+    "ldr q14, [x21, x23]\n"
+    "ldr x19, [%[inptrs], 8]\n"
+    "ldr q16, [x27, x23]\n"
+    "ldr x20, [%[inptrs], 48]\n"
+    "ldr q19, [x19, x23]\n"
+    "ldr x21, [%[inptrs], 88]\n"
+    "fmla v10.4s, v18.4s, v11.4s\n"
+    "ldr q15, [x20, x23]\n"
+    "ldr q18, [x21, x23]\n"
+    "ldr x19, [%[inptrs], 16]\n"
+    "ldr q13, [x19, x23]\n"
+    "fmla v10.4s, v19.4s, v6.4s\n"
+    "fmla v10.4s, v14.4s, v2.4s\n"
+    "beq 3f\n"
+    "2:\n"
+    "fmla v8.4s, v14.4s, v12.4s\n"
+    "ldr x20, [%[inptrs], 56]\n"
+    "fmla v10.4s, v15.4s, v4.4s\n"
+    "ldr x19, [%[inptrs], 24]\n"
+    "fmla v9.4s, v13.4s, v12.4s\n"
+    "ldr q14, [x20, x23]\n"
+    "ldr q17, [x19, x23]\n"
+    "ldr x22, [%[inptrs], 160]\n"
+    "fmla v8.4s, v16.4s, v11.4s\n"
+    "ldr x27, [%[inptrs], 128]\n"
+    "fmla v10.4s, v13.4s, v5.4s\n"
+    "ldr q15, [x22, x23]\n"
+    "fmla v9.4s, v14.4s, v11.4s\n"
+    "ldr q19, [x27, x23]\n"
+    "ldr x21, [%[inptrs], 96]\n"
+    "ldr x20, [%[inptrs], 64]\n"
+    "ldr x19, [%[inptrs], 32]\n"
+    "fmla v8.4s, v18.4s, v6.4s\n"
+    "ldr x22, [%[inptrs], 168]\n"
+    "fmla v10.4s, v18.4s, v1.4s\n"
+    "ldr q13, [x21, x23]\n"
+    "fmla v9.4s, v17.4s, v6.4s\n"
+    "ldr q18, [x20, x23]\n"
+    "fmla v7.4s, v13.4s, v12.4s\n"
+    "ldr q17, [x19, x23]\n"
+    "fmla v8.4s, v15.4s, v2.4s\n"
+    "ldr q15, [x22, x23]\n"
+    "fmla v10.4s, v14.4s, v3.4s\n"
+    "ldr x27, [%[inptrs], 136]\n"
+    "fmla v9.4s, v13.4s, v2.4s\n"
+    "ldr x21, [%[inptrs], 104]\n"
+    "ldr q16, [x27, x23]\n"
+    "ldr x20, [%[inptrs], 72]\n"
+    "fmla v8.4s, v19.4s, v4.4s\n"
+    "ldr q19, [x21, x23]\n"
+    "fmla v10.4s, v13.4s, v0.4s\n"
+    "ldr q12, [x20, x23]\n"
+    "fmla v9.4s, v18.4s, v4.4s\n"
+    "ldr x22, [%[inptrs], 176]\n"
+    "fmla v7.4s, v16.4s, v11.4s\n"
+    "ldr x27, [%[inptrs], 144]\n"
+    "fmla v8.4s, v13.4s, v5.4s\n"
+    "ldr q11, [x22, x23]\n"
+    "ldr q13, [x27, x23]\n"
+    "ldr x21, [%[inptrs], 112]\n"
+    "fmla v9.4s, v17.4s, v5.4s\n"
+    "ldr x22, [%[inptrs], 184]\n"
+    "fmla v7.4s, v19.4s, v6.4s\n"
+    "ldr q14, [x21, x23]\n"
+    "fmla v8.4s, v15.4s, v1.4s\n"
+    "ldr q17, [x22, x23]\n"
+    "ldr x27, [%[inptrs], 152]\n"
+    "ldr x22, [%[inptrs], 192]\n"
+    "ldr x21, [%[outptrs], 0]\n"
+    "fmla v9.4s, v19.4s, v1.4s\n"
+    "ldr x28, [%[outptrs], 16]\n"
+    "str q10, [x21, x24]\n"
+    "fmla v7.4s, v11.4s, v2.4s\n"
+    "fmla v8.4s, v16.4s, v3.4s\n"
+    "ldr q16, [x27, x23]\n"
+    "ldr q15, [x22, x23]\n"
+    "ldr x21, [%[outptrs], 8]\n"
+    "fmla v9.4s, v12.4s, v3.4s\n"
+    "add %[wbptr], %[wbptr], #160\n"
+    "fmla v7.4s, v13.4s, v4.4s\n"
+    "ldr q13, [%[wbptr]]\n"
+    "fmla v8.4s, v11.4s, v0.4s\n"
+    "ldr q12, [%[wbptr], #16]\n"
+    "mov v10.16b, v13.16b\n"
+    "ldr q6, [%[wbptr], #32]\n"
+    "fmla v9.4s, v14.4s, v0.4s\n"
+    "ldr q11, [%[wbptr], #64]\n"
+    "fmla v7.4s, v14.4s, v5.4s\n"
+    "ldr q4, [%[wbptr], #80]\n"
+    "str q8, [x28, x24]\n"
+    "add x23, x23, #16\n"
+    "mov v8.16b, v13.16b\n"
+    "ldr q2, [%[wbptr], #112]\n"
+    "str q9, [x21, x24]\n"
+    "ldr x28, [%[outptrs], 24]\n"
+    "fmla v7.4s, v17.4s, v1.4s\n"
+    "ldr q5, [%[wbptr], #48]\n"
+    "mov v9.16b, v13.16b\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "ldr x19, [%[inptrs], 0]\n"
+    "ldr x20, [%[inptrs], 40]\n"
+    "ldr x21, [%[inptrs], 80]\n"
+    "ldr x27, [%[inptrs], 120]\n"
+    "subs x26, x26, #1\n"
+    "fmla v7.4s, v16.4s, v3.4s\n"
+    "ldr q1, [%[wbptr], #128]\n"
+    "ldr q14, [x19, x23]\n"
+    "fmla v10.4s, v14.4s, v12.4s\n"
+    "ldr q18, [x20, x23]\n"
+    "ldr q14, [x21, x23]\n"
+    "ldr x19, [%[inptrs], 8]\n"
+    "fmla v7.4s, v15.4s, v0.4s\n"
+    "ldr q3, [%[wbptr], #96]\n"
+    "ldr q19, [x19, x23]\n"
+    "ldr x20, [%[inptrs], 48]\n"
+    "fmla v10.4s, v18.4s, v11.4s\n"
+    "ldr q16, [x27, x23]\n"
+    "ldr q15, [x20, x23]\n"
+    "ldr x19, [%[inptrs], 16]\n"
+    "str q7, [x28, x24]\n"
+    "ldr x21, [%[inptrs], 88]\n"
+    "mov v7.16b, v13.16b\n"
+    "ldr q0, [%[wbptr], #144]\n"
+    "fmla v10.4s, v19.4s, v6.4s\n"
+    "ldr q13, [x19, x23]\n"
+    "ldr q18, [x21, x23]\n"
+    "add x24, x24, #16\n"
+    "fmla v10.4s, v14.4s, v2.4s\n"
+    "bne 2b\n"
+    "3:\n"
+    "fmla v8.4s, v14.4s, v12.4s\n"
+    "ldr x20, [%[inptrs], 56]\n"
+    "fmla v10.4s, v15.4s, v4.4s\n"
+    "ldr x19, [%[inptrs], 24]\n"
+    "fmla v9.4s, v13.4s, v12.4s\n"
+    "ldr q14, [x20, x23]\n"
+    "ldr q17, [x19, x23]\n"
+    "ldr x22, [%[inptrs], 160]\n"
+    "fmla v8.4s, v16.4s, v11.4s\n"
+    "ldr x27, [%[inptrs], 128]\n"
+    "fmla v10.4s, v13.4s, v5.4s\n"
+    "ldr q15, [x22, x23]\n"
+    "fmla v9.4s, v14.4s, v11.4s\n"
+    "ldr q19, [x27, x23]\n"
+    "ldr x21, [%[inptrs], 96]\n"
+    "ldr x20, [%[inptrs], 64]\n"
+    "ldr x19, [%[inptrs], 32]\n"
+    "fmla v8.4s, v18.4s, v6.4s\n"
+    "ldr x22, [%[inptrs], 168]\n"
+    "fmla v10.4s, v18.4s, v1.4s\n"
+    "ldr q13, [x21, x23]\n"
+    "fmla v9.4s, v17.4s, v6.4s\n"
+    "ldr q18, [x20, x23]\n"
+    "fmla v7.4s, v13.4s, v12.4s\n"
+    "ldr q17, [x19, x23]\n"
+    "fmla v8.4s, v15.4s, v2.4s\n"
+    "ldr q15, [x22, x23]\n"
+    "fmla v10.4s, v14.4s, v3.4s\n"
+    "ldr x27, [%[inptrs], 136]\n"
+    "fmla v9.4s, v13.4s, v2.4s\n"
+    "ldr x21, [%[inptrs], 104]\n"
+    "ldr q16, [x27, x23]\n"
+    "ldr x20, [%[inptrs], 72]\n"
+    "fmla v8.4s, v19.4s, v4.4s\n"
+    "ldr q19, [x21, x23]\n"
+    "fmla v10.4s, v13.4s, v0.4s\n"
+    "ldr q12, [x20, x23]\n"
+    "fmla v9.4s, v18.4s, v4.4s\n"
+    "ldr x22, [%[inptrs], 176]\n"
+    "fmla v7.4s, v16.4s, v11.4s\n"
+    "ldr x27, [%[inptrs], 144]\n"
+    "fmla v8.4s, v13.4s, v5.4s\n"
+    "ldr q11, [x22, x23]\n"
+    "ldr q13, [x27, x23]\n"
+    "ldr x21, [%[inptrs], 112]\n"
+    "fmla v9.4s, v17.4s, v5.4s\n"
+    "ldr x22, [%[inptrs], 184]\n"
+    "fmla v7.4s, v19.4s, v6.4s\n"
+    "ldr q14, [x21, x23]\n"
+    "fmla v8.4s, v15.4s, v1.4s\n"
+    "ldr q17, [x22, x23]\n"
+    "ldr x27, [%[inptrs], 152]\n"
+    "ldr x22, [%[inptrs], 192]\n"
+    "ldr x21, [%[outptrs], 0]\n"
+    "fmla v9.4s, v19.4s, v1.4s\n"
+    "ldr x28, [%[outptrs], 16]\n"
+    "str q10, [x21, x24]\n"
+    "fmla v7.4s, v11.4s, v2.4s\n"
+    "fmla v8.4s, v16.4s, v3.4s\n"
+    "ldr q16, [x27, x23]\n"
+    "ldr q15, [x22, x23]\n"
+    "ldr x21, [%[outptrs], 8]\n"
+    "fmla v9.4s, v12.4s, v3.4s\n"
+    "add %[wbptr], %[wbptr], #160\n"
+    "fmla v7.4s, v13.4s, v4.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v8.4s, v11.4s, v0.4s\n"
+    "add x23, x23, #16\n"
+    "fmla v9.4s, v14.4s, v0.4s\n"
+    "fmla v7.4s, v14.4s, v5.4s\n"
+    "str q8, [x28, x24]\n"
+    "ldr x28, [%[outptrs], 24]\n"
+    "str q9, [x21, x24]\n"
+    "fmla v7.4s, v17.4s, v1.4s\n"
+    "fmla v7.4s, v16.4s, v3.4s\n"
+    "fmla v7.4s, v15.4s, v0.4s\n"
+    "str q7, [x28, x24]\n"
+    "add x24, x24, #16\n"
+    "4:\n"
+    "cbz x25, 7f\n"
+    "ldr s13, [%[wbptr]]\n"
+    "mov v10.16b, v13.16b\n"
+    "ldr s12, [%[wbptr], #4]\n"
+    "mov v8.16b, v13.16b\n"
+    "ldr s6, [%[wbptr], #8]\n"
+    "mov v9.16b, v13.16b\n"
+    "ldr s5, [%[wbptr], #12]\n"
+    "mov v7.16b, v13.16b\n"
+    "ldr s11, [%[wbptr], #16]\n"
+    "ldr s4, [%[wbptr], #20]\n"
+    "ldr x19, [%[inptrs], 0]\n"
+    "ldr s3, [%[wbptr], #24]\n"
+    "ldr x20, [%[inptrs], 40]\n"
+    "ldr s2, [%[wbptr], #28]\n"
+    "ldr x21, [%[inptrs], 80]\n"
+    "ldr s1, [%[wbptr], #32]\n"
+    "ldr x27, [%[inptrs], 120]\n"
+    "ldr s0, [%[wbptr], #36]\n"
+    "subs x25, x25, #1\n"
+    "ldr s14, [x19, x23]\n"
+    "ldr s18, [x20, x23]\n"
+    "fmla v10.4s, v14.4s, v12.4s\n"
+    "ldr s14, [x21, x23]\n"
+    "ldr s16, [x27, x23]\n"
+    "ldr x19, [%[inptrs], 8]\n"
+    "ldr x20, [%[inptrs], 48]\n"
+    "ldr x21, [%[inptrs], 88]\n"
+    "ldr s19, [x19, x23]\n"
+    "fmla v10.4s, v18.4s, v11.4s\n"
+    "ldr s15, [x20, x23]\n"
+    "ldr s18, [x21, x23]\n"
+    "ldr x19, [%[inptrs], 16]\n"
+    "ldr s13, [x19, x23]\n"
+    "fmla v10.4s, v19.4s, v6.4s\n"
+    "fmla v10.4s, v14.4s, v2.4s\n"
+    "beq 6f\n"
+    "5:\n"
+    "fmla v8.4s, v14.4s, v12.4s\n"
+    "ldr x20, [%[inptrs], 56]\n"
+    "fmla v10.4s, v15.4s, v4.4s\n"
+    "ldr x19, [%[inptrs], 24]\n"
+    "fmla v9.4s, v13.4s, v12.4s\n"
+    "ldr s14, [x20, x23]\n"
+    "ldr s17, [x19, x23]\n"
+    "ldr x22, [%[inptrs], 160]\n"
+    "fmla v8.4s, v16.4s, v11.4s\n"
+    "ldr x27, [%[inptrs], 128]\n"
+    "fmla v10.4s, v13.4s, v5.4s\n"
+    "ldr s15, [x22, x23]\n"
+    "fmla v9.4s, v14.4s, v11.4s\n"
+    "ldr s19, [x27, x23]\n"
+    "ldr x21, [%[inptrs], 96]\n"
+    "ldr x20, [%[inptrs], 64]\n"
+    "ldr x19, [%[inptrs], 32]\n"
+    "fmla v8.4s, v18.4s, v6.4s\n"
+    "ldr x22, [%[inptrs], 168]\n"
+    "fmla v10.4s, v18.4s, v1.4s\n"
+    "ldr s13, [x21, x23]\n"
+    "fmla v9.4s, v17.4s, v6.4s\n"
+    "ldr s18, [x20, x23]\n"
+    "fmla v7.4s, v13.4s, v12.4s\n"
+    "ldr s17, [x19, x23]\n"
+    "fmla v8.4s, v15.4s, v2.4s\n"
+    "ldr s15, [x22, x23]\n"
+    "fmla v10.4s, v14.4s, v3.4s\n"
+    "ldr x27, [%[inptrs], 136]\n"
+    "fmla v9.4s, v13.4s, v2.4s\n"
+    "ldr x21, [%[inptrs], 104]\n"
+    "ldr s16, [x27, x23]\n"
+    "ldr x20, [%[inptrs], 72]\n"
+    "fmla v8.4s, v19.4s, v4.4s\n"
+    "ldr s19, [x21, x23]\n"
+    "fmla v10.4s, v13.4s, v0.4s\n"
+    "ldr s12, [x20, x23]\n"
+    "fmla v9.4s, v18.4s, v4.4s\n"
+    "ldr x22, [%[inptrs], 176]\n"
+    "fmla v7.4s, v16.4s, v11.4s\n"
+    "ldr x27, [%[inptrs], 144]\n"
+    "fmla v8.4s, v13.4s, v5.4s\n"
+    "ldr s11, [x22, x23]\n"
+    "ldr s13, [x27, x23]\n"
+    "ldr x21, [%[inptrs], 112]\n"
+    "fmla v9.4s, v17.4s, v5.4s\n"
+    "ldr x22, [%[inptrs], 184]\n"
+    "fmla v7.4s, v19.4s, v6.4s\n"
+    "ldr s14, [x21, x23]\n"
+    "fmla v8.4s, v15.4s, v1.4s\n"
+    "ldr s17, [x22, x23]\n"
+    "ldr x27, [%[inptrs], 152]\n"
+    "ldr x22, [%[inptrs], 192]\n"
+    "ldr x21, [%[outptrs], 0]\n"
+    "fmla v9.4s, v19.4s, v1.4s\n"
+    "ldr x28, [%[outptrs], 16]\n"
+    "str s10, [x21, x24]\n"
+    "fmla v7.4s, v11.4s, v2.4s\n"
+    "fmla v8.4s, v16.4s, v3.4s\n"
+    "ldr s16, [x27, x23]\n"
+    "ldr s15, [x22, x23]\n"
+    "ldr x21, [%[outptrs], 8]\n"
+    "fmla v9.4s, v12.4s, v3.4s\n"
+    "add %[wbptr], %[wbptr], #40\n"
+    "fmla v7.4s, v13.4s, v4.4s\n"
+    "ldr s13, [%[wbptr]]\n"
+    "fmla v8.4s, v11.4s, v0.4s\n"
+    "ldr s12, [%[wbptr], #4]\n"
+    "mov v10.16b, v13.16b\n"
+    "ldr s6, [%[wbptr], #8]\n"
+    "fmla v9.4s, v14.4s, v0.4s\n"
+    "ldr s11, [%[wbptr], #16]\n"
+    "fmla v7.4s, v14.4s, v5.4s\n"
+    "ldr s4, [%[wbptr], #20]\n"
+    "str s8, [x28, x24]\n"
+    "add x23, x23, #4\n"
+    "mov v8.16b, v13.16b\n"
+    "ldr s2, [%[wbptr], #28]\n"
+    "str s9, [x21, x24]\n"
+    "ldr x28, [%[outptrs], 24]\n"
+    "fmla v7.4s, v17.4s, v1.4s\n"
+    "ldr s5, [%[wbptr], #12]\n"
+    "mov v9.16b, v13.16b\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "ldr x19, [%[inptrs], 0]\n"
+    "ldr x20, [%[inptrs], 40]\n"
+    "ldr x21, [%[inptrs], 80]\n"
+    "ldr x27, [%[inptrs], 120]\n"
+    "subs x25, x25, #1\n"
+    "fmla v7.4s, v16.4s, v3.4s\n"
+    "ldr s1, [%[wbptr], #32]\n"
+    "ldr s14, [x19, x23]\n"
+    "fmla v10.4s, v14.4s, v12.4s\n"
+    "ldr s18, [x20, x23]\n"
+    "ldr s14, [x21, x23]\n"
+    "ldr x19, [%[inptrs], 8]\n"
+    "fmla v7.4s, v15.4s, v0.4s\n"
+    "ldr s3, [%[wbptr], #24]\n"
+    "ldr s19, [x19, x23]\n"
+    "ldr x20, [%[inptrs], 48]\n"
+    "fmla v10.4s, v18.4s, v11.4s\n"
+    "ldr s16, [x27, x23]\n"
+    "ldr s15, [x20, x23]\n"
+    "ldr x19, [%[inptrs], 16]\n"
+    "str s7, [x28, x24]\n"
+    "ldr x21, [%[inptrs], 88]\n"
+    "mov v7.16b, v13.16b\n"
+    "ldr s0, [%[wbptr], #36]\n"
+    "fmla v10.4s, v19.4s, v6.4s\n"
+    "ldr s13, [x19, x23]\n"
+    "ldr s18, [x21, x23]\n"
+    "add x24, x24, #4\n"
+    "fmla v10.4s, v14.4s, v2.4s\n"
+    "bne 5b\n"
+    "6:\n"
+    "fmla v8.4s, v14.4s, v12.4s\n"
+    "ldr x20, [%[inptrs], 56]\n"
+    "fmla v10.4s, v15.4s, v4.4s\n"
+    "ldr x19, [%[inptrs], 24]\n"
+    "fmla v9.4s, v13.4s, v12.4s\n"
+    "ldr s14, [x20, x23]\n"
+    "ldr s17, [x19, x23]\n"
+    "ldr x22, [%[inptrs], 160]\n"
+    "fmla v8.4s, v16.4s, v11.4s\n"
+    "ldr x27, [%[inptrs], 128]\n"
+    "fmla v10.4s, v13.4s, v5.4s\n"
+    "ldr s15, [x22, x23]\n"
+    "fmla v9.4s, v14.4s, v11.4s\n"
+    "ldr s19, [x27, x23]\n"
+    "ldr x21, [%[inptrs], 96]\n"
+    "ldr x20, [%[inptrs], 64]\n"
+    "ldr x19, [%[inptrs], 32]\n"
+    "fmla v8.4s, v18.4s, v6.4s\n"
+    "ldr x22, [%[inptrs], 168]\n"
+    "fmla v10.4s, v18.4s, v1.4s\n"
+    "ldr s13, [x21, x23]\n"
+    "fmla v9.4s, v17.4s, v6.4s\n"
+    "ldr s18, [x20, x23]\n"
+    "fmla v7.4s, v13.4s, v12.4s\n"
+    "ldr s17, [x19, x23]\n"
+    "fmla v8.4s, v15.4s, v2.4s\n"
+    "ldr s15, [x22, x23]\n"
+    "fmla v10.4s, v14.4s, v3.4s\n"
+    "ldr x27, [%[inptrs], 136]\n"
+    "fmla v9.4s, v13.4s, v2.4s\n"
+    "ldr x21, [%[inptrs], 104]\n"
+    "ldr s16, [x27, x23]\n"
+    "ldr x20, [%[inptrs], 72]\n"
+    "fmla v8.4s, v19.4s, v4.4s\n"
+    "ldr s19, [x21, x23]\n"
+    "fmla v10.4s, v13.4s, v0.4s\n"
+    "ldr s12, [x20, x23]\n"
+    "fmla v9.4s, v18.4s, v4.4s\n"
+    "ldr x22, [%[inptrs], 176]\n"
+    "fmla v7.4s, v16.4s, v11.4s\n"
+    "ldr x27, [%[inptrs], 144]\n"
+    "fmla v8.4s, v13.4s, v5.4s\n"
+    "ldr s11, [x22, x23]\n"
+    "ldr s13, [x27, x23]\n"
+    "ldr x21, [%[inptrs], 112]\n"
+    "fmla v9.4s, v17.4s, v5.4s\n"
+    "ldr x22, [%[inptrs], 184]\n"
+    "fmla v7.4s, v19.4s, v6.4s\n"
+    "ldr s14, [x21, x23]\n"
+    "fmla v8.4s, v15.4s, v1.4s\n"
+    "ldr s17, [x22, x23]\n"
+    "ldr x27, [%[inptrs], 152]\n"
+    "ldr x22, [%[inptrs], 192]\n"
+    "ldr x21, [%[outptrs], 0]\n"
+    "fmla v9.4s, v19.4s, v1.4s\n"
+    "ldr x28, [%[outptrs], 16]\n"
+    "str s10, [x21, x24]\n"
+    "fmla v7.4s, v11.4s, v2.4s\n"
+    "fmla v8.4s, v16.4s, v3.4s\n"
+    "ldr s16, [x27, x23]\n"
+    "ldr s15, [x22, x23]\n"
+    "ldr x21, [%[outptrs], 8]\n"
+    "fmla v9.4s, v12.4s, v3.4s\n"
+    "add %[wbptr], %[wbptr], #40\n"
+    "fmla v7.4s, v13.4s, v4.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v8.4s, v11.4s, v0.4s\n"
+    "add x23, x23, #4\n"
+    "fmla v9.4s, v14.4s, v0.4s\n"
+    "fmla v7.4s, v14.4s, v5.4s\n"
+    "str s8, [x28, x24]\n"
+    "ldr x28, [%[outptrs], 24]\n"
+    "str s9, [x21, x24]\n"
+    "fmla v7.4s, v17.4s, v1.4s\n"
+    "fmla v7.4s, v16.4s, v3.4s\n"
+    "fmla v7.4s, v15.4s, v0.4s\n"
+    "str s7, [x28, x24]\n"
+    "add x24, x24, #4\n"
+    "7:\n"
+    : [wbptr] "+r" (weight_bias_ptr)
+    : [inptrs] "r" (inptrs), [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs)
+    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+  );
+}
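+// Unlike the strided overload, this form receives one pointer per input and
+// output tile element, which lets a caller redirect out-of-bounds (padded)
+// elements to a shared zero buffer instead of specializing per padding case.
+// A hedged caller-side sketch; in_bounds() and element_ptr() are
+// hypothetical helpers, not library API:
+//
+//   std::vector<float> zeros(n_channels, 0.0f);  // read for padded elements
+//   const float *inptrs[5][5];
+//   for (int i = 0; i < 5; i++)
+//     for (int j = 0; j < 5; j++)
+//       inptrs[i][j] = in_bounds(i, j) ? element_ptr(i, j) : zeros.data();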
 
 template <>
-const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
-  ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
-  ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
-};
+template <>
+void Conv::execute_tile<ActivationFunction::ReLU>(
+  int n_channels,
+  const void *weight_bias_ptr,
+  const float *input,
+  const unsigned int input_row_stride,
+  const unsigned int input_col_stride,
+  float *output,
+  const unsigned int output_row_stride,
+  const unsigned int output_col_stride
+)
+{
+  __asm __volatile(
+    "add x24, %[inptr0], %[input_row_stride]\n"
+    "add x27, %[input_col_stride1], %[input_col_stride1]\n"
+    "add x19, %[outptr0], %[output_row_stride]\n"
+    "add x25, x24, %[input_row_stride]\n"
+    "add x23, x27, %[input_col_stride1]\n"
+    "and x20, %[n_channels], #3\n"
+    "add x28, x25, %[input_row_stride]\n"
+    "add x22, x23, %[input_col_stride1]\n"
+    "lsr x21, %[n_channels], #2\n"
+    "add x26, x28, %[input_row_stride]\n"
+    "cbz x21, 4f\n"
+    "1:\n"
+    "ldr q16, [%[wbptr]]\n"
+    "subs x21, x21, #1\n"
+    "mov v3.16b, v16.16b\n"
+    "ldr q4, [%[wbptr], #16]\n"
+    "mov v1.16b, v16.16b\n"
+    "ldr q5, [%[wbptr], #32]\n"
+    "mov v2.16b, v16.16b\n"
+    "ldr q12, [%[wbptr], #48]\n"
+    "mov v0.16b, v16.16b\n"
+    "ldr q11, [%[wbptr], #64]\n"
+    "ldr q10, [%[wbptr], #80]\n"
+    "ldr q6, [%[wbptr], #96]\n"
+    "ldr q9, [%[wbptr], #112]\n"
+    "ldr q8, [%[wbptr], #128]\n"
+    "ldr q7, [%[wbptr], #144]\n"
+    "ldr q21, [%[inptr0]]\n"
+    "fmla v3.4s, v21.4s, v4.4s\n"
+    "ldr q23, [x24]\n"
+    "ldr q19, [%[inptr0], %[input_col_stride1]]\n"
+    "ldr q14, [x25]\n"
+    "fmla v1.4s, v14.4s, v4.4s\n"
+    "ldr q13, [x24, %[input_col_stride1]]\n"
+    "fmla v3.4s, v23.4s, v11.4s\n"
+    "ldr q18, [%[inptr0], x27]\n"
+    "ldr q15, [x28]\n"
+    "ldr q22, [x25, %[input_col_stride1]]\n"
+    "fmla v3.4s, v19.4s, v5.4s\n"
+    "fmla v3.4s, v14.4s, v9.4s\n"
+    "beq 3f\n"
+    "2:\n"
+    "fmla v3.4s, v13.4s, v10.4s\n"
+    "ldr q17, [x24, x27]\n"
+    "fmla v2.4s, v18.4s, v4.4s\n"
+    "ldr q20, [%[inptr0], x23]\n"
+    "fmla v1.4s, v15.4s, v11.4s\n"
+    "ldr q19, [x26]\n"
+    "fmla v3.4s, v18.4s, v12.4s\n"
+    "ldr q13, [x28, %[input_col_stride1]]\n"
+    "fmla v2.4s, v17.4s, v11.4s\n"
+    "ldr q14, [x25, x27]\n"
+    "fmla v1.4s, v22.4s, v5.4s\n"
+    "ldr q15, [x24, x23]\n"
+    "fmla v3.4s, v22.4s, v8.4s\n"
+    "ldr q16, [%[inptr0], x22]\n"
+    "fmla v2.4s, v20.4s, v5.4s\n"
+    "ldr q20, [x26, %[input_col_stride1]]\n"
+    "fmla v1.4s, v19.4s, v9.4s\n"
+    "ldr q19, [x28, x27]\n"
+    "fmla v3.4s, v17.4s, v6.4s\n"
+    "ldr q21, [x25, x23]\n"
+    "fmla v2.4s, v14.4s, v9.4s\n"
+    "ldr q22, [x24, x22]\n"
+    "fmla v1.4s, v13.4s, v10.4s\n"
+    "ldr q23, [x26, x27]\n"
+    "fmla v3.4s, v14.4s, v7.4s\n"
+    "ldr q18, [x28, x23]\n"
+    "fmla v0.4s, v14.4s, v4.4s\n"
+    "ldr q13, [x25, x22]\n"
+    "fmla v1.4s, v14.4s, v12.4s\n"
+    "ldr q14, [x26, x23]\n"
+    "fmla v2.4s, v15.4s, v10.4s\n"
+    "ldr q17, [x28, x22]\n"
+    "fmla v0.4s, v19.4s, v11.4s\n"
+    "ldr q15, [x26, x22]\n"
+    "fmla v1.4s, v20.4s, v8.4s\n"
+    "add %[wbptr], %[wbptr], #160\n"
+    "fmla v2.4s, v16.4s, v12.4s\n"
+    "ldr q16, [%[wbptr]]\n"
+    "fmla v0.4s, v21.4s, v5.4s\n"
+    "ldr q4, [%[wbptr], #16]\n"
+    "fmla v1.4s, v19.4s, v6.4s\n"
+    "ldr q11, [%[wbptr], #64]\n"
+    "fmla v2.4s, v21.4s, v8.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v0.4s, v23.4s, v9.4s\n"
+    "ldr q5, [%[wbptr], #32]\n"
+    "fmla v1.4s, v23.4s, v7.4s\n"
+    "add %[inptr0], %[inptr0], #16\n"
+    "fmla v2.4s, v22.4s, v6.4s\n"
+    "ldr q21, [%[inptr0]]\n"
+    "fmla v0.4s, v18.4s, v10.4s\n"
+    "ldr q9, [%[wbptr], #112]\n"
+    "movi v20.16b, #0\n"
+    "ldr q19, [%[inptr0], %[input_col_stride1]]\n"
+    "fmla v2.4s, v13.4s, v7.4s\n"
+    "ldr q18, [%[inptr0], x27]\n"
+    "fmla v0.4s, v13.4s, v12.4s\n"
+    "ldr q10, [%[wbptr], #80]\n"
+    "fmax v3.4s, v3.4s, v20.4s\n"
+    "add x24, x24, #16\n"
+    "fmax v2.4s, v2.4s, v20.4s\n"
+    "ldr q23, [x24]\n"
+    "str q3, [%[outptr0]]\n"
+    "fmla v0.4s, v14.4s, v8.4s\n"
+    "str q2, [%[outptr0], %[output_col_stride1]]\n"
+    "fmax v1.4s, v1.4s, v20.4s\n"
+    "mov v3.16b, v16.16b\n"
+    "ldr q12, [%[wbptr], #48]\n"
+    "str q1, [x19]\n"
+    "fmla v0.4s, v17.4s, v6.4s\n"
+    "mov v1.16b, v16.16b\n"
+    "ldr q8, [%[wbptr], #128]\n"
+    "mov v2.16b, v16.16b\n"
+    "ldr q13, [x24, %[input_col_stride1]]\n"
+    "fmla v0.4s, v15.4s, v7.4s\n"
+    "ldr q6, [%[wbptr], #96]\n"
+    "fmla v3.4s, v21.4s, v4.4s\n"
+    "add x25, x25, #16\n"
+    "ldr q14, [x25]\n"
+    "add x28, x28, #16\n"
+    "fmax v0.4s, v0.4s, v20.4s\n"
+    "ldr q7, [%[wbptr], #144]\n"
+    "fmla v3.4s, v23.4s, v11.4s\n"
+    "ldr q15, [x28]\n"
+    "str q0, [x19, %[output_col_stride1]]\n"
+    "fmla v1.4s, v14.4s, v4.4s\n"
+    "mov v0.16b, v16.16b\n"
+    "ldr q22, [x25, %[input_col_stride1]]\n"
+    "fmla v3.4s, v19.4s, v5.4s\n"
+    "add x26, x26, #16\n"
+    "add %[outptr0], %[outptr0], #16\n"
+    "add x19, x19, #16\n"
+    "subs x21, x21, #1\n"
+    "fmla v3.4s, v14.4s, v9.4s\n"
+    "bne 2b\n"
+    "3:\n"
+    "fmla v3.4s, v13.4s, v10.4s\n"
+    "ldr q17, [x24, x27]\n"
+    "fmla v2.4s, v18.4s, v4.4s\n"
+    "ldr q20, [%[inptr0], x23]\n"
+    "fmla v1.4s, v15.4s, v11.4s\n"
+    "ldr q19, [x26]\n"
+    "fmla v3.4s, v18.4s, v12.4s\n"
+    "ldr q13, [x28, %[input_col_stride1]]\n"
+    "fmla v2.4s, v17.4s, v11.4s\n"
+    "ldr q14, [x25, x27]\n"
+    "fmla v1.4s, v22.4s, v5.4s\n"
+    "ldr q15, [x24, x23]\n"
+    "fmla v3.4s, v22.4s, v8.4s\n"
+    "ldr q16, [%[inptr0], x22]\n"
+    "fmla v2.4s, v20.4s, v5.4s\n"
+    "ldr q20, [x26, %[input_col_stride1]]\n"
+    "fmla v1.4s, v19.4s, v9.4s\n"
+    "ldr q19, [x28, x27]\n"
+    "fmla v3.4s, v17.4s, v6.4s\n"
+    "ldr q21, [x25, x23]\n"
+    "fmla v2.4s, v14.4s, v9.4s\n"
+    "ldr q22, [x24, x22]\n"
+    "fmla v1.4s, v13.4s, v10.4s\n"
+    "ldr q23, [x26, x27]\n"
+    "fmla v3.4s, v14.4s, v7.4s\n"
+    "ldr q18, [x28, x23]\n"
+    "fmla v0.4s, v14.4s, v4.4s\n"
+    "ldr q13, [x25, x22]\n"
+    "fmla v1.4s, v14.4s, v12.4s\n"
+    "ldr q14, [x26, x23]\n"
+    "fmla v2.4s, v15.4s, v10.4s\n"
+    "ldr q17, [x28, x22]\n"
+    "fmla v0.4s, v19.4s, v11.4s\n"
+    "ldr q15, [x26, x22]\n"
+    "fmla v1.4s, v20.4s, v8.4s\n"
+    "add %[wbptr], %[wbptr], #160\n"
+    "fmla v2.4s, v16.4s, v12.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v0.4s, v21.4s, v5.4s\n"
+    "add %[inptr0], %[inptr0], #16\n"
+    "fmla v1.4s, v19.4s, v6.4s\n"
+    "add x24, x24, #16\n"
+    "fmla v2.4s, v21.4s, v8.4s\n"
+    "add x25, x25, #16\n"
+    "fmla v0.4s, v23.4s, v9.4s\n"
+    "add x28, x28, #16\n"
+    "fmla v1.4s, v23.4s, v7.4s\n"
+    "add x26, x26, #16\n"
+    "fmla v2.4s, v22.4s, v6.4s\n"
+    "movi v20.16b, #0\n"
+    "fmla v0.4s, v18.4s, v10.4s\n"
+    "fmax v3.4s, v3.4s, v20.4s\n"
+    "fmla v2.4s, v13.4s, v7.4s\n"
+    "fmax v1.4s, v1.4s, v20.4s\n"
+    "str q3, [%[outptr0]]\n"
+    "fmla v0.4s, v13.4s, v12.4s\n"
+    "str q1, [x19]\n"
+    "fmax v2.4s, v2.4s, v20.4s\n"
+    "fmla v0.4s, v14.4s, v8.4s\n"
+    "str q2, [%[outptr0], %[output_col_stride1]]\n"
+    "add %[outptr0], %[outptr0], #16\n"
+    "fmla v0.4s, v17.4s, v6.4s\n"
+    "fmla v0.4s, v15.4s, v7.4s\n"
+    "fmax v0.4s, v0.4s, v20.4s\n"
+    "str q0, [x19, %[output_col_stride1]]\n"
+    "add x19, x19, #16\n"
+    "4:\n"
+    "cbz x20, 7f\n"
+    "ldr s16, [%[wbptr]]\n"
+    "mov v3.16b, v16.16b\n"
+    "ldr s4, [%[wbptr], #4]\n"
+    "mov v1.16b, v16.16b\n"
+    "ldr s5, [%[wbptr], #8]\n"
+    "mov v2.16b, v16.16b\n"
+    "ldr s12, [%[wbptr], #12]\n"
+    "mov v0.16b, v16.16b\n"
+    "ldr s11, [%[wbptr], #16]\n"
+    "ldr s10, [%[wbptr], #20]\n"
+    "subs x20, x20, #1\n"
+    "ldr s6, [%[wbptr], #24]\n"
+    "ldr s9, [%[wbptr], #28]\n"
+    "ldr s8, [%[wbptr], #32]\n"
+    "ldr s7, [%[wbptr], #36]\n"
+    "ldr s21, [%[inptr0]]\n"
+    "ldr s23, [x24]\n"
+    "fmla v3.4s, v21.4s, v4.4s\n"
+    "ldr s19, [%[inptr0], %[input_col_stride1]]\n"
+    "ldr s14, [x25]\n"
+    "ldr s13, [x24, %[input_col_stride1]]\n"
+    "fmla v1.4s, v14.4s, v4.4s\n"
+    "ldr s18, [%[inptr0], x27]\n"
+    "fmla v3.4s, v23.4s, v11.4s\n"
+    "ldr s15, [x28]\n"
+    "ldr s22, [x25, %[input_col_stride1]]\n"
+    "fmla v3.4s, v19.4s, v5.4s\n"
+    "fmla v3.4s, v14.4s, v9.4s\n"
+    "beq 6f\n"
+    "5:\n"
+    "fmla v3.4s, v13.4s, v10.4s\n"
+    "ldr s17, [x24, x27]\n"
+    "fmla v2.4s, v18.4s, v4.4s\n"
+    "ldr s20, [%[inptr0], x23]\n"
+    "fmla v1.4s, v15.4s, v11.4s\n"
+    "ldr s19, [x26]\n"
+    "fmla v3.4s, v18.4s, v12.4s\n"
+    "ldr s13, [x28, %[input_col_stride1]]\n"
+    "fmla v2.4s, v17.4s, v11.4s\n"
+    "ldr s14, [x25, x27]\n"
+    "fmla v1.4s, v22.4s, v5.4s\n"
+    "ldr s15, [x24, x23]\n"
+    "fmla v3.4s, v22.4s, v8.4s\n"
+    "ldr s16, [%[inptr0], x22]\n"
+    "fmla v2.4s, v20.4s, v5.4s\n"
+    "ldr s20, [x26, %[input_col_stride1]]\n"
+    "fmla v1.4s, v19.4s, v9.4s\n"
+    "ldr s19, [x28, x27]\n"
+    "fmla v3.4s, v17.4s, v6.4s\n"
+    "ldr s21, [x25, x23]\n"
+    "fmla v2.4s, v14.4s, v9.4s\n"
+    "ldr s22, [x24, x22]\n"
+    "fmla v1.4s, v13.4s, v10.4s\n"
+    "ldr s23, [x26, x27]\n"
+    "fmla v3.4s, v14.4s, v7.4s\n"
+    "ldr s18, [x28, x23]\n"
+    "fmla v0.4s, v14.4s, v4.4s\n"
+    "ldr s13, [x25, x22]\n"
+    "fmla v1.4s, v14.4s, v12.4s\n"
+    "ldr s14, [x26, x23]\n"
+    "fmla v2.4s, v15.4s, v10.4s\n"
+    "ldr s17, [x28, x22]\n"
+    "fmla v0.4s, v19.4s, v11.4s\n"
+    "ldr s15, [x26, x22]\n"
+    "fmla v1.4s, v20.4s, v8.4s\n"
+    "add %[wbptr], %[wbptr], #40\n"
+    "fmla v2.4s, v16.4s, v12.4s\n"
+    "ldr s16, [%[wbptr]]\n"
+    "fmla v0.4s, v21.4s, v5.4s\n"
+    "ldr s4, [%[wbptr], #4]\n"
+    "fmla v1.4s, v19.4s, v6.4s\n"
+    "ldr s11, [%[wbptr], #16]\n"
+    "fmla v2.4s, v21.4s, v8.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v0.4s, v23.4s, v9.4s\n"
+    "ldr s5, [%[wbptr], #8]\n"
+    "fmla v1.4s, v23.4s, v7.4s\n"
+    "add %[inptr0], %[inptr0], #4\n"
+    "fmla v2.4s, v22.4s, v6.4s\n"
+    "ldr s21, [%[inptr0]]\n"
+    "fmla v0.4s, v18.4s, v10.4s\n"
+    "ldr s9, [%[wbptr], #28]\n"
+    "movi v20.16b, #0\n"
+    "ldr s19, [%[inptr0], %[input_col_stride1]]\n"
+    "fmla v2.4s, v13.4s, v7.4s\n"
+    "ldr s18, [%[inptr0], x27]\n"
+    "fmla v0.4s, v13.4s, v12.4s\n"
+    "ldr s10, [%[wbptr], #20]\n"
+    "fmax v3.4s, v3.4s, v20.4s\n"
+    "add x24, x24, #4\n"
+    "fmax v2.4s, v2.4s, v20.4s\n"
+    "ldr s23, [x24]\n"
+    "str s3, [%[outptr0]]\n"
+    "fmla v0.4s, v14.4s, v8.4s\n"
+    "str s2, [%[outptr0], %[output_col_stride1]]\n"
+    "fmax v1.4s, v1.4s, v20.4s\n"
+    "mov v3.16b, v16.16b\n"
+    "ldr s12, [%[wbptr], #12]\n"
+    "str s1, [x19]\n"
+    "fmla v0.4s, v17.4s, v6.4s\n"
+    "mov v1.16b, v16.16b\n"
+    "ldr s8, [%[wbptr], #32]\n"
+    "mov v2.16b, v16.16b\n"
+    "ldr s13, [x24, %[input_col_stride1]]\n"
+    "fmla v0.4s, v15.4s, v7.4s\n"
+    "ldr s6, [%[wbptr], #24]\n"
+    "fmla v3.4s, v21.4s, v4.4s\n"
+    "add x25, x25, #4\n"
+    "ldr s14, [x25]\n"
+    "add x28, x28, #4\n"
+    "fmax v0.4s, v0.4s, v20.4s\n"
+    "ldr s7, [%[wbptr], #36]\n"
+    "fmla v3.4s, v23.4s, v11.4s\n"
+    "ldr s15, [x28]\n"
+    "str s0, [x19, %[output_col_stride1]]\n"
+    "fmla v1.4s, v14.4s, v4.4s\n"
+    "mov v0.16b, v16.16b\n"
+    "ldr s22, [x25, %[input_col_stride1]]\n"
+    "fmla v3.4s, v19.4s, v5.4s\n"
+    "add x26, x26, #4\n"
+    "add %[outptr0], %[outptr0], #4\n"
+    "add x19, x19, #4\n"
+    "subs x20, x20, #1\n"
+    "fmla v3.4s, v14.4s, v9.4s\n"
+    "bne 5b\n"
+    "6:\n"
+    "fmla v3.4s, v13.4s, v10.4s\n"
+    "ldr s17, [x24, x27]\n"
+    "fmla v2.4s, v18.4s, v4.4s\n"
+    "ldr s20, [%[inptr0], x23]\n"
+    "fmla v1.4s, v15.4s, v11.4s\n"
+    "ldr s19, [x26]\n"
+    "fmla v3.4s, v18.4s, v12.4s\n"
+    "ldr s13, [x28, %[input_col_stride1]]\n"
+    "fmla v2.4s, v17.4s, v11.4s\n"
+    "ldr s14, [x25, x27]\n"
+    "fmla v1.4s, v22.4s, v5.4s\n"
+    "ldr s15, [x24, x23]\n"
+    "fmla v3.4s, v22.4s, v8.4s\n"
+    "ldr s16, [%[inptr0], x22]\n"
+    "fmla v2.4s, v20.4s, v5.4s\n"
+    "ldr s20, [x26, %[input_col_stride1]]\n"
+    "fmla v1.4s, v19.4s, v9.4s\n"
+    "ldr s19, [x28, x27]\n"
+    "fmla v3.4s, v17.4s, v6.4s\n"
+    "ldr s21, [x25, x23]\n"
+    "fmla v2.4s, v14.4s, v9.4s\n"
+    "ldr s22, [x24, x22]\n"
+    "fmla v1.4s, v13.4s, v10.4s\n"
+    "ldr s23, [x26, x27]\n"
+    "fmla v3.4s, v14.4s, v7.4s\n"
+    "ldr s18, [x28, x23]\n"
+    "fmla v0.4s, v14.4s, v4.4s\n"
+    "ldr s13, [x25, x22]\n"
+    "fmla v1.4s, v14.4s, v12.4s\n"
+    "ldr s14, [x26, x23]\n"
+    "fmla v2.4s, v15.4s, v10.4s\n"
+    "ldr s17, [x28, x22]\n"
+    "fmla v0.4s, v19.4s, v11.4s\n"
+    "ldr s15, [x26, x22]\n"
+    "fmla v1.4s, v20.4s, v8.4s\n"
+    "add %[wbptr], %[wbptr], #40\n"
+    "fmla v2.4s, v16.4s, v12.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v0.4s, v21.4s, v5.4s\n"
+    "add %[inptr0], %[inptr0], #4\n"
+    "fmla v1.4s, v19.4s, v6.4s\n"
+    "add x24, x24, #4\n"
+    "fmla v2.4s, v21.4s, v8.4s\n"
+    "add x25, x25, #4\n"
+    "fmla v0.4s, v23.4s, v9.4s\n"
+    "add x28, x28, #4\n"
+    "fmla v1.4s, v23.4s, v7.4s\n"
+    "add x26, x26, #4\n"
+    "fmla v2.4s, v22.4s, v6.4s\n"
+    "movi v20.16b, #0\n"
+    "fmla v0.4s, v18.4s, v10.4s\n"
+    "fmax v3.4s, v3.4s, v20.4s\n"
+    "fmla v2.4s, v13.4s, v7.4s\n"
+    "fmax v1.4s, v1.4s, v20.4s\n"
+    "str s3, [%[outptr0]]\n"
+    "fmla v0.4s, v13.4s, v12.4s\n"
+    "str s1, [x19]\n"
+    "fmax v2.4s, v2.4s, v20.4s\n"
+    "fmla v0.4s, v14.4s, v8.4s\n"
+    "str s2, [%[outptr0], %[output_col_stride1]]\n"
+    "add %[outptr0], %[outptr0], #4\n"
+    "fmla v0.4s, v17.4s, v6.4s\n"
+    "fmla v0.4s, v15.4s, v7.4s\n"
+    "fmax v0.4s, v0.4s, v20.4s\n"
+    "str s0, [x19, %[output_col_stride1]]\n"
+    "add x19, x19, #4\n"
+    "7:\n"
+    : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input)
+    : [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float))
+    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+  );
+}
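+// The ReLU specializations differ from ActivationFunction::None only in the
+// store path: a zeroed vector is materialised (movi v20.16b, #0) and every
+// accumulator is clamped with fmax before being written, i.e. per lane
+// out = std::max(acc, 0.0f).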
 
 template <>
-const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
-  ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
-  ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
-};
+template <>
+void Conv::execute_tile<ActivationFunction::ReLU>(
+  int n_channels,
+  const void *weight_bias_ptr,
+  const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
+  float *outptrs[Base::output_tile_rows][Base::output_tile_cols]
+)
+{
+  __asm __volatile(
+    "mov x22, xzr\n"
+    "mov x26, xzr\n"
+    "and x23, %[n_channels], #3\n"
+    "lsr x24, %[n_channels], #2\n"
+    "cbz x24, 4f\n"
+    "1:\n"
+    "ldr q14, [%[wbptr]]\n"
+    "ldr x19, [%[inptrs], 0]\n"
+    "mov v3.16b, v14.16b\n"
+    "ldr q13, [%[wbptr], #16]\n"
+    "mov v1.16b, v14.16b\n"
+    "ldr q11, [%[wbptr], #32]\n"
+    "mov v2.16b, v14.16b\n"
+    "ldr q4, [%[wbptr], #48]\n"
+    "mov v0.16b, v14.16b\n"
+    "ldr q12, [%[wbptr], #64]\n"
+    "ldr q9, [%[wbptr], #80]\n"
+    "ldr x20, [%[inptrs], 40]\n"
+    "ldr q8, [%[wbptr], #96]\n"
+    "ldr x21, [%[inptrs], 80]\n"
+    "ldr q7, [%[wbptr], #112]\n"
+    "ldr x25, [%[inptrs], 120]\n"
+    "ldr q6, [%[wbptr], #128]\n"
+    "subs x24, x24, #1\n"
+    "ldr q5, [%[wbptr], #144]\n"
+    "ldr q15, [x19, x22]\n"
+    "fmla v3.4s, v15.4s, v13.4s\n"
+    "ldr q17, [x20, x22]\n"
+    "ldr q16, [x21, x22]\n"
+    "ldr x19, [%[inptrs], 8]\n"
+    "ldr q15, [x25, x22]\n"
+    "ldr x20, [%[inptrs], 48]\n"
+    "ldr q10, [x19, x22]\n"
+    "ldr x21, [%[inptrs], 88]\n"
+    "fmla v3.4s, v17.4s, v12.4s\n"
+    "ldr q17, [x20, x22]\n"
+    "ldr q14, [x21, x22]\n"
+    "ldr x19, [%[inptrs], 16]\n"
+    "ldr q18, [x19, x22]\n"
+    "fmla v3.4s, v10.4s, v11.4s\n"
+    "fmla v3.4s, v16.4s, v7.4s\n"
+    "beq 3f\n"
+    "2:\n"
+    "fmla v1.4s, v16.4s, v13.4s\n"
+    "ldr x20, [%[inptrs], 56]\n"
+    "fmla v3.4s, v17.4s, v9.4s\n"
+    "ldr x19, [%[inptrs], 24]\n"
+    "fmla v2.4s, v18.4s, v13.4s\n"
+    "ldr q16, [x20, x22]\n"
+    "movi v10.16b, #0\n"
+    "ldr q17, [x19, x22]\n"
+    "fmla v1.4s, v15.4s, v12.4s\n"
+    "ldr x27, [%[inptrs], 160]\n"
+    "fmla v3.4s, v18.4s, v4.4s\n"
+    "ldr x25, [%[inptrs], 128]\n"
+    "fmla v2.4s, v16.4s, v12.4s\n"
+    "ldr q18, [x27, x22]\n"
+    "ldr q15, [x25, x22]\n"
+    "ldr x21, [%[inptrs], 96]\n"
+    "fmla v1.4s, v14.4s, v11.4s\n"
+    "ldr x20, [%[inptrs], 64]\n"
+    "fmla v3.4s, v14.4s, v6.4s\n"
+    "ldr q14, [x21, x22]\n"
+    "fmla v2.4s, v17.4s, v11.4s\n"
+    "ldr q17, [x20, x22]\n"
+    "fmla v0.4s, v14.4s, v13.4s\n"
+    "ldr x19, [%[inptrs], 32]\n"
+    "fmla v1.4s, v18.4s, v7.4s\n"
+    "ldr x27, [%[inptrs], 168]\n"
+    "fmla v3.4s, v16.4s, v8.4s\n"
+    "ldr q18, [x19, x22]\n"
+    "fmla v2.4s, v14.4s, v7.4s\n"
+    "ldr q13, [x27, x22]\n"
+    "ldr x25, [%[inptrs], 136]\n"
+    "ldr x21, [%[inptrs], 104]\n"
+    "ldr x20, [%[inptrs], 72]\n"
+    "fmla v1.4s, v15.4s, v9.4s\n"
+    "ldr x27, [%[inptrs], 176]\n"
+    "fmla v3.4s, v14.4s, v5.4s\n"
+    "ldr q16, [x25, x22]\n"
+    "fmla v2.4s, v17.4s, v9.4s\n"
+    "ldr q17, [x21, x22]\n"
+    "fmla v0.4s, v16.4s, v12.4s\n"
+    "ldr q12, [x20, x22]\n"
+    "fmla v1.4s, v14.4s, v4.4s\n"
+    "ldr q15, [x27, x22]\n"
+    "fmax v3.4s, v3.4s, v10.4s\n"
+    "ldr x25, [%[inptrs], 144]\n"
+    "fmla v2.4s, v18.4s, v4.4s\n"
+    "ldr x21, [%[inptrs], 112]\n"
+    "fmla v0.4s, v17.4s, v11.4s\n"
+    "ldr q14, [x25, x22]\n"
+    "fmla v1.4s, v13.4s, v6.4s\n"
+    "ldr q11, [x21, x22]\n"
+    "ldr x27, [%[inptrs], 184]\n"
+    "ldr x25, [%[inptrs], 152]\n"
+    "ldr x21, [%[outptrs], 0]\n"
+    "fmla v2.4s, v17.4s, v6.4s\n"
+    "ldr x28, [%[outptrs], 16]\n"
+    "str q3, [x21, x26]\n"
+    "fmla v0.4s, v15.4s, v7.4s\n"
+    "fmla v1.4s, v16.4s, v8.4s\n"
+    "ldr q18, [x27, x22]\n"
+    "ldr q17, [x25, x22]\n"
+    "ldr x27, [%[inptrs], 192]\n"
+    "fmla v2.4s, v12.4s, v8.4s\n"
+    "ldr x21, [%[outptrs], 8]\n"
+    "fmla v0.4s, v14.4s, v9.4s\n"
+    "ldr q16, [x27, x22]\n"
+    "fmla v1.4s, v15.4s, v5.4s\n"
+    "add %[wbptr], %[wbptr], #160\n"
+    "ldr q14, [%[wbptr]]\n"
+    "add x22, x22, #16\n"
+    "fmla v2.4s, v11.4s, v5.4s\n"
+    "ldr q13, [%[wbptr], #16]\n"
+    "fmla v0.4s, v11.4s, v4.4s\n"
+    "ldr q11, [%[wbptr], #32]\n"
+    "fmax v1.4s, v1.4s, v10.4s\n"
+    "ldr q12, [%[wbptr], #64]\n"
+    "mov v3.16b, v14.16b\n"
+    "ldr q9, [%[wbptr], #80]\n"
+    "fmax v2.4s, v2.4s, v10.4s\n"
+    "ldr q7, [%[wbptr], #112]\n"
+    "str q1, [x28, x26]\n"
+    "fmla v0.4s, v18.4s, v6.4s\n"
+    "mov v1.16b, v14.16b\n"
+    "ldr q4, [%[wbptr], #48]\n"
+    "str q2, [x21, x26]\n"
+    "ldr x28, [%[outptrs], 24]\n"
+    "mov v2.16b, v14.16b\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v0.4s, v17.4s, v8.4s\n"
+    "ldr q6, [%[wbptr], #128]\n"
+    "ldr x19, [%[inptrs], 0]\n"
+    "ldr x20, [%[inptrs], 40]\n"
+    "ldr x21, [%[inptrs], 80]\n"
+    "ldr x25, [%[inptrs], 120]\n"
+    "subs x24, x24, #1\n"
+    "ldr q15, [x19, x22]\n"
+    "fmla v0.4s, v16.4s, v5.4s\n"
+    "ldr q8, [%[wbptr], #96]\n"
+    "fmla v3.4s, v15.4s, v13.4s\n"
+    "ldr q17, [x20, x22]\n"
+    "ldr q16, [x21, x22]\n"
+    "ldr x19, [%[inptrs], 8]\n"
+    "ldr q15, [x25, x22]\n"
+    "ldr x20, [%[inptrs], 48]\n"
+    "fmax v0.4s, v0.4s, v10.4s\n"
+    "ldr q5, [%[wbptr], #144]\n"
+    "fmla v3.4s, v17.4s, v12.4s\n"
+    "ldr q10, [x19, x22]\n"
+    "ldr q17, [x20, x22]\n"
+    "ldr x19, [%[inptrs], 16]\n"
+    "str q0, [x28, x26]\n"
+    "ldr x21, [%[inptrs], 88]\n"
+    "mov v0.16b, v14.16b\n"
+    "ldr q18, [x19, x22]\n"
+    "fmla v3.4s, v10.4s, v11.4s\n"
+    "ldr q14, [x21, x22]\n"
+    "add x26, x26, #16\n"
+    "fmla v3.4s, v16.4s, v7.4s\n"
+    "bne 2b\n"
+    "3:\n"
+    "fmla v1.4s, v16.4s, v13.4s\n"
+    "ldr x20, [%[inptrs], 56]\n"
+    "fmla v3.4s, v17.4s, v9.4s\n"
+    "ldr x19, [%[inptrs], 24]\n"
+    "fmla v2.4s, v18.4s, v13.4s\n"
+    "ldr q16, [x20, x22]\n"
+    "movi v10.16b, #0\n"
+    "ldr q17, [x19, x22]\n"
+    "fmla v1.4s, v15.4s, v12.4s\n"
+    "ldr x27, [%[inptrs], 160]\n"
+    "fmla v3.4s, v18.4s, v4.4s\n"
+    "ldr x25, [%[inptrs], 128]\n"
+    "fmla v2.4s, v16.4s, v12.4s\n"
+    "ldr q18, [x27, x22]\n"
+    "ldr q15, [x25, x22]\n"
+    "ldr x21, [%[inptrs], 96]\n"
+    "fmla v1.4s, v14.4s, v11.4s\n"
+    "ldr x20, [%[inptrs], 64]\n"
+    "fmla v3.4s, v14.4s, v6.4s\n"
+    "ldr q14, [x21, x22]\n"
+    "fmla v2.4s, v17.4s, v11.4s\n"
+    "ldr q17, [x20, x22]\n"
+    "fmla v0.4s, v14.4s, v13.4s\n"
+    "ldr x19, [%[inptrs], 32]\n"
+    "fmla v1.4s, v18.4s, v7.4s\n"
+    "ldr x27, [%[inptrs], 168]\n"
+    "fmla v3.4s, v16.4s, v8.4s\n"
+    "ldr q18, [x19, x22]\n"
+    "fmla v2.4s, v14.4s, v7.4s\n"
+    "ldr q13, [x27, x22]\n"
+    "ldr x25, [%[inptrs], 136]\n"
+    "ldr x21, [%[inptrs], 104]\n"
+    "ldr x20, [%[inptrs], 72]\n"
+    "fmla v1.4s, v15.4s, v9.4s\n"
+    "ldr x27, [%[inptrs], 176]\n"
+    "fmla v3.4s, v14.4s, v5.4s\n"
+    "ldr q16, [x25, x22]\n"
+    "fmla v2.4s, v17.4s, v9.4s\n"
+    "ldr q17, [x21, x22]\n"
+    "fmla v0.4s, v16.4s, v12.4s\n"
+    "ldr q12, [x20, x22]\n"
+    "fmla v1.4s, v14.4s, v4.4s\n"
+    "ldr q15, [x27, x22]\n"
+    "fmax v3.4s, v3.4s, v10.4s\n"
+    "ldr x25, [%[inptrs], 144]\n"
+    "fmla v2.4s, v18.4s, v4.4s\n"
+    "ldr x21, [%[inptrs], 112]\n"
+    "fmla v0.4s, v17.4s, v11.4s\n"
+    "ldr q14, [x25, x22]\n"
+    "fmla v1.4s, v13.4s, v6.4s\n"
+    "ldr q11, [x21, x22]\n"
+    "ldr x27, [%[inptrs], 184]\n"
+    "ldr x25, [%[inptrs], 152]\n"
+    "ldr x21, [%[outptrs], 0]\n"
+    "fmla v2.4s, v17.4s, v6.4s\n"
+    "ldr x28, [%[outptrs], 16]\n"
+    "str q3, [x21, x26]\n"
+    "fmla v0.4s, v15.4s, v7.4s\n"
+    "fmla v1.4s, v16.4s, v8.4s\n"
+    "ldr q18, [x27, x22]\n"
+    "ldr q17, [x25, x22]\n"
+    "ldr x27, [%[inptrs], 192]\n"
+    "fmla v2.4s, v12.4s, v8.4s\n"
+    "ldr x21, [%[outptrs], 8]\n"
+    "fmla v0.4s, v14.4s, v9.4s\n"
+    "ldr q16, [x27, x22]\n"
+    "fmla v1.4s, v15.4s, v5.4s\n"
+    "add %[wbptr], %[wbptr], #160\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "add x22, x22, #16\n"
+    "fmla v2.4s, v11.4s, v5.4s\n"
+    "fmla v0.4s, v11.4s, v4.4s\n"
+    "fmax v1.4s, v1.4s, v10.4s\n"
+    "fmax v2.4s, v2.4s, v10.4s\n"
+    "str q1, [x28, x26]\n"
+    "fmla v0.4s, v18.4s, v6.4s\n"
+    "ldr x28, [%[outptrs], 24]\n"
+    "str q2, [x21, x26]\n"
+    "fmla v0.4s, v17.4s, v8.4s\n"
+    "fmla v0.4s, v16.4s, v5.4s\n"
+    "fmax v0.4s, v0.4s, v10.4s\n"
+    "str q0, [x28, x26]\n"
+    "add x26, x26, #16\n"
+    "4:\n"
+    "cbz x23, 7f\n"
+    "ldr s14, [%[wbptr]]\n"
+    "mov v3.16b, v14.16b\n"
+    "ldr s13, [%[wbptr], #4]\n"
+    "mov v1.16b, v14.16b\n"
+    "ldr s11, [%[wbptr], #8]\n"
+    "mov v2.16b, v14.16b\n"
+    "ldr s4, [%[wbptr], #12]\n"
+    "mov v0.16b, v14.16b\n"
+    "ldr s12, [%[wbptr], #16]\n"
+    "ldr s9, [%[wbptr], #20]\n"
+    "ldr x19, [%[inptrs], 0]\n"
+    "ldr s8, [%[wbptr], #24]\n"
+    "ldr x20, [%[inptrs], 40]\n"
+    "ldr s7, [%[wbptr], #28]\n"
+    "ldr x21, [%[inptrs], 80]\n"
+    "ldr s6, [%[wbptr], #32]\n"
+    "ldr x25, [%[inptrs], 120]\n"
+    "ldr s5, [%[wbptr], #36]\n"
+    "subs x23, x23, #1\n"
+    "ldr s15, [x19, x22]\n"
+    "ldr s17, [x20, x22]\n"
+    "fmla v3.4s, v15.4s, v13.4s\n"
+    "ldr s16, [x21, x22]\n"
+    "ldr s15, [x25, x22]\n"
+    "ldr x19, [%[inptrs], 8]\n"
+    "ldr x20, [%[inptrs], 48]\n"
+    "ldr x21, [%[inptrs], 88]\n"
+    "ldr s10, [x19, x22]\n"
+    "fmla v3.4s, v17.4s, v12.4s\n"
+    "ldr s17, [x20, x22]\n"
+    "ldr s14, [x21, x22]\n"
+    "ldr x19, [%[inptrs], 16]\n"
+    "ldr s18, [x19, x22]\n"
+    "fmla v3.4s, v10.4s, v11.4s\n"
+    "fmla v3.4s, v16.4s, v7.4s\n"
+    "beq 6f\n"
+    "5:\n"
+    "fmla v1.4s, v16.4s, v13.4s\n"
+    "ldr x20, [%[inptrs], 56]\n"
+    "fmla v3.4s, v17.4s, v9.4s\n"
+    "ldr x19, [%[inptrs], 24]\n"
+    "fmla v2.4s, v18.4s, v13.4s\n"
+    "ldr s16, [x20, x22]\n"
+    "movi v10.16b, #0\n"
+    "ldr s17, [x19, x22]\n"
+    "fmla v1.4s, v15.4s, v12.4s\n"
+    "ldr x27, [%[inptrs], 160]\n"
+    "fmla v3.4s, v18.4s, v4.4s\n"
+    "ldr x25, [%[inptrs], 128]\n"
+    "fmla v2.4s, v16.4s, v12.4s\n"
+    "ldr s18, [x27, x22]\n"
+    "ldr s15, [x25, x22]\n"
+    "ldr x21, [%[inptrs], 96]\n"
+    "fmla v1.4s, v14.4s, v11.4s\n"
+    "ldr x20, [%[inptrs], 64]\n"
+    "fmla v3.4s, v14.4s, v6.4s\n"
+    "ldr s14, [x21, x22]\n"
+    "fmla v2.4s, v17.4s, v11.4s\n"
+    "ldr s17, [x20, x22]\n"
+    "fmla v0.4s, v14.4s, v13.4s\n"
+    "ldr x19, [%[inptrs], 32]\n"
+    "fmla v1.4s, v18.4s, v7.4s\n"
+    "ldr x27, [%[inptrs], 168]\n"
+    "fmla v3.4s, v16.4s, v8.4s\n"
+    "ldr s18, [x19, x22]\n"
+    "fmla v2.4s, v14.4s, v7.4s\n"
+    "ldr s13, [x27, x22]\n"
+    "ldr x25, [%[inptrs], 136]\n"
+    "ldr x21, [%[inptrs], 104]\n"
+    "ldr x20, [%[inptrs], 72]\n"
+    "fmla v1.4s, v15.4s, v9.4s\n"
+    "ldr x27, [%[inptrs], 176]\n"
+    "fmla v3.4s, v14.4s, v5.4s\n"
+    "ldr s16, [x25, x22]\n"
+    "fmla v2.4s, v17.4s, v9.4s\n"
+    "ldr s17, [x21, x22]\n"
+    "fmla v0.4s, v16.4s, v12.4s\n"
+    "ldr s12, [x20, x22]\n"
+    "fmla v1.4s, v14.4s, v4.4s\n"
+    "ldr s15, [x27, x22]\n"
+    "fmax v3.4s, v3.4s, v10.4s\n"
+    "ldr x25, [%[inptrs], 144]\n"
+    "fmla v2.4s, v18.4s, v4.4s\n"
+    "ldr x21, [%[inptrs], 112]\n"
+    "fmla v0.4s, v17.4s, v11.4s\n"
+    "ldr s14, [x25, x22]\n"
+    "fmla v1.4s, v13.4s, v6.4s\n"
+    "ldr s11, [x21, x22]\n"
+    "ldr x27, [%[inptrs], 184]\n"
+    "ldr x25, [%[inptrs], 152]\n"
+    "ldr x21, [%[outptrs], 0]\n"
+    "fmla v2.4s, v17.4s, v6.4s\n"
+    "ldr x28, [%[outptrs], 16]\n"
+    "str s3, [x21, x26]\n"
+    "fmla v0.4s, v15.4s, v7.4s\n"
+    "fmla v1.4s, v16.4s, v8.4s\n"
+    "ldr s18, [x27, x22]\n"
+    "ldr s17, [x25, x22]\n"
+    "ldr x27, [%[inptrs], 192]\n"
+    "fmla v2.4s, v12.4s, v8.4s\n"
+    "ldr x21, [%[outptrs], 8]\n"
+    "fmla v0.4s, v14.4s, v9.4s\n"
+    "ldr s16, [x27, x22]\n"
+    "fmla v1.4s, v15.4s, v5.4s\n"
+    "add %[wbptr], %[wbptr], #40\n"
+    "ldr s14, [%[wbptr]]\n"
+    "add x22, x22, #4\n"
+    "fmla v2.4s, v11.4s, v5.4s\n"
+    "ldr s13, [%[wbptr], #4]\n"
+    "fmla v0.4s, v11.4s, v4.4s\n"
+    "ldr s11, [%[wbptr], #8]\n"
+    "fmax v1.4s, v1.4s, v10.4s\n"
+    "ldr s12, [%[wbptr], #16]\n"
+    "mov v3.16b, v14.16b\n"
+    "ldr s9, [%[wbptr], #20]\n"
+    "fmax v2.4s, v2.4s, v10.4s\n"
+    "ldr s7, [%[wbptr], #28]\n"
+    "str s1, [x28, x26]\n"
+    "fmla v0.4s, v18.4s, v6.4s\n"
+    "mov v1.16b, v14.16b\n"
+    "ldr s4, [%[wbptr], #12]\n"
+    "str s2, [x21, x26]\n"
+    "ldr x28, [%[outptrs], 24]\n"
+    "mov v2.16b, v14.16b\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v0.4s, v17.4s, v8.4s\n"
+    "ldr s6, [%[wbptr], #32]\n"
+    "ldr x19, [%[inptrs], 0]\n"
+    "ldr x20, [%[inptrs], 40]\n"
+    "ldr x21, [%[inptrs], 80]\n"
+    "ldr x25, [%[inptrs], 120]\n"
+    "subs x23, x23, #1\n"
+    "ldr s15, [x19, x22]\n"
+    "fmla v0.4s, v16.4s, v5.4s\n"
+    "ldr s8, [%[wbptr], #24]\n"
+    "fmla v3.4s, v15.4s, v13.4s\n"
+    "ldr s17, [x20, x22]\n"
+    "ldr s16, [x21, x22]\n"
+    "ldr x19, [%[inptrs], 8]\n"
+    "ldr s15, [x25, x22]\n"
+    "ldr x20, [%[inptrs], 48]\n"
+    "fmax v0.4s, v0.4s, v10.4s\n"
+    "ldr s5, [%[wbptr], #36]\n"
+    "fmla v3.4s, v17.4s, v12.4s\n"
+    "ldr s10, [x19, x22]\n"
+    "ldr s17, [x20, x22]\n"
+    "ldr x19, [%[inptrs], 16]\n"
+    "str s0, [x28, x26]\n"
+    "ldr x21, [%[inptrs], 88]\n"
+    "mov v0.16b, v14.16b\n"
+    "ldr s18, [x19, x22]\n"
+    "fmla v3.4s, v10.4s, v11.4s\n"
+    "ldr s14, [x21, x22]\n"
+    "add x26, x26, #4\n"
+    "fmla v3.4s, v16.4s, v7.4s\n"
+    "bne 5b\n"
+    "6:\n"
+    "fmla v1.4s, v16.4s, v13.4s\n"
+    "ldr x20, [%[inptrs], 56]\n"
+    "fmla v3.4s, v17.4s, v9.4s\n"
+    "ldr x19, [%[inptrs], 24]\n"
+    "fmla v2.4s, v18.4s, v13.4s\n"
+    "ldr s16, [x20, x22]\n"
+    "movi v10.16b, #0\n"
+    "ldr s17, [x19, x22]\n"
+    "fmla v1.4s, v15.4s, v12.4s\n"
+    "ldr x27, [%[inptrs], 160]\n"
+    "fmla v3.4s, v18.4s, v4.4s\n"
+    "ldr x25, [%[inptrs], 128]\n"
+    "fmla v2.4s, v16.4s, v12.4s\n"
+    "ldr s18, [x27, x22]\n"
+    "ldr s15, [x25, x22]\n"
+    "ldr x21, [%[inptrs], 96]\n"
+    "fmla v1.4s, v14.4s, v11.4s\n"
+    "ldr x20, [%[inptrs], 64]\n"
+    "fmla v3.4s, v14.4s, v6.4s\n"
+    "ldr s14, [x21, x22]\n"
+    "fmla v2.4s, v17.4s, v11.4s\n"
+    "ldr s17, [x20, x22]\n"
+    "fmla v0.4s, v14.4s, v13.4s\n"
+    "ldr x19, [%[inptrs], 32]\n"
+    "fmla v1.4s, v18.4s, v7.4s\n"
+    "ldr x27, [%[inptrs], 168]\n"
+    "fmla v3.4s, v16.4s, v8.4s\n"
+    "ldr s18, [x19, x22]\n"
+    "fmla v2.4s, v14.4s, v7.4s\n"
+    "ldr s13, [x27, x22]\n"
+    "ldr x25, [%[inptrs], 136]\n"
+    "ldr x21, [%[inptrs], 104]\n"
+    "ldr x20, [%[inptrs], 72]\n"
+    "fmla v1.4s, v15.4s, v9.4s\n"
+    "ldr x27, [%[inptrs], 176]\n"
+    "fmla v3.4s, v14.4s, v5.4s\n"
+    "ldr s16, [x25, x22]\n"
+    "fmla v2.4s, v17.4s, v9.4s\n"
+    "ldr s17, [x21, x22]\n"
+    "fmla v0.4s, v16.4s, v12.4s\n"
+    "ldr s12, [x20, x22]\n"
+    "fmla v1.4s, v14.4s, v4.4s\n"
+    "ldr s15, [x27, x22]\n"
+    "fmax v3.4s, v3.4s, v10.4s\n"
+    "ldr x25, [%[inptrs], 144]\n"
+    "fmla v2.4s, v18.4s, v4.4s\n"
+    "ldr x21, [%[inptrs], 112]\n"
+    "fmla v0.4s, v17.4s, v11.4s\n"
+    "ldr s14, [x25, x22]\n"
+    "fmla v1.4s, v13.4s, v6.4s\n"
+    "ldr s11, [x21, x22]\n"
+    "ldr x27, [%[inptrs], 184]\n"
+    "ldr x25, [%[inptrs], 152]\n"
+    "ldr x21, [%[outptrs], 0]\n"
+    "fmla v2.4s, v17.4s, v6.4s\n"
+    "ldr x28, [%[outptrs], 16]\n"
+    "str s3, [x21, x26]\n"
+    "fmla v0.4s, v15.4s, v7.4s\n"
+    "fmla v1.4s, v16.4s, v8.4s\n"
+    "ldr s18, [x27, x22]\n"
+    "ldr s17, [x25, x22]\n"
+    "ldr x27, [%[inptrs], 192]\n"
+    "fmla v2.4s, v12.4s, v8.4s\n"
+    "ldr x21, [%[outptrs], 8]\n"
+    "fmla v0.4s, v14.4s, v9.4s\n"
+    "ldr s16, [x27, x22]\n"
+    "fmla v1.4s, v15.4s, v5.4s\n"
+    "add %[wbptr], %[wbptr], #40\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "add x22, x22, #4\n"
+    "fmla v2.4s, v11.4s, v5.4s\n"
+    "fmla v0.4s, v11.4s, v4.4s\n"
+    "fmax v1.4s, v1.4s, v10.4s\n"
+    "fmax v2.4s, v2.4s, v10.4s\n"
+    "str s1, [x28, x26]\n"
+    "fmla v0.4s, v18.4s, v6.4s\n"
+    "ldr x28, [%[outptrs], 24]\n"
+    "str s2, [x21, x26]\n"
+    "fmla v0.4s, v17.4s, v8.4s\n"
+    "fmla v0.4s, v16.4s, v5.4s\n"
+    "fmax v0.4s, v0.4s, v10.4s\n"
+    "str s0, [x28, x26]\n"
+    "add x26, x26, #4\n"
+    "7:\n"
+    : [wbptr] "+r" (weight_bias_ptr)
+    : [inptrs] "r" (inptrs), [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs)
+    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+  );
+}
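+// The ReLU6 variants below additionally materialise a vector of 6.0f
+// (fmov v22.4s, #6.0) and clamp from above with fmin after the fmax,
+// i.e. per lane out = std::min(std::max(acc, 0.0f), 6.0f).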
 
 template <>
-const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
-  },
-};
+template <>
+void Conv::execute_tile<ActivationFunction::ReLU6>(
+  int n_channels,
+  const void *weight_bias_ptr,
+  const float *input,
+  const unsigned int input_row_stride,
+  const unsigned int input_col_stride,
+  float *output,
+  const unsigned int output_row_stride,
+  const unsigned int output_col_stride
+)
+{
+  __asm __volatile(
+    "add x21, %[inptr0], %[input_row_stride]\n"
+    "add x23, %[input_col_stride1], %[input_col_stride1]\n"
+    "add x24, %[outptr0], %[output_row_stride]\n"
+    "add x28, x21, %[input_row_stride]\n"
+    "add x26, x23, %[input_col_stride1]\n"
+    "and x19, %[n_channels], #3\n"
+    "add x27, x28, %[input_row_stride]\n"
+    "add x25, x26, %[input_col_stride1]\n"
+    "lsr x20, %[n_channels], #2\n"
+    "add x22, x27, %[input_row_stride]\n"
+    "cbz x20, 4f\n"
+    "1:\n"
+    "ldr q14, [%[wbptr]]\n"
+    "subs x20, x20, #1\n"
+    "mov v5.16b, v14.16b\n"
+    "ldr q0, [%[wbptr], #16]\n"
+    "mov v11.16b, v14.16b\n"
+    "ldr q1, [%[wbptr], #32]\n"
+    "mov v12.16b, v14.16b\n"
+    "ldr q2, [%[wbptr], #48]\n"
+    "mov v10.16b, v14.16b\n"
+    "ldr q6, [%[wbptr], #64]\n"
+    "ldr q3, [%[wbptr], #80]\n"
+    "ldr q7, [%[wbptr], #96]\n"
+    "ldr q4, [%[wbptr], #112]\n"
+    "ldr q8, [%[wbptr], #128]\n"
+    "ldr q9, [%[wbptr], #144]\n"
+    "ldr q19, [%[inptr0]]\n"
+    "fmla v5.4s, v19.4s, v0.4s\n"
+    "ldr q15, [x21]\n"
+    "ldr q21, [%[inptr0], %[input_col_stride1]]\n"
+    "ldr q16, [x28]\n"
+    "fmla v11.4s, v16.4s, v0.4s\n"
+    "ldr q23, [x21, %[input_col_stride1]]\n"
+    "fmla v5.4s, v15.4s, v6.4s\n"
+    "ldr q18, [%[inptr0], x23]\n"
+    "ldr q17, [x27]\n"
+    "ldr q13, [x28, %[input_col_stride1]]\n"
+    "fmla v5.4s, v21.4s, v1.4s\n"
+    "fmla v5.4s, v16.4s, v4.4s\n"
+    "beq 3f\n"
+    "2:\n"
+    "fmla v5.4s, v23.4s, v3.4s\n"
+    "ldr q21, [x21, x23]\n"
+    "fmla v12.4s, v18.4s, v0.4s\n"
+    "ldr q20, [%[inptr0], x26]\n"
+    "fmla v11.4s, v17.4s, v6.4s\n"
+    "ldr q19, [x22]\n"
+    "fmla v5.4s, v18.4s, v2.4s\n"
+    "ldr q15, [x27, %[input_col_stride1]]\n"
+    "fmla v12.4s, v21.4s, v6.4s\n"
+    "ldr q16, [x28, x23]\n"
+    "fmla v11.4s, v13.4s, v1.4s\n"
+    "ldr q17, [x21, x26]\n"
+    "fmla v5.4s, v13.4s, v8.4s\n"
+    "ldr q14, [%[inptr0], x25]\n"
+    "fmla v12.4s, v20.4s, v1.4s\n"
+    "ldr q20, [x22, %[input_col_stride1]]\n"
+    "fmla v11.4s, v19.4s, v4.4s\n"
+    "ldr q19, [x27, x23]\n"
+    "fmla v5.4s, v21.4s, v7.4s\n"
+    "ldr q22, [x28, x26]\n"
+    "fmla v12.4s, v16.4s, v4.4s\n"
+    "ldr q21, [x21, x25]\n"
+    "fmla v11.4s, v15.4s, v3.4s\n"
+    "ldr q23, [x22, x23]\n"
+    "fmla v5.4s, v16.4s, v9.4s\n"
+    "ldr q18, [x27, x26]\n"
+    "fmla v10.4s, v16.4s, v0.4s\n"
+    "ldr q15, [x28, x25]\n"
+    "fmla v11.4s, v16.4s, v2.4s\n"
+    "ldr q16, [x22, x26]\n"
+    "fmla v12.4s, v17.4s, v3.4s\n"
+    "ldr q17, [x27, x25]\n"
+    "fmla v10.4s, v19.4s, v6.4s\n"
+    "ldr q13, [x22, x25]\n"
+    "fmla v11.4s, v20.4s, v8.4s\n"
+    "add %[wbptr], %[wbptr], #160\n"
+    "fmla v12.4s, v14.4s, v2.4s\n"
+    "ldr q14, [%[wbptr]]\n"
+    "fmla v10.4s, v22.4s, v1.4s\n"
+    "ldr q0, [%[wbptr], #16]\n"
+    "fmla v11.4s, v19.4s, v7.4s\n"
+    "ldr q6, [%[wbptr], #64]\n"
+    "fmla v12.4s, v22.4s, v8.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v10.4s, v23.4s, v4.4s\n"
+    "ldr q1, [%[wbptr], #32]\n"
+    "fmla v11.4s, v23.4s, v9.4s\n"
+    "add %[inptr0], %[inptr0], #16\n"
+    "fmla v12.4s, v21.4s, v7.4s\n"
+    "ldr q19, [%[inptr0]]\n"
+    "fmla v10.4s, v18.4s, v3.4s\n"
+    "ldr q4, [%[wbptr], #112]\n"
+    "movi v20.16b, #0\n"
+    "ldr q21, [%[inptr0], %[input_col_stride1]]\n"
+    "fmla v12.4s, v15.4s, v9.4s\n"
+    "ldr q18, [%[inptr0], x23]\n"
+    "fmla v10.4s, v15.4s, v2.4s\n"
+    "ldr q3, [%[wbptr], #80]\n"
+    "fmov v22.4s, #6.0\n"
+    "add x21, x21, #16\n"
+    "fmax v5.4s, v5.4s, v20.4s\n"
+    "ldr q15, [x21]\n"
+    "fmla v10.4s, v16.4s, v8.4s\n"
+    "ldr q2, [%[wbptr], #48]\n"
+    "fmin v5.4s, v5.4s, v22.4s\n"
+    "ldr q23, [x21, %[input_col_stride1]]\n"
+    "fmax v12.4s, v12.4s, v20.4s\n"
+    "add x28, x28, #16\n"
+    "str q5, [%[outptr0]]\n"
+    "fmla v10.4s, v17.4s, v7.4s\n"
+    "fmin v12.4s, v12.4s, v22.4s\n"
+    "ldr q8, [%[wbptr], #128]\n"
+    "fmax v11.4s, v11.4s, v20.4s\n"
+    "ldr q16, [x28]\n"
+    "str q12, [%[outptr0], %[output_col_stride1]]\n"
+    "fmla v10.4s, v13.4s, v9.4s\n"
+    "fmin v11.4s, v11.4s, v22.4s\n"
+    "ldr q7, [%[wbptr], #96]\n"
+    "mov v5.16b, v14.16b\n"
+    "ldr q13, [x28, %[input_col_stride1]]\n"
+    "str q11, [x24]\n"
+    "fmax v10.4s, v10.4s, v20.4s\n"
+    "mov v11.16b, v14.16b\n"
+    "ldr q9, [%[wbptr], #144]\n"
+    "fmin v10.4s, v10.4s, v22.4s\n"
+    "add x27, x27, #16\n"
+    "mov v12.16b, v14.16b\n"
+    "ldr q17, [x27]\n"
+    "str q10, [x24, %[output_col_stride1]]\n"
+    "fmla v5.4s, v19.4s, v0.4s\n"
+    "mov v10.16b, v14.16b\n"
+    "add x22, x22, #16\n"
+    "fmla v11.4s, v16.4s, v0.4s\n"
+    "add %[outptr0], %[outptr0], #16\n"
+    "fmla v5.4s, v15.4s, v6.4s\n"
+    "add x24, x24, #16\n"
+    "subs x20, x20, #1\n"
+    "fmla v5.4s, v21.4s, v1.4s\n"
+    "fmla v5.4s, v16.4s, v4.4s\n"
+    "bne 2b\n"
+    "3:\n"
+    "fmla v5.4s, v23.4s, v3.4s\n"
+    "ldr q21, [x21, x23]\n"
+    "fmla v12.4s, v18.4s, v0.4s\n"
+    "ldr q20, [%[inptr0], x26]\n"
+    "fmla v11.4s, v17.4s, v6.4s\n"
+    "ldr q19, [x22]\n"
+    "fmla v5.4s, v18.4s, v2.4s\n"
+    "ldr q15, [x27, %[input_col_stride1]]\n"
+    "fmla v12.4s, v21.4s, v6.4s\n"
+    "ldr q16, [x28, x23]\n"
+    "fmla v11.4s, v13.4s, v1.4s\n"
+    "ldr q17, [x21, x26]\n"
+    "fmla v5.4s, v13.4s, v8.4s\n"
+    "ldr q14, [%[inptr0], x25]\n"
+    "fmla v12.4s, v20.4s, v1.4s\n"
+    "ldr q20, [x22, %[input_col_stride1]]\n"
+    "fmla v11.4s, v19.4s, v4.4s\n"
+    "ldr q19, [x27, x23]\n"
+    "fmla v5.4s, v21.4s, v7.4s\n"
+    "ldr q22, [x28, x26]\n"
+    "fmla v12.4s, v16.4s, v4.4s\n"
+    "ldr q21, [x21, x25]\n"
+    "fmla v11.4s, v15.4s, v3.4s\n"
+    "ldr q23, [x22, x23]\n"
+    "fmla v5.4s, v16.4s, v9.4s\n"
+    "ldr q18, [x27, x26]\n"
+    "fmla v10.4s, v16.4s, v0.4s\n"
+    "ldr q15, [x28, x25]\n"
+    "fmla v11.4s, v16.4s, v2.4s\n"
+    "ldr q16, [x22, x26]\n"
+    "fmla v12.4s, v17.4s, v3.4s\n"
+    "ldr q17, [x27, x25]\n"
+    "fmla v10.4s, v19.4s, v6.4s\n"
+    "ldr q13, [x22, x25]\n"
+    "fmla v11.4s, v20.4s, v8.4s\n"
+    "add %[wbptr], %[wbptr], #160\n"
+    "fmla v12.4s, v14.4s, v2.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v10.4s, v22.4s, v1.4s\n"
+    "add %[inptr0], %[inptr0], #16\n"
+    "fmla v11.4s, v19.4s, v7.4s\n"
+    "add x21, x21, #16\n"
+    "fmla v12.4s, v22.4s, v8.4s\n"
+    "add x28, x28, #16\n"
+    "fmla v10.4s, v23.4s, v4.4s\n"
+    "add x27, x27, #16\n"
+    "fmla v11.4s, v23.4s, v9.4s\n"
+    "add x22, x22, #16\n"
+    "fmla v12.4s, v21.4s, v7.4s\n"
+    "movi v20.16b, #0\n"
+    "fmla v10.4s, v18.4s, v3.4s\n"
+    "fmov v22.4s, #6.0\n"
+    "fmax v5.4s, v5.4s, v20.4s\n"
+    "fmax v11.4s, v11.4s, v20.4s\n"
+    "fmla v12.4s, v15.4s, v9.4s\n"
+    "fmla v10.4s, v15.4s, v2.4s\n"
+    "fmin v5.4s, v5.4s, v22.4s\n"
+    "fmin v11.4s, v11.4s, v22.4s\n"
+    "fmax v12.4s, v12.4s, v20.4s\n"
+    "str q5, [%[outptr0]]\n"
+    "str q11, [x24]\n"
+    "fmla v10.4s, v16.4s, v8.4s\n"
+    "fmin v12.4s, v12.4s, v22.4s\n"
+    "str q12, [%[outptr0], %[output_col_stride1]]\n"
+    "fmla v10.4s, v17.4s, v7.4s\n"
+    "add %[outptr0], %[outptr0], #16\n"
+    "fmla v10.4s, v13.4s, v9.4s\n"
+    "fmax v10.4s, v10.4s, v20.4s\n"
+    "fmin v10.4s, v10.4s, v22.4s\n"
+    "str q10, [x24, %[output_col_stride1]]\n"
+    "add x24, x24, #16\n"
+    "4:\n"
+    "cbz x19, 7f\n"
+    "ldr s14, [%[wbptr]]\n"
+    "mov v5.16b, v14.16b\n"
+    "ldr s0, [%[wbptr], #4]\n"
+    "mov v11.16b, v14.16b\n"
+    "ldr s1, [%[wbptr], #8]\n"
+    "mov v12.16b, v14.16b\n"
+    "ldr s2, [%[wbptr], #12]\n"
+    "mov v10.16b, v14.16b\n"
+    "ldr s6, [%[wbptr], #16]\n"
+    "ldr s3, [%[wbptr], #20]\n"
+    "subs x19, x19, #1\n"
+    "ldr s7, [%[wbptr], #24]\n"
+    "ldr s4, [%[wbptr], #28]\n"
+    "ldr s8, [%[wbptr], #32]\n"
+    "ldr s9, [%[wbptr], #36]\n"
+    "ldr s19, [%[inptr0]]\n"
+    "ldr s15, [x21]\n"
+    "fmla v5.4s, v19.4s, v0.4s\n"
+    "ldr s21, [%[inptr0], %[input_col_stride1]]\n"
+    "ldr s16, [x28]\n"
+    "ldr s23, [x21, %[input_col_stride1]]\n"
+    "fmla v11.4s, v16.4s, v0.4s\n"
+    "ldr s18, [%[inptr0], x23]\n"
+    "fmla v5.4s, v15.4s, v6.4s\n"
+    "ldr s17, [x27]\n"
+    "ldr s13, [x28, %[input_col_stride1]]\n"
+    "fmla v5.4s, v21.4s, v1.4s\n"
+    "fmla v5.4s, v16.4s, v4.4s\n"
+    "beq 6f\n"
+    "5:\n"
+    "fmla v5.4s, v23.4s, v3.4s\n"
+    "ldr s21, [x21, x23]\n"
+    "fmla v12.4s, v18.4s, v0.4s\n"
+    "ldr s20, [%[inptr0], x26]\n"
+    "fmla v11.4s, v17.4s, v6.4s\n"
+    "ldr s19, [x22]\n"
+    "fmla v5.4s, v18.4s, v2.4s\n"
+    "ldr s15, [x27, %[input_col_stride1]]\n"
+    "fmla v12.4s, v21.4s, v6.4s\n"
+    "ldr s16, [x28, x23]\n"
+    "fmla v11.4s, v13.4s, v1.4s\n"
+    "ldr s17, [x21, x26]\n"
+    "fmla v5.4s, v13.4s, v8.4s\n"
+    "ldr s14, [%[inptr0], x25]\n"
+    "fmla v12.4s, v20.4s, v1.4s\n"
+    "ldr s20, [x22, %[input_col_stride1]]\n"
+    "fmla v11.4s, v19.4s, v4.4s\n"
+    "ldr s19, [x27, x23]\n"
+    "fmla v5.4s, v21.4s, v7.4s\n"
+    "ldr s22, [x28, x26]\n"
+    "fmla v12.4s, v16.4s, v4.4s\n"
+    "ldr s21, [x21, x25]\n"
+    "fmla v11.4s, v15.4s, v3.4s\n"
+    "ldr s23, [x22, x23]\n"
+    "fmla v5.4s, v16.4s, v9.4s\n"
+    "ldr s18, [x27, x26]\n"
+    "fmla v10.4s, v16.4s, v0.4s\n"
+    "ldr s15, [x28, x25]\n"
+    "fmla v11.4s, v16.4s, v2.4s\n"
+    "ldr s16, [x22, x26]\n"
+    "fmla v12.4s, v17.4s, v3.4s\n"
+    "ldr s17, [x27, x25]\n"
+    "fmla v10.4s, v19.4s, v6.4s\n"
+    "ldr s13, [x22, x25]\n"
+    "fmla v11.4s, v20.4s, v8.4s\n"
+    "add %[wbptr], %[wbptr], #40\n"
+    "fmla v12.4s, v14.4s, v2.4s\n"
+    "ldr s14, [%[wbptr]]\n"
+    "fmla v10.4s, v22.4s, v1.4s\n"
+    "ldr s0, [%[wbptr], #4]\n"
+    "fmla v11.4s, v19.4s, v7.4s\n"
+    "ldr s6, [%[wbptr], #16]\n"
+    "fmla v12.4s, v22.4s, v8.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v10.4s, v23.4s, v4.4s\n"
+    "ldr s1, [%[wbptr], #8]\n"
+    "fmla v11.4s, v23.4s, v9.4s\n"
+    "add %[inptr0], %[inptr0], #4\n"
+    "fmla v12.4s, v21.4s, v7.4s\n"
+    "ldr s19, [%[inptr0]]\n"
+    "fmla v10.4s, v18.4s, v3.4s\n"
+    "ldr s4, [%[wbptr], #28]\n"
+    "movi v20.16b, #0\n"
+    "ldr s21, [%[inptr0], %[input_col_stride1]]\n"
+    "fmla v12.4s, v15.4s, v9.4s\n"
+    "ldr s18, [%[inptr0], x23]\n"
+    "fmla v10.4s, v15.4s, v2.4s\n"
+    "ldr s3, [%[wbptr], #20]\n"
+    "fmov v22.4s, #6.0\n"
+    "add x21, x21, #4\n"
+    "fmax v5.4s, v5.4s, v20.4s\n"
+    "ldr s15, [x21]\n"
+    "fmla v10.4s, v16.4s, v8.4s\n"
+    "ldr s2, [%[wbptr], #12]\n"
+    "fmin v5.4s, v5.4s, v22.4s\n"
+    "ldr s23, [x21, %[input_col_stride1]]\n"
+    "fmax v12.4s, v12.4s, v20.4s\n"
+    "add x28, x28, #4\n"
+    "str s5, [%[outptr0]]\n"
+    "fmla v10.4s, v17.4s, v7.4s\n"
+    "fmin v12.4s, v12.4s, v22.4s\n"
+    "ldr s8, [%[wbptr], #32]\n"
+    "fmax v11.4s, v11.4s, v20.4s\n"
+    "ldr s16, [x28]\n"
+    "str s12, [%[outptr0], %[output_col_stride1]]\n"
+    "fmla v10.4s, v13.4s, v9.4s\n"
+    "fmin v11.4s, v11.4s, v22.4s\n"
+    "ldr s7, [%[wbptr], #24]\n"
+    "mov v5.16b, v14.16b\n"
+    "ldr s13, [x28, %[input_col_stride1]]\n"
+    "str s11, [x24]\n"
+    "fmax v10.4s, v10.4s, v20.4s\n"
+    "mov v11.16b, v14.16b\n"
+    "ldr s9, [%[wbptr], #36]\n"
+    "fmin v10.4s, v10.4s, v22.4s\n"
+    "add x27, x27, #4\n"
+    "mov v12.16b, v14.16b\n"
+    "ldr s17, [x27]\n"
+    "str s10, [x24, %[output_col_stride1]]\n"
+    "fmla v5.4s, v19.4s, v0.4s\n"
+    "mov v10.16b, v14.16b\n"
+    "add x22, x22, #4\n"
+    "fmla v11.4s, v16.4s, v0.4s\n"
+    "add %[outptr0], %[outptr0], #4\n"
+    "fmla v5.4s, v15.4s, v6.4s\n"
+    "add x24, x24, #4\n"
+    "subs x19, x19, #1\n"
+    "fmla v5.4s, v21.4s, v1.4s\n"
+    "fmla v5.4s, v16.4s, v4.4s\n"
+    "bne 5b\n"
+    "6:\n"
+    "fmla v5.4s, v23.4s, v3.4s\n"
+    "ldr s21, [x21, x23]\n"
+    "fmla v12.4s, v18.4s, v0.4s\n"
+    "ldr s20, [%[inptr0], x26]\n"
+    "fmla v11.4s, v17.4s, v6.4s\n"
+    "ldr s19, [x22]\n"
+    "fmla v5.4s, v18.4s, v2.4s\n"
+    "ldr s15, [x27, %[input_col_stride1]]\n"
+    "fmla v12.4s, v21.4s, v6.4s\n"
+    "ldr s16, [x28, x23]\n"
+    "fmla v11.4s, v13.4s, v1.4s\n"
+    "ldr s17, [x21, x26]\n"
+    "fmla v5.4s, v13.4s, v8.4s\n"
+    "ldr s14, [%[inptr0], x25]\n"
+    "fmla v12.4s, v20.4s, v1.4s\n"
+    "ldr s20, [x22, %[input_col_stride1]]\n"
+    "fmla v11.4s, v19.4s, v4.4s\n"
+    "ldr s19, [x27, x23]\n"
+    "fmla v5.4s, v21.4s, v7.4s\n"
+    "ldr s22, [x28, x26]\n"
+    "fmla v12.4s, v16.4s, v4.4s\n"
+    "ldr s21, [x21, x25]\n"
+    "fmla v11.4s, v15.4s, v3.4s\n"
+    "ldr s23, [x22, x23]\n"
+    "fmla v5.4s, v16.4s, v9.4s\n"
+    "ldr s18, [x27, x26]\n"
+    "fmla v10.4s, v16.4s, v0.4s\n"
+    "ldr s15, [x28, x25]\n"
+    "fmla v11.4s, v16.4s, v2.4s\n"
+    "ldr s16, [x22, x26]\n"
+    "fmla v12.4s, v17.4s, v3.4s\n"
+    "ldr s17, [x27, x25]\n"
+    "fmla v10.4s, v19.4s, v6.4s\n"
+    "ldr s13, [x22, x25]\n"
+    "fmla v11.4s, v20.4s, v8.4s\n"
+    "add %[wbptr], %[wbptr], #40\n"
+    "fmla v12.4s, v14.4s, v2.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v10.4s, v22.4s, v1.4s\n"
+    "add %[inptr0], %[inptr0], #4\n"
+    "fmla v11.4s, v19.4s, v7.4s\n"
+    "add x21, x21, #4\n"
+    "fmla v12.4s, v22.4s, v8.4s\n"
+    "add x28, x28, #4\n"
+    "fmla v10.4s, v23.4s, v4.4s\n"
+    "add x27, x27, #4\n"
+    "fmla v11.4s, v23.4s, v9.4s\n"
+    "add x22, x22, #4\n"
+    "fmla v12.4s, v21.4s, v7.4s\n"
+    "movi v20.16b, #0\n"
+    "fmla v10.4s, v18.4s, v3.4s\n"
+    "fmov v22.4s, #6.0\n"
+    "fmax v5.4s, v5.4s, v20.4s\n"
+    "fmax v11.4s, v11.4s, v20.4s\n"
+    "fmla v12.4s, v15.4s, v9.4s\n"
+    "fmla v10.4s, v15.4s, v2.4s\n"
+    "fmin v5.4s, v5.4s, v22.4s\n"
+    "fmin v11.4s, v11.4s, v22.4s\n"
+    "fmax v12.4s, v12.4s, v20.4s\n"
+    "str s5, [%[outptr0]]\n"
+    "str s11, [x24]\n"
+    "fmla v10.4s, v16.4s, v8.4s\n"
+    "fmin v12.4s, v12.4s, v22.4s\n"
+    "str s12, [%[outptr0], %[output_col_stride1]]\n"
+    "fmla v10.4s, v17.4s, v7.4s\n"
+    "add %[outptr0], %[outptr0], #4\n"
+    "fmla v10.4s, v13.4s, v9.4s\n"
+    "fmax v10.4s, v10.4s, v20.4s\n"
+    "fmin v10.4s, v10.4s, v22.4s\n"
+    "str s10, [x24, %[output_col_stride1]]\n"
+    "add x24, x24, #4\n"
+    "7:\n"
+    : [outptr0] "+r" (output), [inptr0] "+r" (input), [wbptr] "+r" (weight_bias_ptr)
+    : [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_col_stride1] "r" (input_col_stride * sizeof(float))
+    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+  );
+}
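The ReLU6 variants above fuse the activation into the stores: each accumulator
is clamped with "fmax" against a zero register ("movi ... #0") and "fmin"
against a register holding 6.0 ("fmov ... #6.0") before its "str". A minimal
scalar sketch of that clamp, assuming one accumulator per output value; the
kernels apply it to four NEON lanes at a time:

    #include <algorithm>

    // Sketch only: mirrors the fmax-with-0 / fmin-with-6 pair before each store.
    static inline float relu6(float acc)
    {
        return std::min(std::max(acc, 0.0f), 6.0f);
    }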
 
 template <>
-const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
-  },
-};
-
 template <>
-const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
+void Conv::execute_tile<ActivationFunction::ReLU6>(
+  int n_channels,
+  const void *weight_bias_ptr,
+  const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
+  float *outptrs[Base::output_tile_rows][Base::output_tile_cols]
+)
+{
+  __asm __volatile(
+    "mov x27, xzr\n"
+    "mov x28, xzr\n"
+    "and x26, %[n_channels], #3\n"
+    "lsr x25, %[n_channels], #2\n"
+    "cbz x25, 4f\n"
+    "1:\n"
+    "ldr q15, [%[wbptr]]\n"
+    "ldr x21, [%[inptrs], 0]\n"
+    "mov v8.16b, v15.16b\n"
+    "ldr q14, [%[wbptr], #16]\n"
+    "mov v3.16b, v15.16b\n"
+    "ldr q10, [%[wbptr], #32]\n"
+    "mov v2.16b, v15.16b\n"
+    "ldr q7, [%[wbptr], #48]\n"
+    "mov v4.16b, v15.16b\n"
+    "ldr q13, [%[wbptr], #64]\n"
+    "ldr q5, [%[wbptr], #80]\n"
+    "ldr x22, [%[inptrs], 40]\n"
+    "ldr q0, [%[wbptr], #96]\n"
+    "ldr x20, [%[inptrs], 80]\n"
+    "ldr q9, [%[wbptr], #112]\n"
+    "ldr x23, [%[inptrs], 120]\n"
+    "ldr q6, [%[wbptr], #128]\n"
+    "subs x25, x25, #1\n"
+    "ldr q1, [%[wbptr], #144]\n"
+    "ldr q17, [x21, x27]\n"
+    "fmla v8.4s, v17.4s, v14.4s\n"
+    "ldr q18, [x22, x27]\n"
+    "ldr q16, [x20, x27]\n"
+    "ldr x21, [%[inptrs], 8]\n"
+    "ldr q17, [x23, x27]\n"
+    "ldr x22, [%[inptrs], 48]\n"
+    "ldr q11, [x21, x27]\n"
+    "ldr x20, [%[inptrs], 88]\n"
+    "fmla v8.4s, v18.4s, v13.4s\n"
+    "ldr q19, [x22, x27]\n"
+    "ldr q15, [x20, x27]\n"
+    "ldr x21, [%[inptrs], 16]\n"
+    "ldr q12, [x21, x27]\n"
+    "fmla v8.4s, v11.4s, v10.4s\n"
+    "fmla v8.4s, v16.4s, v9.4s\n"
+    "beq 3f\n"
+    "2:\n"
+    "fmla v3.4s, v16.4s, v14.4s\n"
+    "ldr x22, [%[inptrs], 56]\n"
+    "fmla v8.4s, v19.4s, v5.4s\n"
+    "ldr x21, [%[inptrs], 24]\n"
+    "fmla v2.4s, v12.4s, v14.4s\n"
+    "ldr q16, [x22, x27]\n"
+    "movi v11.16b, #0\n"
+    "ldr q18, [x21, x27]\n"
+    "fmla v3.4s, v17.4s, v13.4s\n"
+    "ldr x20, [%[inptrs], 160]\n"
+    "fmla v8.4s, v12.4s, v7.4s\n"
+    "ldr x23, [%[inptrs], 128]\n"
+    "fmla v2.4s, v16.4s, v13.4s\n"
+    "ldr q19, [x20, x27]\n"
+    "fmov v12.4s, #6.0\n"
+    "ldr q17, [x23, x27]\n"
+    "fmla v3.4s, v15.4s, v10.4s\n"
+    "ldr x20, [%[inptrs], 96]\n"
+    "fmla v8.4s, v15.4s, v6.4s\n"
+    "ldr x22, [%[inptrs], 64]\n"
+    "fmla v2.4s, v18.4s, v10.4s\n"
+    "ldr q15, [x20, x27]\n"
+    "fmla v4.4s, v15.4s, v14.4s\n"
+    "ldr q18, [x22, x27]\n"
+    "fmla v3.4s, v19.4s, v9.4s\n"
+    "ldr x21, [%[inptrs], 32]\n"
+    "fmla v8.4s, v16.4s, v0.4s\n"
+    "ldr x20, [%[inptrs], 168]\n"
+    "fmla v2.4s, v15.4s, v9.4s\n"
+    "ldr q19, [x21, x27]\n"
+    "ldr q16, [x20, x27]\n"
+    "ldr x23, [%[inptrs], 136]\n"
+    "fmla v3.4s, v17.4s, v5.4s\n"
+    "ldr x20, [%[inptrs], 104]\n"
+    "fmla v8.4s, v15.4s, v1.4s\n"
+    "ldr q14, [x23, x27]\n"
+    "fmla v2.4s, v18.4s, v5.4s\n"
+    "ldr q17, [x20, x27]\n"
+    "fmla v4.4s, v14.4s, v13.4s\n"
+    "ldr x22, [%[inptrs], 72]\n"
+    "fmla v3.4s, v15.4s, v7.4s\n"
+    "ldr x20, [%[inptrs], 176]\n"
+    "fmax v8.4s, v8.4s, v11.4s\n"
+    "ldr q18, [x22, x27]\n"
+    "fmla v2.4s, v19.4s, v7.4s\n"
+    "ldr q13, [x20, x27]\n"
+    "fmla v4.4s, v17.4s, v10.4s\n"
+    "ldr x23, [%[inptrs], 144]\n"
+    "fmla v3.4s, v16.4s, v6.4s\n"
+    "ldr x20, [%[inptrs], 112]\n"
+    "fmin v8.4s, v8.4s, v12.4s\n"
+    "ldr q10, [x23, x27]\n"
+    "fmla v2.4s, v17.4s, v6.4s\n"
+    "ldr q15, [x20, x27]\n"
+    "fmla v4.4s, v13.4s, v9.4s\n"
+    "ldr x20, [%[inptrs], 184]\n"
+    "fmla v3.4s, v14.4s, v0.4s\n"
+    "ldr x23, [%[inptrs], 152]\n"
+    "ldr q9, [x20, x27]\n"
+    "ldr x22, [%[outptrs], 0]\n"
+    "fmla v2.4s, v18.4s, v0.4s\n"
+    "ldr q19, [x23, x27]\n"
+    "str q8, [x22, x28]\n"
+    "fmla v4.4s, v10.4s, v5.4s\n"
+    "fmla v3.4s, v13.4s, v1.4s\n"
+    "ldr x20, [%[inptrs], 192]\n"
+    "ldr x22, [%[outptrs], 8]\n"
+    "ldr x24, [%[outptrs], 16]\n"
+    "add %[wbptr], %[wbptr], #160\n"
+    "fmla v2.4s, v15.4s, v1.4s\n"
+    "ldr q16, [x20, x27]\n"
+    "fmla v4.4s, v15.4s, v7.4s\n"
+    "ldr q15, [%[wbptr]]\n"
+    "fmax v3.4s, v3.4s, v11.4s\n"
+    "ldr q14, [%[wbptr], #16]\n"
+    "mov v8.16b, v15.16b\n"
+    "ldr q10, [%[wbptr], #32]\n"
+    "fmax v2.4s, v2.4s, v11.4s\n"
+    "ldr q13, [%[wbptr], #64]\n"
+    "fmla v4.4s, v9.4s, v6.4s\n"
+    "ldr q7, [%[wbptr], #48]\n"
+    "fmin v3.4s, v3.4s, v12.4s\n"
+    "ldr q5, [%[wbptr], #80]\n"
+    "fmin v2.4s, v2.4s, v12.4s\n"
+    "ldr q9, [%[wbptr], #112]\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "add x27, x27, #16\n"
+    "str q3, [x24, x28]\n"
+    "fmla v4.4s, v19.4s, v0.4s\n"
+    "str q2, [x22, x28]\n"
+    "mov v3.16b, v15.16b\n"
+    "mov v2.16b, v15.16b\n"
+    "ldr q6, [%[wbptr], #128]\n"
+    "ldr x24, [%[outptrs], 24]\n"
+    "ldr x21, [%[inptrs], 0]\n"
+    "ldr x22, [%[inptrs], 40]\n"
+    "fmla v4.4s, v16.4s, v1.4s\n"
+    "ldr q0, [%[wbptr], #96]\n"
+    "ldr q17, [x21, x27]\n"
+    "ldr x20, [%[inptrs], 80]\n"
+    "fmla v8.4s, v17.4s, v14.4s\n"
+    "ldr q18, [x22, x27]\n"
+    "ldr q16, [x20, x27]\n"
+    "ldr x21, [%[inptrs], 8]\n"
+    "fmax v4.4s, v4.4s, v11.4s\n"
+    "ldr q1, [%[wbptr], #144]\n"
+    "ldr q11, [x21, x27]\n"
+    "ldr x22, [%[inptrs], 48]\n"
+    "fmla v8.4s, v18.4s, v13.4s\n"
+    "ldr x21, [%[inptrs], 16]\n"
+    "fmin v4.4s, v4.4s, v12.4s\n"
+    "ldr q19, [x22, x27]\n"
+    "ldr q12, [x21, x27]\n"
+    "ldr x23, [%[inptrs], 120]\n"
+    "ldr x20, [%[inptrs], 88]\n"
+    "subs x25, x25, #1\n"
+    "str q4, [x24, x28]\n"
+    "mov v4.16b, v15.16b\n"
+    "ldr q17, [x23, x27]\n"
+    "fmla v8.4s, v11.4s, v10.4s\n"
+    "ldr q15, [x20, x27]\n"
+    "add x28, x28, #16\n"
+    "fmla v8.4s, v16.4s, v9.4s\n"
+    "bne 2b\n"
+    "3:\n"
+    "fmla v3.4s, v16.4s, v14.4s\n"
+    "ldr x22, [%[inptrs], 56]\n"
+    "fmla v8.4s, v19.4s, v5.4s\n"
+    "ldr x21, [%[inptrs], 24]\n"
+    "fmla v2.4s, v12.4s, v14.4s\n"
+    "ldr q16, [x22, x27]\n"
+    "movi v11.16b, #0\n"
+    "ldr q18, [x21, x27]\n"
+    "fmla v3.4s, v17.4s, v13.4s\n"
+    "ldr x20, [%[inptrs], 160]\n"
+    "fmla v8.4s, v12.4s, v7.4s\n"
+    "ldr x23, [%[inptrs], 128]\n"
+    "fmla v2.4s, v16.4s, v13.4s\n"
+    "ldr q19, [x20, x27]\n"
+    "fmov v12.4s, #6.0\n"
+    "ldr q17, [x23, x27]\n"
+    "fmla v3.4s, v15.4s, v10.4s\n"
+    "ldr x20, [%[inptrs], 96]\n"
+    "fmla v8.4s, v15.4s, v6.4s\n"
+    "ldr x22, [%[inptrs], 64]\n"
+    "fmla v2.4s, v18.4s, v10.4s\n"
+    "ldr q15, [x20, x27]\n"
+    "fmla v4.4s, v15.4s, v14.4s\n"
+    "ldr q18, [x22, x27]\n"
+    "fmla v3.4s, v19.4s, v9.4s\n"
+    "ldr x21, [%[inptrs], 32]\n"
+    "fmla v8.4s, v16.4s, v0.4s\n"
+    "ldr x20, [%[inptrs], 168]\n"
+    "fmla v2.4s, v15.4s, v9.4s\n"
+    "ldr q19, [x21, x27]\n"
+    "ldr q16, [x20, x27]\n"
+    "ldr x23, [%[inptrs], 136]\n"
+    "fmla v3.4s, v17.4s, v5.4s\n"
+    "ldr x20, [%[inptrs], 104]\n"
+    "fmla v8.4s, v15.4s, v1.4s\n"
+    "ldr q14, [x23, x27]\n"
+    "fmla v2.4s, v18.4s, v5.4s\n"
+    "ldr q17, [x20, x27]\n"
+    "fmla v4.4s, v14.4s, v13.4s\n"
+    "ldr x22, [%[inptrs], 72]\n"
+    "fmla v3.4s, v15.4s, v7.4s\n"
+    "ldr x20, [%[inptrs], 176]\n"
+    "fmax v8.4s, v8.4s, v11.4s\n"
+    "ldr q18, [x22, x27]\n"
+    "fmla v2.4s, v19.4s, v7.4s\n"
+    "ldr q13, [x20, x27]\n"
+    "fmla v4.4s, v17.4s, v10.4s\n"
+    "ldr x23, [%[inptrs], 144]\n"
+    "fmla v3.4s, v16.4s, v6.4s\n"
+    "ldr x20, [%[inptrs], 112]\n"
+    "fmin v8.4s, v8.4s, v12.4s\n"
+    "ldr q10, [x23, x27]\n"
+    "fmla v2.4s, v17.4s, v6.4s\n"
+    "ldr q15, [x20, x27]\n"
+    "fmla v4.4s, v13.4s, v9.4s\n"
+    "ldr x20, [%[inptrs], 184]\n"
+    "fmla v3.4s, v14.4s, v0.4s\n"
+    "ldr x23, [%[inptrs], 152]\n"
+    "ldr q9, [x20, x27]\n"
+    "ldr x22, [%[outptrs], 0]\n"
+    "fmla v2.4s, v18.4s, v0.4s\n"
+    "ldr q19, [x23, x27]\n"
+    "str q8, [x22, x28]\n"
+    "fmla v4.4s, v10.4s, v5.4s\n"
+    "fmla v3.4s, v13.4s, v1.4s\n"
+    "ldr x20, [%[inptrs], 192]\n"
+    "ldr x22, [%[outptrs], 8]\n"
+    "ldr x24, [%[outptrs], 16]\n"
+    "add %[wbptr], %[wbptr], #160\n"
+    "fmla v2.4s, v15.4s, v1.4s\n"
+    "ldr q16, [x20, x27]\n"
+    "fmla v4.4s, v15.4s, v7.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmax v3.4s, v3.4s, v11.4s\n"
+    "add x27, x27, #16\n"
+    "fmax v2.4s, v2.4s, v11.4s\n"
+    "fmla v4.4s, v9.4s, v6.4s\n"
+    "fmin v3.4s, v3.4s, v12.4s\n"
+    "fmin v2.4s, v2.4s, v12.4s\n"
+    "str q3, [x24, x28]\n"
+    "fmla v4.4s, v19.4s, v0.4s\n"
+    "str q2, [x22, x28]\n"
+    "ldr x24, [%[outptrs], 24]\n"
+    "fmla v4.4s, v16.4s, v1.4s\n"
+    "fmax v4.4s, v4.4s, v11.4s\n"
+    "fmin v4.4s, v4.4s, v12.4s\n"
+    "str q4, [x24, x28]\n"
+    "add x28, x28, #16\n"
+    "4:\n"
+    "cbz x26, 7f\n"
+    "ldr s15, [%[wbptr]]\n"
+    "mov v8.16b, v15.16b\n"
+    "ldr s14, [%[wbptr], #4]\n"
+    "mov v3.16b, v15.16b\n"
+    "ldr s10, [%[wbptr], #8]\n"
+    "mov v2.16b, v15.16b\n"
+    "ldr s7, [%[wbptr], #12]\n"
+    "mov v4.16b, v15.16b\n"
+    "ldr s13, [%[wbptr], #16]\n"
+    "ldr s5, [%[wbptr], #20]\n"
+    "ldr x21, [%[inptrs], 0]\n"
+    "ldr s0, [%[wbptr], #24]\n"
+    "ldr x22, [%[inptrs], 40]\n"
+    "ldr s9, [%[wbptr], #28]\n"
+    "ldr x20, [%[inptrs], 80]\n"
+    "ldr s6, [%[wbptr], #32]\n"
+    "ldr x23, [%[inptrs], 120]\n"
+    "ldr s1, [%[wbptr], #36]\n"
+    "subs x26, x26, #1\n"
+    "ldr s17, [x21, x27]\n"
+    "ldr s18, [x22, x27]\n"
+    "fmla v8.4s, v17.4s, v14.4s\n"
+    "ldr s16, [x20, x27]\n"
+    "ldr s17, [x23, x27]\n"
+    "ldr x21, [%[inptrs], 8]\n"
+    "ldr x22, [%[inptrs], 48]\n"
+    "ldr x20, [%[inptrs], 88]\n"
+    "ldr s11, [x21, x27]\n"
+    "fmla v8.4s, v18.4s, v13.4s\n"
+    "ldr s19, [x22, x27]\n"
+    "ldr s15, [x20, x27]\n"
+    "ldr x21, [%[inptrs], 16]\n"
+    "ldr s12, [x21, x27]\n"
+    "fmla v8.4s, v11.4s, v10.4s\n"
+    "fmla v8.4s, v16.4s, v9.4s\n"
+    "beq 6f\n"
+    "5:\n"
+    "fmla v3.4s, v16.4s, v14.4s\n"
+    "ldr x22, [%[inptrs], 56]\n"
+    "fmla v8.4s, v19.4s, v5.4s\n"
+    "ldr x21, [%[inptrs], 24]\n"
+    "fmla v2.4s, v12.4s, v14.4s\n"
+    "ldr s16, [x22, x27]\n"
+    "movi v11.16b, #0\n"
+    "ldr s18, [x21, x27]\n"
+    "fmla v3.4s, v17.4s, v13.4s\n"
+    "ldr x20, [%[inptrs], 160]\n"
+    "fmla v8.4s, v12.4s, v7.4s\n"
+    "ldr x23, [%[inptrs], 128]\n"
+    "fmla v2.4s, v16.4s, v13.4s\n"
+    "ldr s19, [x20, x27]\n"
+    "fmov v12.4s, #6.0\n"
+    "ldr s17, [x23, x27]\n"
+    "fmla v3.4s, v15.4s, v10.4s\n"
+    "ldr x20, [%[inptrs], 96]\n"
+    "fmla v8.4s, v15.4s, v6.4s\n"
+    "ldr x22, [%[inptrs], 64]\n"
+    "fmla v2.4s, v18.4s, v10.4s\n"
+    "ldr s15, [x20, x27]\n"
+    "fmla v4.4s, v15.4s, v14.4s\n"
+    "ldr s18, [x22, x27]\n"
+    "fmla v3.4s, v19.4s, v9.4s\n"
+    "ldr x21, [%[inptrs], 32]\n"
+    "fmla v8.4s, v16.4s, v0.4s\n"
+    "ldr x20, [%[inptrs], 168]\n"
+    "fmla v2.4s, v15.4s, v9.4s\n"
+    "ldr s19, [x21, x27]\n"
+    "ldr s16, [x20, x27]\n"
+    "ldr x23, [%[inptrs], 136]\n"
+    "fmla v3.4s, v17.4s, v5.4s\n"
+    "ldr x20, [%[inptrs], 104]\n"
+    "fmla v8.4s, v15.4s, v1.4s\n"
+    "ldr s14, [x23, x27]\n"
+    "fmla v2.4s, v18.4s, v5.4s\n"
+    "ldr s17, [x20, x27]\n"
+    "fmla v4.4s, v14.4s, v13.4s\n"
+    "ldr x22, [%[inptrs], 72]\n"
+    "fmla v3.4s, v15.4s, v7.4s\n"
+    "ldr x20, [%[inptrs], 176]\n"
+    "fmax v8.4s, v8.4s, v11.4s\n"
+    "ldr s18, [x22, x27]\n"
+    "fmla v2.4s, v19.4s, v7.4s\n"
+    "ldr s13, [x20, x27]\n"
+    "fmla v4.4s, v17.4s, v10.4s\n"
+    "ldr x23, [%[inptrs], 144]\n"
+    "fmla v3.4s, v16.4s, v6.4s\n"
+    "ldr x20, [%[inptrs], 112]\n"
+    "fmin v8.4s, v8.4s, v12.4s\n"
+    "ldr s10, [x23, x27]\n"
+    "fmla v2.4s, v17.4s, v6.4s\n"
+    "ldr s15, [x20, x27]\n"
+    "fmla v4.4s, v13.4s, v9.4s\n"
+    "ldr x20, [%[inptrs], 184]\n"
+    "fmla v3.4s, v14.4s, v0.4s\n"
+    "ldr x23, [%[inptrs], 152]\n"
+    "ldr s9, [x20, x27]\n"
+    "ldr x22, [%[outptrs], 0]\n"
+    "fmla v2.4s, v18.4s, v0.4s\n"
+    "ldr s19, [x23, x27]\n"
+    "str s8, [x22, x28]\n"
+    "fmla v4.4s, v10.4s, v5.4s\n"
+    "fmla v3.4s, v13.4s, v1.4s\n"
+    "ldr x20, [%[inptrs], 192]\n"
+    "ldr x22, [%[outptrs], 8]\n"
+    "ldr x24, [%[outptrs], 16]\n"
+    "add %[wbptr], %[wbptr], #40\n"
+    "fmla v2.4s, v15.4s, v1.4s\n"
+    "ldr s16, [x20, x27]\n"
+    "fmla v4.4s, v15.4s, v7.4s\n"
+    "ldr s15, [%[wbptr]]\n"
+    "fmax v3.4s, v3.4s, v11.4s\n"
+    "ldr s14, [%[wbptr], #4]\n"
+    "mov v8.16b, v15.16b\n"
+    "ldr s10, [%[wbptr], #8]\n"
+    "fmax v2.4s, v2.4s, v11.4s\n"
+    "ldr s13, [%[wbptr], #16]\n"
+    "fmla v4.4s, v9.4s, v6.4s\n"
+    "ldr s7, [%[wbptr], #12]\n"
+    "fmin v3.4s, v3.4s, v12.4s\n"
+    "ldr s5, [%[wbptr], #20]\n"
+    "fmin v2.4s, v2.4s, v12.4s\n"
+    "ldr s9, [%[wbptr], #28]\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "add x27, x27, #4\n"
+    "str s3, [x24, x28]\n"
+    "fmla v4.4s, v19.4s, v0.4s\n"
+    "str s2, [x22, x28]\n"
+    "mov v3.16b, v15.16b\n"
+    "mov v2.16b, v15.16b\n"
+    "ldr s6, [%[wbptr], #32]\n"
+    "ldr x24, [%[outptrs], 24]\n"
+    "ldr x21, [%[inptrs], 0]\n"
+    "ldr x22, [%[inptrs], 40]\n"
+    "fmla v4.4s, v16.4s, v1.4s\n"
+    "ldr s0, [%[wbptr], #24]\n"
+    "ldr s17, [x21, x27]\n"
+    "ldr x20, [%[inptrs], 80]\n"
+    "fmla v8.4s, v17.4s, v14.4s\n"
+    "ldr s18, [x22, x27]\n"
+    "ldr s16, [x20, x27]\n"
+    "ldr x21, [%[inptrs], 8]\n"
+    "fmax v4.4s, v4.4s, v11.4s\n"
+    "ldr s1, [%[wbptr], #36]\n"
+    "ldr s11, [x21, x27]\n"
+    "ldr x22, [%[inptrs], 48]\n"
+    "fmla v8.4s, v18.4s, v13.4s\n"
+    "ldr x21, [%[inptrs], 16]\n"
+    "fmin v4.4s, v4.4s, v12.4s\n"
+    "ldr s19, [x22, x27]\n"
+    "ldr s12, [x21, x27]\n"
+    "ldr x23, [%[inptrs], 120]\n"
+    "ldr x20, [%[inptrs], 88]\n"
+    "subs x26, x26, #1\n"
+    "str s4, [x24, x28]\n"
+    "mov v4.16b, v15.16b\n"
+    "ldr s17, [x23, x27]\n"
+    "fmla v8.4s, v11.4s, v10.4s\n"
+    "ldr s15, [x20, x27]\n"
+    "add x28, x28, #4\n"
+    "fmla v8.4s, v16.4s, v9.4s\n"
+    "bne 5b\n"
+    "6:\n"
+    "fmla v3.4s, v16.4s, v14.4s\n"
+    "ldr x22, [%[inptrs], 56]\n"
+    "fmla v8.4s, v19.4s, v5.4s\n"
+    "ldr x21, [%[inptrs], 24]\n"
+    "fmla v2.4s, v12.4s, v14.4s\n"
+    "ldr s16, [x22, x27]\n"
+    "movi v11.16b, #0\n"
+    "ldr s18, [x21, x27]\n"
+    "fmla v3.4s, v17.4s, v13.4s\n"
+    "ldr x20, [%[inptrs], 160]\n"
+    "fmla v8.4s, v12.4s, v7.4s\n"
+    "ldr x23, [%[inptrs], 128]\n"
+    "fmla v2.4s, v16.4s, v13.4s\n"
+    "ldr s19, [x20, x27]\n"
+    "fmov v12.4s, #6.0\n"
+    "ldr s17, [x23, x27]\n"
+    "fmla v3.4s, v15.4s, v10.4s\n"
+    "ldr x20, [%[inptrs], 96]\n"
+    "fmla v8.4s, v15.4s, v6.4s\n"
+    "ldr x22, [%[inptrs], 64]\n"
+    "fmla v2.4s, v18.4s, v10.4s\n"
+    "ldr s15, [x20, x27]\n"
+    "fmla v4.4s, v15.4s, v14.4s\n"
+    "ldr s18, [x22, x27]\n"
+    "fmla v3.4s, v19.4s, v9.4s\n"
+    "ldr x21, [%[inptrs], 32]\n"
+    "fmla v8.4s, v16.4s, v0.4s\n"
+    "ldr x20, [%[inptrs], 168]\n"
+    "fmla v2.4s, v15.4s, v9.4s\n"
+    "ldr s19, [x21, x27]\n"
+    "ldr s16, [x20, x27]\n"
+    "ldr x23, [%[inptrs], 136]\n"
+    "fmla v3.4s, v17.4s, v5.4s\n"
+    "ldr x20, [%[inptrs], 104]\n"
+    "fmla v8.4s, v15.4s, v1.4s\n"
+    "ldr s14, [x23, x27]\n"
+    "fmla v2.4s, v18.4s, v5.4s\n"
+    "ldr s17, [x20, x27]\n"
+    "fmla v4.4s, v14.4s, v13.4s\n"
+    "ldr x22, [%[inptrs], 72]\n"
+    "fmla v3.4s, v15.4s, v7.4s\n"
+    "ldr x20, [%[inptrs], 176]\n"
+    "fmax v8.4s, v8.4s, v11.4s\n"
+    "ldr s18, [x22, x27]\n"
+    "fmla v2.4s, v19.4s, v7.4s\n"
+    "ldr s13, [x20, x27]\n"
+    "fmla v4.4s, v17.4s, v10.4s\n"
+    "ldr x23, [%[inptrs], 144]\n"
+    "fmla v3.4s, v16.4s, v6.4s\n"
+    "ldr x20, [%[inptrs], 112]\n"
+    "fmin v8.4s, v8.4s, v12.4s\n"
+    "ldr s10, [x23, x27]\n"
+    "fmla v2.4s, v17.4s, v6.4s\n"
+    "ldr s15, [x20, x27]\n"
+    "fmla v4.4s, v13.4s, v9.4s\n"
+    "ldr x20, [%[inptrs], 184]\n"
+    "fmla v3.4s, v14.4s, v0.4s\n"
+    "ldr x23, [%[inptrs], 152]\n"
+    "ldr s9, [x20, x27]\n"
+    "ldr x22, [%[outptrs], 0]\n"
+    "fmla v2.4s, v18.4s, v0.4s\n"
+    "ldr s19, [x23, x27]\n"
+    "str s8, [x22, x28]\n"
+    "fmla v4.4s, v10.4s, v5.4s\n"
+    "fmla v3.4s, v13.4s, v1.4s\n"
+    "ldr x20, [%[inptrs], 192]\n"
+    "ldr x22, [%[outptrs], 8]\n"
+    "ldr x24, [%[outptrs], 16]\n"
+    "add %[wbptr], %[wbptr], #40\n"
+    "fmla v2.4s, v15.4s, v1.4s\n"
+    "ldr s16, [x20, x27]\n"
+    "fmla v4.4s, v15.4s, v7.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmax v3.4s, v3.4s, v11.4s\n"
+    "add x27, x27, #4\n"
+    "fmax v2.4s, v2.4s, v11.4s\n"
+    "fmla v4.4s, v9.4s, v6.4s\n"
+    "fmin v3.4s, v3.4s, v12.4s\n"
+    "fmin v2.4s, v2.4s, v12.4s\n"
+    "str s3, [x24, x28]\n"
+    "fmla v4.4s, v19.4s, v0.4s\n"
+    "str s2, [x22, x28]\n"
+    "ldr x24, [%[outptrs], 24]\n"
+    "fmla v4.4s, v16.4s, v1.4s\n"
+    "fmax v4.4s, v4.4s, v11.4s\n"
+    "fmin v4.4s, v4.4s, v12.4s\n"
+    "str s4, [x24, x28]\n"
+    "add x28, x28, #4\n"
+    "7:\n"
+    : [wbptr] "+r" (weight_bias_ptr)
+    : [inptrs] "r" (inptrs), [outptrs] "r" (outptrs), [n_channels] "r" ((long) n_channels)
+    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+  );
+}
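This second ReLU6 entry point consumes pre-gathered pointer arrays (inptrs /
outptrs) rather than a base pointer plus strides; the "ldr x.., [%[inptrs], N]"
loads index a Base::inner_tile_rows x Base::inner_tile_cols table, which for a
2x2 output tile, 3x3 kernel and stride 2 works out to 5x5 (offsets 0..192 in
steps of 8 bytes). A minimal sketch of building such a table, with hypothetical
variable names:

    // Sketch only: gather one input pointer per inner-tile element.
    const float *inptrs[5][5];
    for (int r = 0; r < 5; ++r)
    {
        for (int c = 0; c < 5; ++c)
        {
            inptrs[r][c] = input + r * input_row_stride + c * input_col_stride;
        }
    }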
 
-template class DepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float>;
+#endif  // __aarch64__
+
+template class DepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float, float>;
+
 }  // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp16_fp16.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp16_fp16.cpp
new file mode 100644
index 0000000..23a99a8
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp16_fp16.cpp
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "impl_fp16_fp16.hpp"
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+namespace depthwise
+{
+template class DepthwiseConvolution<3, 3, 3, 3, 1, 1, float16_t, float16_t, float16_t>;
+}  // namespace depthwise
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
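The new fp16 translation unit compiles only when the compiler advertises
vector half-precision arithmetic, and it instantiates the same kernel template
with float16_t for all three element types. Judging by the file name and the
<3, 3, 3, 3, 1, 1, ...> arguments, the template parameters encode output tile
rows/cols, kernel rows/cols and stride rows/cols, followed by the input, bias
and output element types (the bias type being the parameter added in this
release). A hedged sketch of referring to the instantiation under that
assumption:

    // Assumed parameter order: output tile, kernel, stride, then TIn/TBias/TOut.
    #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    using Conv3x3Fp16 =
        depthwise::DepthwiseConvolution<3, 3, 3, 3, 1, 1,
                                        float16_t, float16_t, float16_t>;
    #endif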
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
index 21e8f04..2508ec7 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,931 +25,2317 @@
 
 namespace depthwise
 {
-using Conv = DepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float>;
-using ConvImpl = DepthwiseConvolutionImpl<3, 3, 3, 3, 1, 1, float, float>;
+
+using namespace neon_convolution_kernels;
+using Conv = DepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float, float>;
 
 #ifdef __aarch64__
-
 template <>
 template <>
-void ConvImpl::process_tile<true, 0, 0, 0, 0, 0, 0>(
-  const int n_channels,
-  const float* const weights,
-  const int weight_row_stride,
-  const int weight_col_stride,
-  const float* const inptr,
-  const int in_row_stride,
-  const int in_col_stride,
-  float* const outptr,
-  const int out_row_stride,
-  const int out_col_stride,
-  const int, const int, const int, const int, const int, const int, const int, const int
+void Conv::execute_tile<ActivationFunction::None>(
+  int n_channels,
+  const void *weight_bias_ptr,
+  const float *input,
+  const unsigned int input_row_stride,
+  const unsigned int input_col_stride,
+  float *output,
+  const unsigned int output_row_stride,
+  const unsigned int output_col_stride
 )
 {
-  // Copy pointers
-  const float *uptr0 = inptr;
-  const float *wptr0 = weights;
-  float *vptr0 = outptr;
+  __asm __volatile(
+    "add x20, %[inptr0], %[input_row_stride]\n"
+    "add x13, %[input_col_stride1], %[input_col_stride1]\n"
+    "add x24, %[outptr0], %[output_row_stride]\n"
+    "add x21, x20, %[input_row_stride]\n"
+    "add x14, x13, #64\n"
+    "add x15, x13, %[input_col_stride1]\n"
+    "add x22, x21, %[input_row_stride]\n"
+    "add x16, x15, #64\n"
+    "add x17, x15, %[input_col_stride1]\n"
+    "add x23, x22, %[input_row_stride]\n"
+    "add x18, x17, #64\n"
+    "add x25, x24, %[output_row_stride]\n"
+    "add x26, %[output_col_stride1], %[output_col_stride1]\n"
+    "and x27, %[n_channels], #3\n"
+    "lsr x28, %[n_channels], #2\n"
+    "cbz x28, 4f\n"
+    "1:\n"
+    "ldr q25, [%[wbptr]]\n"
+    "subs x28, x28, #1\n"
+    "mov v17.16b, v25.16b\n"
+    "ldr q16, [%[wbptr], #16]\n"
+    "mov v13.16b, v25.16b\n"
+    "ldr q7, [%[wbptr], #32]\n"
+    "mov v15.16b, v25.16b\n"
+    "ldr q6, [%[wbptr], #48]\n"
+    "mov v10.16b, v25.16b\n"
+    "ldr q5, [%[wbptr], #64]\n"
+    "mov v12.16b, v25.16b\n"
+    "ldr q4, [%[wbptr], #80]\n"
+    "mov v14.16b, v25.16b\n"
+    "ldr q3, [%[wbptr], #96]\n"
+    "mov v9.16b, v25.16b\n"
+    "ldr q2, [%[wbptr], #112]\n"
+    "mov v11.16b, v25.16b\n"
+    "ldr q1, [%[wbptr], #128]\n"
+    "mov v8.16b, v25.16b\n"
+    "ldr q0, [%[wbptr], #144]\n"
+    "ldr q26, [%[inptr0]]\n"
+    "ldr q28, [x20]\n"
+    "fmla v17.4s, v26.4s, v16.4s\n"
+    "ldr q29, [%[inptr0], %[input_col_stride1]]\n"
+    "fmla v13.4s, v28.4s, v16.4s\n"
+    "ldr q27, [x21]\n"
+    "fmla v15.4s, v29.4s, v16.4s\n"
+    "ldr q21, [x20, %[input_col_stride1]]\n"
+    "fmla v17.4s, v28.4s, v5.4s\n"
+    "ldr q20, [%[inptr0], x13]\n"
+    "ldr q23, [x22]\n"
+    "ldr q19, [x21, %[input_col_stride1]]\n"
+    "prfm pldl1keep, [%[inptr0], #64]\n"
+    "prfm pldl1keep, [x20, #64]\n"
+    "fmla v17.4s, v29.4s, v7.4s\n"
+    "prfm pldl1keep, [%[inptr0], x19]\n"
+    "prfm pldl1keep, [x21, #64]\n"
+    "prfm pldl1keep, [x20, x19]\n"
+    "prfm pldl1keep, [%[inptr0], x14]\n"
+    "prfm pldl1keep, [x22, #64]\n"
+    "prfm pldl1keep, [x21, x19]\n"
+    "beq 3f\n"
+    "2:\n"
+    "fmla v17.4s, v27.4s, v2.4s\n"
+    "ldr q30, [x20, x13]\n"
+    "fmla v13.4s, v27.4s, v5.4s\n"
+    "ldr q29, [%[inptr0], x15]\n"
+    "fmla v10.4s, v27.4s, v16.4s\n"
+    "ldr q28, [x23]\n"
+    "fmla v17.4s, v21.4s, v4.4s\n"
+    "ldr q24, [x22, %[input_col_stride1]]\n"
+    "fmla v13.4s, v21.4s, v7.4s\n"
+    "ldr q18, [x21, x13]\n"
+    "fmla v15.4s, v21.4s, v5.4s\n"
+    "prfm pldl1keep, [x20, x14]\n"
+    "fmla v12.4s, v21.4s, v16.4s\n"
+    "ldr q22, [x20, x15]\n"
+    "fmla v17.4s, v20.4s, v6.4s\n"
+    "prfm pldl1keep, [%[inptr0], x16]\n"
+    "fmla v15.4s, v20.4s, v7.4s\n"
+    "prfm pldl1keep, [x23, #64]\n"
+    "fmla v14.4s, v20.4s, v16.4s\n"
+    "ldr q25, [%[inptr0], x17]\n"
+    "fmla v13.4s, v23.4s, v2.4s\n"
+    "prfm pldl1keep, [x22, x19]\n"
+    "fmla v10.4s, v23.4s, v5.4s\n"
+    "ldr q26, [x23, %[input_col_stride1]]\n"
+    "fmla v17.4s, v19.4s, v1.4s\n"
+    "prfm pldl1keep, [x21, x14]\n"
+    "fmla v13.4s, v19.4s, v4.4s\n"
+    "prfm pldl1keep, [x20, x16]\n"
+    "fmla v15.4s, v19.4s, v2.4s\n"
+    "prfm pldl1keep, [%[inptr0], x18]\n"
+    "fmla v10.4s, v19.4s, v7.4s\n"
+    "prfm pldl1keep, [x23, x19]\n"
+    "fmla v12.4s, v19.4s, v5.4s\n"
+    "prfm pldl1keep, [x22, x14]\n"
+    "fmla v9.4s, v19.4s, v16.4s\n"
+    "ldr q27, [x22, x13]\n"
+    "fmla v17.4s, v30.4s, v3.4s\n"
+    "prfm pldl1keep, [x21, x16]\n"
+    "fmla v13.4s, v30.4s, v6.4s\n"
+    "prfm pldl1keep, [x20, x18]\n"
+    "fmla v15.4s, v30.4s, v4.4s\n"
+    "prfm pldl1keep, [x23, x14]\n"
+    "fmla v12.4s, v30.4s, v7.4s\n"
+    "prfm pldl1keep, [x22, x16]\n"
+    "fmla v14.4s, v30.4s, v5.4s\n"
+    "prfm pldl1keep, [x21, x18]\n"
+    "fmla v11.4s, v30.4s, v16.4s\n"
+    "ldr q21, [x21, x15]\n"
+    "fmla v15.4s, v29.4s, v6.4s\n"
+    "prfm pldl1keep, [x23, x16]\n"
+    "fmla v14.4s, v29.4s, v7.4s\n"
+    "ldr q20, [x20, x17]\n"
+    "fmla v10.4s, v28.4s, v2.4s\n"
+    "ldr q19, [x23, x13]\n"
+    "fmla v13.4s, v24.4s, v1.4s\n"
+    "prfm pldl1keep, [x22, x18]\n"
+    "fmla v12.4s, v24.4s, v2.4s\n"
+    "prfm pldl1keep, [x23, x18]\n"
+    "fmla v10.4s, v24.4s, v4.4s\n"
+    "add %[wbptr], %[wbptr], #160\n"
+    "fmla v9.4s, v24.4s, v5.4s\n"
+    "ldr q23, [x22, x15]\n"
+    "fmla v17.4s, v18.4s, v0.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v13.4s, v18.4s, v3.4s\n"
+    "add %[inptr0], %[inptr0], #16\n"
+    "fmla v15.4s, v18.4s, v1.4s\n"
+    "prfm pldl1keep, [%[inptr0], #64]\n"
+    "str q17, [%[outptr0]]\n"
+    "fmla v10.4s, v18.4s, v6.4s\n"
+    "fmla v12.4s, v18.4s, v4.4s\n"
+    "ldr q17, [x21, x17]\n"
+    "fmla v14.4s, v18.4s, v2.4s\n"
+    "prfm pldl1keep, [%[inptr0], x19]\n"
+    "fmla v9.4s, v18.4s, v7.4s\n"
+    "prfm pldl1keep, [%[inptr0], x14]\n"
+    "fmla v11.4s, v18.4s, v5.4s\n"
+    "add x20, x20, #16\n"
+    "fmla v8.4s, v18.4s, v16.4s\n"
+    "ldr q24, [x23, x15]\n"
+    "fmla v15.4s, v22.4s, v3.4s\n"
+    "ldr q18, [x22, x17]\n"
+    "fmla v12.4s, v22.4s, v6.4s\n"
+    "prfm pldl1keep, [x20, #64]\n"
+    "fmla v14.4s, v22.4s, v4.4s\n"
+    "prfm pldl1keep, [x20, x19]\n"
+    "fmla v11.4s, v22.4s, v7.4s\n"
+    "ldr q22, [x23, x17]\n"
+    "fmla v10.4s, v26.4s, v1.4s\n"
+    "add x21, x21, #16\n"
+    "fmla v14.4s, v25.4s, v6.4s\n"
+    "ldr q25, [%[wbptr]]\n"
+    "fmla v9.4s, v26.4s, v2.4s\n"
+    "ldr q16, [%[wbptr], #16]\n"
+    "fmla v13.4s, v27.4s, v0.4s\n"
+    "prfm pldl1keep, [x21, #64]\n"
+    "fmla v10.4s, v27.4s, v3.4s\n"
+    "prfm pldl1keep, [x21, x19]\n"
+    "fmla v12.4s, v27.4s, v1.4s\n"
+    "add x22, x22, #16\n"
+    "str q13, [x24]\n"
+    "fmla v9.4s, v27.4s, v4.4s\n"
+    "fmla v11.4s, v27.4s, v2.4s\n"
+    "ldr q26, [%[inptr0]]\n"
+    "fmla v8.4s, v27.4s, v5.4s\n"
+    "ldr q28, [x20]\n"
+    "fmla v15.4s, v21.4s, v0.4s\n"
+    "ldr q29, [%[inptr0], %[input_col_stride1]]\n"
+    "fmla v12.4s, v21.4s, v3.4s\n"
+    "prfm pldl1keep, [x22, #64]\n"
+    "fmla v14.4s, v21.4s, v1.4s\n"
+    "add x23, x23, #16\n"
+    "str q15, [%[outptr0], %[output_col_stride1]]\n"
+    "fmla v9.4s, v21.4s, v6.4s\n"
+    "fmla v11.4s, v21.4s, v4.4s\n"
+    "ldr q5, [%[wbptr], #64]\n"
+    "fmla v8.4s, v21.4s, v7.4s\n"
+    "ldr q27, [x21]\n"
+    "fmla v14.4s, v20.4s, v3.4s\n"
+    "ldr q21, [x20, %[input_col_stride1]]\n"
+    "fmla v11.4s, v20.4s, v6.4s\n"
+    "ldr q20, [%[inptr0], x13]\n"
+    "fmla v10.4s, v19.4s, v0.4s\n"
+    "subs x28, x28, #1\n"
+    "fmla v9.4s, v19.4s, v1.4s\n"
+    "fmla v8.4s, v19.4s, v2.4s\n"
+    "fmla v12.4s, v23.4s, v0.4s\n"
+    "ldr q7, [%[wbptr], #32]\n"
+    "str q10, [x25]\n"
+    "fmla v11.4s, v23.4s, v1.4s\n"
+    "fmla v9.4s, v23.4s, v3.4s\n"
+    "ldr q2, [%[wbptr], #112]\n"
+    "str q12, [x24, %[output_col_stride1]]\n"
+    "fmla v8.4s, v23.4s, v4.4s\n"
+    "fmla v14.4s, v17.4s, v0.4s\n"
+    "ldr q23, [x22]\n"
+    "fmla v11.4s, v17.4s, v3.4s\n"
+    "ldr q19, [x21, %[input_col_stride1]]\n"
+    "fmla v8.4s, v17.4s, v6.4s\n"
+    "ldr q4, [%[wbptr], #80]\n"
+    "str q14, [%[outptr0], x26]\n"
+    "fmla v9.4s, v24.4s, v0.4s\n"
+    "fmla v11.4s, v18.4s, v0.4s\n"
+    "add %[outptr0], %[outptr0], #16\n"
+    "fmla v8.4s, v24.4s, v1.4s\n"
+    "ldr q6, [%[wbptr], #48]\n"
+    "str q9, [x25, %[output_col_stride1]]\n"
+    "mov v17.16b, v25.16b\n"
+    "str q11, [x24, x26]\n"
+    "mov v13.16b, v25.16b\n"
+    "fmla v8.4s, v18.4s, v3.4s\n"
+    "ldr q1, [%[wbptr], #128]\n"
+    "mov v15.16b, v25.16b\n"
+    "add x24, x24, #16\n"
+    "mov v10.16b, v25.16b\n"
+    "mov v12.16b, v25.16b\n"
+    "fmla v8.4s, v22.4s, v0.4s\n"
+    "ldr q3, [%[wbptr], #96]\n"
+    "mov v14.16b, v25.16b\n"
+    "mov v9.16b, v25.16b\n"
+    "mov v11.16b, v25.16b\n"
+    "fmla v17.4s, v26.4s, v16.4s\n"
+    "str q8, [x25, x26]\n"
+    "fmla v13.4s, v28.4s, v16.4s\n"
+    "mov v8.16b, v25.16b\n"
+    "ldr q0, [%[wbptr], #144]\n"
+    "fmla v17.4s, v28.4s, v5.4s\n"
+    "fmla v15.4s, v29.4s, v16.4s\n"
+    "add x25, x25, #16\n"
+    "fmla v17.4s, v29.4s, v7.4s\n"
+    "bne 2b\n"
+    "3:\n"
+    "fmla v17.4s, v27.4s, v2.4s\n"
+    "ldr q30, [x20, x13]\n"
+    "fmla v13.4s, v27.4s, v5.4s\n"
+    "ldr q29, [%[inptr0], x15]\n"
+    "fmla v10.4s, v27.4s, v16.4s\n"
+    "ldr q28, [x23]\n"
+    "fmla v17.4s, v21.4s, v4.4s\n"
+    "ldr q24, [x22, %[input_col_stride1]]\n"
+    "fmla v13.4s, v21.4s, v7.4s\n"
+    "ldr q18, [x21, x13]\n"
+    "fmla v15.4s, v21.4s, v5.4s\n"
+    "prfm pldl1keep, [x20, x14]\n"
+    "fmla v12.4s, v21.4s, v16.4s\n"
+    "ldr q22, [x20, x15]\n"
+    "fmla v17.4s, v20.4s, v6.4s\n"
+    "prfm pldl1keep, [%[inptr0], x16]\n"
+    "fmla v15.4s, v20.4s, v7.4s\n"
+    "prfm pldl1keep, [x23, #64]\n"
+    "fmla v14.4s, v20.4s, v16.4s\n"
+    "ldr q25, [%[inptr0], x17]\n"
+    "fmla v13.4s, v23.4s, v2.4s\n"
+    "prfm pldl1keep, [x22, x19]\n"
+    "fmla v10.4s, v23.4s, v5.4s\n"
+    "ldr q26, [x23, %[input_col_stride1]]\n"
+    "fmla v17.4s, v19.4s, v1.4s\n"
+    "prfm pldl1keep, [x21, x14]\n"
+    "fmla v13.4s, v19.4s, v4.4s\n"
+    "prfm pldl1keep, [x20, x16]\n"
+    "fmla v15.4s, v19.4s, v2.4s\n"
+    "prfm pldl1keep, [%[inptr0], x18]\n"
+    "fmla v10.4s, v19.4s, v7.4s\n"
+    "prfm pldl1keep, [x23, x19]\n"
+    "fmla v12.4s, v19.4s, v5.4s\n"
+    "prfm pldl1keep, [x22, x14]\n"
+    "fmla v9.4s, v19.4s, v16.4s\n"
+    "ldr q27, [x22, x13]\n"
+    "fmla v17.4s, v30.4s, v3.4s\n"
+    "prfm pldl1keep, [x21, x16]\n"
+    "fmla v13.4s, v30.4s, v6.4s\n"
+    "prfm pldl1keep, [x20, x18]\n"
+    "fmla v15.4s, v30.4s, v4.4s\n"
+    "prfm pldl1keep, [x23, x14]\n"
+    "fmla v12.4s, v30.4s, v7.4s\n"
+    "prfm pldl1keep, [x22, x16]\n"
+    "fmla v14.4s, v30.4s, v5.4s\n"
+    "prfm pldl1keep, [x21, x18]\n"
+    "fmla v11.4s, v30.4s, v16.4s\n"
+    "ldr q21, [x21, x15]\n"
+    "fmla v15.4s, v29.4s, v6.4s\n"
+    "prfm pldl1keep, [x23, x16]\n"
+    "fmla v14.4s, v29.4s, v7.4s\n"
+    "ldr q20, [x20, x17]\n"
+    "fmla v10.4s, v28.4s, v2.4s\n"
+    "ldr q19, [x23, x13]\n"
+    "fmla v13.4s, v24.4s, v1.4s\n"
+    "prfm pldl1keep, [x22, x18]\n"
+    "fmla v12.4s, v24.4s, v2.4s\n"
+    "prfm pldl1keep, [x23, x18]\n"
+    "fmla v10.4s, v24.4s, v4.4s\n"
+    "add %[wbptr], %[wbptr], #160\n"
+    "fmla v9.4s, v24.4s, v5.4s\n"
+    "ldr q23, [x22, x15]\n"
+    "fmla v17.4s, v18.4s, v0.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v13.4s, v18.4s, v3.4s\n"
+    "add %[inptr0], %[inptr0], #16\n"
+    "fmla v15.4s, v18.4s, v1.4s\n"
+    "add x20, x20, #16\n"
+    "str q17, [%[outptr0]]\n"
+    "fmla v10.4s, v18.4s, v6.4s\n"
+    "fmla v12.4s, v18.4s, v4.4s\n"
+    "ldr q17, [x21, x17]\n"
+    "fmla v14.4s, v18.4s, v2.4s\n"
+    "add x21, x21, #16\n"
+    "fmla v9.4s, v18.4s, v7.4s\n"
+    "fmla v11.4s, v18.4s, v5.4s\n"
+    "fmla v8.4s, v18.4s, v16.4s\n"
+    "ldr q24, [x23, x15]\n"
+    "fmla v15.4s, v22.4s, v3.4s\n"
+    "ldr q18, [x22, x17]\n"
+    "fmla v12.4s, v22.4s, v6.4s\n"
+    "add x22, x22, #16\n"
+    "fmla v14.4s, v22.4s, v4.4s\n"
+    "fmla v11.4s, v22.4s, v7.4s\n"
+    "fmla v10.4s, v26.4s, v1.4s\n"
+    "ldr q22, [x23, x17]\n"
+    "fmla v9.4s, v26.4s, v2.4s\n"
+    "add x23, x23, #16\n"
+    "fmla v14.4s, v25.4s, v6.4s\n"
+    "fmla v13.4s, v27.4s, v0.4s\n"
+    "fmla v10.4s, v27.4s, v3.4s\n"
+    "fmla v12.4s, v27.4s, v1.4s\n"
+    "fmla v9.4s, v27.4s, v4.4s\n"
+    "fmla v11.4s, v27.4s, v2.4s\n"
+    "str q13, [x24]\n"
+    "fmla v8.4s, v27.4s, v5.4s\n"
+    "fmla v15.4s, v21.4s, v0.4s\n"
+    "fmla v12.4s, v21.4s, v3.4s\n"
+    "fmla v14.4s, v21.4s, v1.4s\n"
+    "fmla v9.4s, v21.4s, v6.4s\n"
+    "fmla v11.4s, v21.4s, v4.4s\n"
+    "fmla v8.4s, v21.4s, v7.4s\n"
+    "str q15, [%[outptr0], %[output_col_stride1]]\n"
+    "fmla v10.4s, v19.4s, v0.4s\n"
+    "fmla v14.4s, v20.4s, v3.4s\n"
+    "fmla v9.4s, v19.4s, v1.4s\n"
+    "fmla v11.4s, v20.4s, v6.4s\n"
+    "fmla v8.4s, v19.4s, v2.4s\n"
+    "str q10, [x25]\n"
+    "fmla v12.4s, v23.4s, v0.4s\n"
+    "fmla v9.4s, v23.4s, v3.4s\n"
+    "fmla v14.4s, v17.4s, v0.4s\n"
+    "fmla v11.4s, v23.4s, v1.4s\n"
+    "fmla v8.4s, v23.4s, v4.4s\n"
+    "str q12, [x24, %[output_col_stride1]]\n"
+    "fmla v9.4s, v24.4s, v0.4s\n"
+    "str q14, [%[outptr0], x26]\n"
+    "fmla v11.4s, v17.4s, v3.4s\n"
+    "fmla v8.4s, v17.4s, v6.4s\n"
+    "add %[outptr0], %[outptr0], #16\n"
+    "str q9, [x25, %[output_col_stride1]]\n"
+    "fmla v11.4s, v18.4s, v0.4s\n"
+    "fmla v8.4s, v24.4s, v1.4s\n"
+    "str q11, [x24, x26]\n"
+    "fmla v8.4s, v18.4s, v3.4s\n"
+    "add x24, x24, #16\n"
+    "fmla v8.4s, v22.4s, v0.4s\n"
+    "str q8, [x25, x26]\n"
+    "add x25, x25, #16\n"
+    "4:\n"
+    "cbz x27, 7f\n"
+    "ldr s25, [%[wbptr]]\n"
+    "mov v17.16b, v25.16b\n"
+    "ldr s16, [%[wbptr], #4]\n"
+    "mov v13.16b, v25.16b\n"
+    "ldr s7, [%[wbptr], #8]\n"
+    "mov v15.16b, v25.16b\n"
+    "ldr s6, [%[wbptr], #12]\n"
+    "mov v10.16b, v25.16b\n"
+    "ldr s5, [%[wbptr], #16]\n"
+    "mov v12.16b, v25.16b\n"
+    "ldr s4, [%[wbptr], #20]\n"
+    "mov v14.16b, v25.16b\n"
+    "ldr s3, [%[wbptr], #24]\n"
+    "mov v9.16b, v25.16b\n"
+    "ldr s2, [%[wbptr], #28]\n"
+    "mov v11.16b, v25.16b\n"
+    "ldr s1, [%[wbptr], #32]\n"
+    "mov v8.16b, v25.16b\n"
+    "ldr s0, [%[wbptr], #36]\n"
+    "ldr s26, [%[inptr0]]\n"
+    "subs x27, x27, #1\n"
+    "fmla v17.4s, v26.4s, v16.4s\n"
+    "ldr s28, [x20]\n"
+    "fmla v13.4s, v28.4s, v16.4s\n"
+    "ldr s29, [%[inptr0], %[input_col_stride1]]\n"
+    "fmla v15.4s, v29.4s, v16.4s\n"
+    "ldr s27, [x21]\n"
+    "fmla v17.4s, v28.4s, v5.4s\n"
+    "ldr s21, [x20, %[input_col_stride1]]\n"
+    "ldr s20, [%[inptr0], x13]\n"
+    "ldr s23, [x22]\n"
+    "ldr s19, [x21, %[input_col_stride1]]\n"
+    "prfm pldl1keep, [%[inptr0], #64]\n"
+    "fmla v17.4s, v29.4s, v7.4s\n"
+    "prfm pldl1keep, [x20, #64]\n"
+    "prfm pldl1keep, [%[inptr0], x19]\n"
+    "prfm pldl1keep, [x21, #64]\n"
+    "prfm pldl1keep, [x20, x19]\n"
+    "prfm pldl1keep, [%[inptr0], x14]\n"
+    "prfm pldl1keep, [x22, #64]\n"
+    "prfm pldl1keep, [x21, x19]\n"
+    "beq 6f\n"
+    "5:\n"
+    "fmla v17.4s, v27.4s, v2.4s\n"
+    "ldr s30, [x20, x13]\n"
+    "fmla v13.4s, v27.4s, v5.4s\n"
+    "ldr s29, [%[inptr0], x15]\n"
+    "fmla v10.4s, v27.4s, v16.4s\n"
+    "ldr s28, [x23]\n"
+    "fmla v17.4s, v21.4s, v4.4s\n"
+    "ldr s24, [x22, %[input_col_stride1]]\n"
+    "fmla v13.4s, v21.4s, v7.4s\n"
+    "ldr s18, [x21, x13]\n"
+    "fmla v15.4s, v21.4s, v5.4s\n"
+    "prfm pldl1keep, [x20, x14]\n"
+    "fmla v12.4s, v21.4s, v16.4s\n"
+    "ldr s22, [x20, x15]\n"
+    "fmla v17.4s, v20.4s, v6.4s\n"
+    "prfm pldl1keep, [%[inptr0], x16]\n"
+    "fmla v15.4s, v20.4s, v7.4s\n"
+    "prfm pldl1keep, [x23, #64]\n"
+    "fmla v14.4s, v20.4s, v16.4s\n"
+    "ldr s25, [%[inptr0], x17]\n"
+    "fmla v13.4s, v23.4s, v2.4s\n"
+    "prfm pldl1keep, [x22, x19]\n"
+    "fmla v10.4s, v23.4s, v5.4s\n"
+    "ldr s26, [x23, %[input_col_stride1]]\n"
+    "fmla v17.4s, v19.4s, v1.4s\n"
+    "prfm pldl1keep, [x21, x14]\n"
+    "fmla v13.4s, v19.4s, v4.4s\n"
+    "prfm pldl1keep, [x20, x16]\n"
+    "fmla v15.4s, v19.4s, v2.4s\n"
+    "prfm pldl1keep, [%[inptr0], x18]\n"
+    "fmla v10.4s, v19.4s, v7.4s\n"
+    "prfm pldl1keep, [x23, x19]\n"
+    "fmla v12.4s, v19.4s, v5.4s\n"
+    "prfm pldl1keep, [x22, x14]\n"
+    "fmla v9.4s, v19.4s, v16.4s\n"
+    "ldr s27, [x22, x13]\n"
+    "fmla v17.4s, v30.4s, v3.4s\n"
+    "prfm pldl1keep, [x21, x16]\n"
+    "fmla v13.4s, v30.4s, v6.4s\n"
+    "prfm pldl1keep, [x20, x18]\n"
+    "fmla v15.4s, v30.4s, v4.4s\n"
+    "prfm pldl1keep, [x23, x14]\n"
+    "fmla v12.4s, v30.4s, v7.4s\n"
+    "prfm pldl1keep, [x22, x16]\n"
+    "fmla v14.4s, v30.4s, v5.4s\n"
+    "prfm pldl1keep, [x21, x18]\n"
+    "fmla v11.4s, v30.4s, v16.4s\n"
+    "ldr s21, [x21, x15]\n"
+    "fmla v15.4s, v29.4s, v6.4s\n"
+    "prfm pldl1keep, [x23, x16]\n"
+    "fmla v14.4s, v29.4s, v7.4s\n"
+    "ldr s20, [x20, x17]\n"
+    "fmla v10.4s, v28.4s, v2.4s\n"
+    "ldr s19, [x23, x13]\n"
+    "fmla v13.4s, v24.4s, v1.4s\n"
+    "prfm pldl1keep, [x22, x18]\n"
+    "fmla v12.4s, v24.4s, v2.4s\n"
+    "prfm pldl1keep, [x23, x18]\n"
+    "fmla v10.4s, v24.4s, v4.4s\n"
+    "add %[wbptr], %[wbptr], #40\n"
+    "fmla v9.4s, v24.4s, v5.4s\n"
+    "ldr s23, [x22, x15]\n"
+    "fmla v17.4s, v18.4s, v0.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v13.4s, v18.4s, v3.4s\n"
+    "add %[inptr0], %[inptr0], #4\n"
+    "fmla v15.4s, v18.4s, v1.4s\n"
+    "prfm pldl1keep, [%[inptr0], #64]\n"
+    "str s17, [%[outptr0]]\n"
+    "fmla v10.4s, v18.4s, v6.4s\n"
+    "fmla v12.4s, v18.4s, v4.4s\n"
+    "ldr s17, [x21, x17]\n"
+    "fmla v14.4s, v18.4s, v2.4s\n"
+    "prfm pldl1keep, [%[inptr0], x19]\n"
+    "fmla v9.4s, v18.4s, v7.4s\n"
+    "prfm pldl1keep, [%[inptr0], x14]\n"
+    "fmla v11.4s, v18.4s, v5.4s\n"
+    "add x20, x20, #4\n"
+    "fmla v8.4s, v18.4s, v16.4s\n"
+    "ldr s24, [x23, x15]\n"
+    "fmla v15.4s, v22.4s, v3.4s\n"
+    "ldr s18, [x22, x17]\n"
+    "fmla v12.4s, v22.4s, v6.4s\n"
+    "prfm pldl1keep, [x20, #64]\n"
+    "fmla v14.4s, v22.4s, v4.4s\n"
+    "prfm pldl1keep, [x20, x19]\n"
+    "fmla v11.4s, v22.4s, v7.4s\n"
+    "ldr s22, [x23, x17]\n"
+    "fmla v10.4s, v26.4s, v1.4s\n"
+    "add x21, x21, #4\n"
+    "fmla v14.4s, v25.4s, v6.4s\n"
+    "ldr s25, [%[wbptr]]\n"
+    "fmla v9.4s, v26.4s, v2.4s\n"
+    "ldr s16, [%[wbptr], #4]\n"
+    "fmla v13.4s, v27.4s, v0.4s\n"
+    "prfm pldl1keep, [x21, #64]\n"
+    "fmla v10.4s, v27.4s, v3.4s\n"
+    "prfm pldl1keep, [x21, x19]\n"
+    "fmla v12.4s, v27.4s, v1.4s\n"
+    "add x22, x22, #4\n"
+    "str s13, [x24]\n"
+    "fmla v9.4s, v27.4s, v4.4s\n"
+    "fmla v11.4s, v27.4s, v2.4s\n"
+    "ldr s26, [%[inptr0]]\n"
+    "fmla v8.4s, v27.4s, v5.4s\n"
+    "ldr s28, [x20]\n"
+    "fmla v15.4s, v21.4s, v0.4s\n"
+    "ldr s29, [%[inptr0], %[input_col_stride1]]\n"
+    "fmla v12.4s, v21.4s, v3.4s\n"
+    "prfm pldl1keep, [x22, #64]\n"
+    "fmla v14.4s, v21.4s, v1.4s\n"
+    "add x23, x23, #4\n"
+    "str s15, [%[outptr0], %[output_col_stride1]]\n"
+    "fmla v9.4s, v21.4s, v6.4s\n"
+    "fmla v11.4s, v21.4s, v4.4s\n"
+    "ldr s5, [%[wbptr], #16]\n"
+    "fmla v8.4s, v21.4s, v7.4s\n"
+    "ldr s27, [x21]\n"
+    "fmla v14.4s, v20.4s, v3.4s\n"
+    "ldr s21, [x20, %[input_col_stride1]]\n"
+    "fmla v11.4s, v20.4s, v6.4s\n"
+    "ldr s20, [%[inptr0], x13]\n"
+    "fmla v10.4s, v19.4s, v0.4s\n"
+    "subs x27, x27, #1\n"
+    "fmla v9.4s, v19.4s, v1.4s\n"
+    "fmla v8.4s, v19.4s, v2.4s\n"
+    "fmla v12.4s, v23.4s, v0.4s\n"
+    "ldr s7, [%[wbptr], #8]\n"
+    "str s10, [x25]\n"
+    "fmla v11.4s, v23.4s, v1.4s\n"
+    "fmla v9.4s, v23.4s, v3.4s\n"
+    "ldr s2, [%[wbptr], #28]\n"
+    "str s12, [x24, %[output_col_stride1]]\n"
+    "fmla v8.4s, v23.4s, v4.4s\n"
+    "fmla v14.4s, v17.4s, v0.4s\n"
+    "ldr s23, [x22]\n"
+    "fmla v11.4s, v17.4s, v3.4s\n"
+    "ldr s19, [x21, %[input_col_stride1]]\n"
+    "fmla v8.4s, v17.4s, v6.4s\n"
+    "ldr s4, [%[wbptr], #20]\n"
+    "str s14, [%[outptr0], x26]\n"
+    "fmla v9.4s, v24.4s, v0.4s\n"
+    "fmla v11.4s, v18.4s, v0.4s\n"
+    "add %[outptr0], %[outptr0], #4\n"
+    "fmla v8.4s, v24.4s, v1.4s\n"
+    "ldr s6, [%[wbptr], #12]\n"
+    "str s9, [x25, %[output_col_stride1]]\n"
+    "mov v17.16b, v25.16b\n"
+    "str s11, [x24, x26]\n"
+    "mov v13.16b, v25.16b\n"
+    "fmla v8.4s, v18.4s, v3.4s\n"
+    "ldr s1, [%[wbptr], #32]\n"
+    "mov v15.16b, v25.16b\n"
+    "add x24, x24, #4\n"
+    "mov v10.16b, v25.16b\n"
+    "mov v12.16b, v25.16b\n"
+    "fmla v8.4s, v22.4s, v0.4s\n"
+    "ldr s3, [%[wbptr], #24]\n"
+    "mov v14.16b, v25.16b\n"
+    "mov v9.16b, v25.16b\n"
+    "mov v11.16b, v25.16b\n"
+    "fmla v17.4s, v26.4s, v16.4s\n"
+    "str s8, [x25, x26]\n"
+    "fmla v13.4s, v28.4s, v16.4s\n"
+    "mov v8.16b, v25.16b\n"
+    "ldr s0, [%[wbptr], #36]\n"
+    "fmla v17.4s, v28.4s, v5.4s\n"
+    "fmla v15.4s, v29.4s, v16.4s\n"
+    "add x25, x25, #4\n"
+    "fmla v17.4s, v29.4s, v7.4s\n"
+    "bne 5b\n"
+    "6:\n"
+    "fmla v17.4s, v27.4s, v2.4s\n"
+    "ldr s30, [x20, x13]\n"
+    "fmla v13.4s, v27.4s, v5.4s\n"
+    "ldr s29, [%[inptr0], x15]\n"
+    "fmla v10.4s, v27.4s, v16.4s\n"
+    "ldr s28, [x23]\n"
+    "fmla v17.4s, v21.4s, v4.4s\n"
+    "ldr s24, [x22, %[input_col_stride1]]\n"
+    "fmla v13.4s, v21.4s, v7.4s\n"
+    "ldr s18, [x21, x13]\n"
+    "fmla v15.4s, v21.4s, v5.4s\n"
+    "prfm pldl1keep, [x20, x14]\n"
+    "fmla v12.4s, v21.4s, v16.4s\n"
+    "ldr s22, [x20, x15]\n"
+    "fmla v17.4s, v20.4s, v6.4s\n"
+    "prfm pldl1keep, [%[inptr0], x16]\n"
+    "fmla v15.4s, v20.4s, v7.4s\n"
+    "prfm pldl1keep, [x23, #64]\n"
+    "fmla v14.4s, v20.4s, v16.4s\n"
+    "ldr s25, [%[inptr0], x17]\n"
+    "fmla v13.4s, v23.4s, v2.4s\n"
+    "prfm pldl1keep, [x22, x19]\n"
+    "fmla v10.4s, v23.4s, v5.4s\n"
+    "ldr s26, [x23, %[input_col_stride1]]\n"
+    "fmla v17.4s, v19.4s, v1.4s\n"
+    "prfm pldl1keep, [x21, x14]\n"
+    "fmla v13.4s, v19.4s, v4.4s\n"
+    "prfm pldl1keep, [x20, x16]\n"
+    "fmla v15.4s, v19.4s, v2.4s\n"
+    "prfm pldl1keep, [%[inptr0], x18]\n"
+    "fmla v10.4s, v19.4s, v7.4s\n"
+    "prfm pldl1keep, [x23, x19]\n"
+    "fmla v12.4s, v19.4s, v5.4s\n"
+    "prfm pldl1keep, [x22, x14]\n"
+    "fmla v9.4s, v19.4s, v16.4s\n"
+    "ldr s27, [x22, x13]\n"
+    "fmla v17.4s, v30.4s, v3.4s\n"
+    "prfm pldl1keep, [x21, x16]\n"
+    "fmla v13.4s, v30.4s, v6.4s\n"
+    "prfm pldl1keep, [x20, x18]\n"
+    "fmla v15.4s, v30.4s, v4.4s\n"
+    "prfm pldl1keep, [x23, x14]\n"
+    "fmla v12.4s, v30.4s, v7.4s\n"
+    "prfm pldl1keep, [x22, x16]\n"
+    "fmla v14.4s, v30.4s, v5.4s\n"
+    "prfm pldl1keep, [x21, x18]\n"
+    "fmla v11.4s, v30.4s, v16.4s\n"
+    "ldr s21, [x21, x15]\n"
+    "fmla v15.4s, v29.4s, v6.4s\n"
+    "prfm pldl1keep, [x23, x16]\n"
+    "fmla v14.4s, v29.4s, v7.4s\n"
+    "ldr s20, [x20, x17]\n"
+    "fmla v10.4s, v28.4s, v2.4s\n"
+    "ldr s19, [x23, x13]\n"
+    "fmla v13.4s, v24.4s, v1.4s\n"
+    "prfm pldl1keep, [x22, x18]\n"
+    "fmla v12.4s, v24.4s, v2.4s\n"
+    "prfm pldl1keep, [x23, x18]\n"
+    "fmla v10.4s, v24.4s, v4.4s\n"
+    "add %[wbptr], %[wbptr], #40\n"
+    "fmla v9.4s, v24.4s, v5.4s\n"
+    "ldr s23, [x22, x15]\n"
+    "fmla v17.4s, v18.4s, v0.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v13.4s, v18.4s, v3.4s\n"
+    "add %[inptr0], %[inptr0], #4\n"
+    "fmla v15.4s, v18.4s, v1.4s\n"
+    "add x20, x20, #4\n"
+    "str s17, [%[outptr0]]\n"
+    "fmla v10.4s, v18.4s, v6.4s\n"
+    "fmla v12.4s, v18.4s, v4.4s\n"
+    "ldr s17, [x21, x17]\n"
+    "fmla v14.4s, v18.4s, v2.4s\n"
+    "add x21, x21, #4\n"
+    "fmla v9.4s, v18.4s, v7.4s\n"
+    "fmla v11.4s, v18.4s, v5.4s\n"
+    "fmla v8.4s, v18.4s, v16.4s\n"
+    "ldr s24, [x23, x15]\n"
+    "fmla v15.4s, v22.4s, v3.4s\n"
+    "ldr s18, [x22, x17]\n"
+    "fmla v12.4s, v22.4s, v6.4s\n"
+    "add x22, x22, #4\n"
+    "fmla v14.4s, v22.4s, v4.4s\n"
+    "fmla v11.4s, v22.4s, v7.4s\n"
+    "fmla v10.4s, v26.4s, v1.4s\n"
+    "ldr s22, [x23, x17]\n"
+    "fmla v9.4s, v26.4s, v2.4s\n"
+    "add x23, x23, #4\n"
+    "fmla v14.4s, v25.4s, v6.4s\n"
+    "fmla v13.4s, v27.4s, v0.4s\n"
+    "fmla v10.4s, v27.4s, v3.4s\n"
+    "fmla v12.4s, v27.4s, v1.4s\n"
+    "fmla v9.4s, v27.4s, v4.4s\n"
+    "fmla v11.4s, v27.4s, v2.4s\n"
+    "str s13, [x24]\n"
+    "fmla v8.4s, v27.4s, v5.4s\n"
+    "fmla v15.4s, v21.4s, v0.4s\n"
+    "fmla v12.4s, v21.4s, v3.4s\n"
+    "fmla v14.4s, v21.4s, v1.4s\n"
+    "fmla v9.4s, v21.4s, v6.4s\n"
+    "fmla v11.4s, v21.4s, v4.4s\n"
+    "fmla v8.4s, v21.4s, v7.4s\n"
+    "str s15, [%[outptr0], %[output_col_stride1]]\n"
+    "fmla v10.4s, v19.4s, v0.4s\n"
+    "fmla v14.4s, v20.4s, v3.4s\n"
+    "fmla v9.4s, v19.4s, v1.4s\n"
+    "fmla v11.4s, v20.4s, v6.4s\n"
+    "fmla v8.4s, v19.4s, v2.4s\n"
+    "str s10, [x25]\n"
+    "fmla v12.4s, v23.4s, v0.4s\n"
+    "fmla v9.4s, v23.4s, v3.4s\n"
+    "fmla v14.4s, v17.4s, v0.4s\n"
+    "fmla v11.4s, v23.4s, v1.4s\n"
+    "fmla v8.4s, v23.4s, v4.4s\n"
+    "str s12, [x24, %[output_col_stride1]]\n"
+    "fmla v9.4s, v24.4s, v0.4s\n"
+    "str s14, [%[outptr0], x26]\n"
+    "fmla v11.4s, v17.4s, v3.4s\n"
+    "fmla v8.4s, v17.4s, v6.4s\n"
+    "add %[outptr0], %[outptr0], #4\n"
+    "str s9, [x25, %[output_col_stride1]]\n"
+    "fmla v11.4s, v18.4s, v0.4s\n"
+    "fmla v8.4s, v24.4s, v1.4s\n"
+    "str s11, [x24, x26]\n"
+    "fmla v8.4s, v18.4s, v3.4s\n"
+    "add x24, x24, #4\n"
+    "fmla v8.4s, v22.4s, v0.4s\n"
+    "str s8, [x25, x26]\n"
+    "add x25, x25, #4\n"
+    "7:\n"
+    : [outptr0] "+r" (output), [inptr0] "+r" (input), [wbptr] "+r" (weight_bias_ptr)
+    : [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float))
+    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+  );
+}
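All of these rewritten execute_tile bodies share one control structure: the
channel count is split into four-wide vector blocks ("lsr x28, %[n_channels],
#2", processed with 128-bit q-register loads and stores) and a scalar tail
("and x27, %[n_channels], #3", processed with 32-bit s-register accesses). A
minimal sketch of that split, with hypothetical helper names standing in for
the vector and scalar bodies:

    // Sketch only: process_four_channels / process_one_channel are placeholders.
    static void process_four_channels() { /* ldr q / fmla / str q path */ }
    static void process_one_channel()   { /* ldr s / fmla / str s path */ }

    static void execute_tile_sketch(int n_channels)
    {
        const int vector_blocks = n_channels >> 2; // lsr x28, %[n_channels], #2
        const int scalar_tail   = n_channels & 3;  // and x27, %[n_channels], #3
        for (int b = 0; b < vector_blocks; ++b)
            process_four_channels();
        for (int c = 0; c < scalar_tail; ++c)
            process_one_channel();
    }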
 
-  int channels_remaining = n_channels;
-  if (channels_remaining >= 4)
-  {
-    // Process blocks of 4 channels at a time
-    int n_iters = ((channels_remaining / 4) + 1)/2 - 1;
-    const bool odd_tail = (channels_remaining / 4) & 1;
-    channels_remaining %= 4;
+template <>
+template <>
+void Conv::execute_tile<ActivationFunction::ReLU>(
+  int n_channels,
+  const void *weight_bias_ptr,
+  const float *input,
+  const unsigned int input_row_stride,
+  const unsigned int input_col_stride,
+  float *output,
+  const unsigned int output_row_stride,
+  const unsigned int output_col_stride
+)
+{
+  __asm __volatile(
+    "add x25, %[inptr0], %[input_row_stride]\n"
+    "add x16, %[input_col_stride1], %[input_col_stride1]\n"
+    "add x21, %[outptr0], %[output_row_stride]\n"
+    "add x22, x25, %[input_row_stride]\n"
+    "add x23, x16, #64\n"
+    "add x26, x16, %[input_col_stride1]\n"
+    "add x13, x22, %[input_row_stride]\n"
+    "add x20, x26, #64\n"
+    "add x18, x26, %[input_col_stride1]\n"
+    "add x24, x13, %[input_row_stride]\n"
+    "add x15, x18, #64\n"
+    "add x14, x21, %[output_row_stride]\n"
+    "add x19, %[output_col_stride1], %[output_col_stride1]\n"
+    "and x27, %[n_channels], #3\n"
+    "lsr x28, %[n_channels], #2\n"
+    "cbz x28, 4f\n"
+    "1:\n"
+    "ldr q20, [%[wbptr]]\n"
+    "subs x28, x28, #1\n"
+    "mov v4.16b, v20.16b\n"
+    "ldr q15, [%[wbptr], #16]\n"
+    "mov v1.16b, v20.16b\n"
+    "ldr q0, [%[wbptr], #32]\n"
+    "mov v3.16b, v20.16b\n"
+    "ldr q13, [%[wbptr], #48]\n"
+    "mov v7.16b, v20.16b\n"
+    "ldr q16, [%[wbptr], #64]\n"
+    "mov v9.16b, v20.16b\n"
+    "ldr q12, [%[wbptr], #80]\n"
+    "mov v2.16b, v20.16b\n"
+    "ldr q17, [%[wbptr], #96]\n"
+    "mov v6.16b, v20.16b\n"
+    "ldr q11, [%[wbptr], #112]\n"
+    "mov v8.16b, v20.16b\n"
+    "ldr q10, [%[wbptr], #128]\n"
+    "mov v5.16b, v20.16b\n"
+    "ldr q14, [%[wbptr], #144]\n"
+    "ldr q27, [%[inptr0]]\n"
+    "ldr q24, [x25]\n"
+    "fmla v4.4s, v27.4s, v15.4s\n"
+    "ldr q22, [%[inptr0], %[input_col_stride1]]\n"
+    "ldr q21, [x22]\n"
+    "ldr q19, [x25, %[input_col_stride1]]\n"
+    "ldr q31, [%[inptr0], x16]\n"
+    "ldr q28, [x13]\n"
+    "fmla v4.4s, v24.4s, v16.4s\n"
+    "ldr q18, [x22, %[input_col_stride1]]\n"
+    "prfm pldl1keep, [%[inptr0], #64]\n"
+    "prfm pldl1keep, [x25, #64]\n"
+    "prfm pldl1keep, [%[inptr0], x17]\n"
+    "prfm pldl1keep, [x22, #64]\n"
+    "prfm pldl1keep, [x25, x17]\n"
+    "prfm pldl1keep, [%[inptr0], x23]\n"
+    "prfm pldl1keep, [x13, #64]\n"
+    "prfm pldl1keep, [x22, x17]\n"
+    "beq 3f\n"
+    "2:\n"
+    "fmla v1.4s, v24.4s, v15.4s\n"
+    "ldr q24, [x25, x16]\n"
+    "fmla v4.4s, v22.4s, v0.4s\n"
+    "ldr q29, [%[inptr0], x26]\n"
+    "fmla v3.4s, v22.4s, v15.4s\n"
+    "ldr q30, [x24]\n"
+    "fmla v1.4s, v21.4s, v16.4s\n"
+    "ldr q25, [x13, %[input_col_stride1]]\n"
+    "fmla v4.4s, v21.4s, v11.4s\n"
+    "prfm pldl1keep, [x25, x23]\n"
+    "fmla v7.4s, v21.4s, v15.4s\n"
+    "ldr q26, [x22, x16]\n"
+    "fmla v1.4s, v19.4s, v0.4s\n"
+    "prfm pldl1keep, [%[inptr0], x20]\n"
+    "fmla v4.4s, v19.4s, v12.4s\n"
+    "prfm pldl1keep, [x24, #64]\n"
+    "fmla v3.4s, v19.4s, v16.4s\n"
+    "prfm pldl1keep, [x13, x17]\n"
+    "fmla v9.4s, v19.4s, v15.4s\n"
+    "ldr q23, [x25, x26]\n"
+    "fmla v4.4s, v31.4s, v13.4s\n"
+    "prfm pldl1keep, [x22, x23]\n"
+    "fmla v3.4s, v31.4s, v0.4s\n"
+    "prfm pldl1keep, [x25, x20]\n"
+    "fmla v2.4s, v31.4s, v15.4s\n"
+    "ldr q20, [%[inptr0], x18]\n"
+    "fmla v1.4s, v28.4s, v11.4s\n"
+    "prfm pldl1keep, [%[inptr0], x15]\n"
+    "fmla v7.4s, v28.4s, v16.4s\n"
+    "ldr q28, [x24, %[input_col_stride1]]\n"
+    "fmla v4.4s, v18.4s, v10.4s\n"
+    "prfm pldl1keep, [x24, x17]\n"
+    "fmla v1.4s, v18.4s, v12.4s\n"
+    "prfm pldl1keep, [x13, x23]\n"
+    "fmla v3.4s, v18.4s, v11.4s\n"
+    "prfm pldl1keep, [x22, x20]\n"
+    "fmla v7.4s, v18.4s, v0.4s\n"
+    "prfm pldl1keep, [x25, x15]\n"
+    "fmla v9.4s, v18.4s, v16.4s\n"
+    "prfm pldl1keep, [x24, x23]\n"
+    "fmla v6.4s, v18.4s, v15.4s\n"
+    "ldr q27, [x13, x16]\n"
+    "fmla v4.4s, v24.4s, v17.4s\n"
+    "prfm pldl1keep, [x13, x20]\n"
+    "fmla v1.4s, v24.4s, v13.4s\n"
+    "prfm pldl1keep, [x22, x15]\n"
+    "fmla v3.4s, v24.4s, v12.4s\n"
+    "prfm pldl1keep, [x24, x20]\n"
+    "fmla v9.4s, v24.4s, v0.4s\n"
+    "prfm pldl1keep, [x13, x15]\n"
+    "fmla v2.4s, v24.4s, v16.4s\n"
+    "prfm pldl1keep, [x24, x15]\n"
+    "fmla v8.4s, v24.4s, v15.4s\n"
+    "ldr q24, [x22, x26]\n"
+    "fmla v3.4s, v29.4s, v13.4s\n"
+    "add %[wbptr], %[wbptr], #160\n"
+    "fmla v2.4s, v29.4s, v0.4s\n"
+    "ldr q22, [x25, x18]\n"
+    "fmla v7.4s, v30.4s, v11.4s\n"
+    "ldr q21, [x24, x16]\n"
+    "fmla v1.4s, v25.4s, v10.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v9.4s, v25.4s, v11.4s\n"
+    "add %[inptr0], %[inptr0], #16\n"
+    "fmla v7.4s, v25.4s, v12.4s\n"
+    "prfm pldl1keep, [%[inptr0], #64]\n"
+    "fmla v6.4s, v25.4s, v16.4s\n"
+    "ldr q19, [x13, x26]\n"
+    "fmla v4.4s, v26.4s, v14.4s\n"
+    "prfm pldl1keep, [%[inptr0], x17]\n"
+    "fmla v1.4s, v26.4s, v17.4s\n"
+    "prfm pldl1keep, [%[inptr0], x23]\n"
+    "fmla v3.4s, v26.4s, v10.4s\n"
+    "add x25, x25, #16\n"
+    "fmla v7.4s, v26.4s, v13.4s\n"
+    "prfm pldl1keep, [x25, #64]\n"
+    "fmla v9.4s, v26.4s, v12.4s\n"
+    "prfm pldl1keep, [x25, x17]\n"
+    "fmla v2.4s, v26.4s, v11.4s\n"
+    "subs x28, x28, #1\n"
+    "fmla v6.4s, v26.4s, v0.4s\n"
+    "fmla v8.4s, v26.4s, v16.4s\n"
+    "fmla v5.4s, v26.4s, v15.4s\n"
+    "ldr q26, [x22, x18]\n"
+    "fmla v3.4s, v23.4s, v17.4s\n"
+    "ldr q18, [x24, x26]\n"
+    "fmla v9.4s, v23.4s, v13.4s\n"
+    "add x22, x22, #16\n"
+    "fmla v2.4s, v23.4s, v12.4s\n"
+    "prfm pldl1keep, [x22, #64]\n"
+    "fmla v8.4s, v23.4s, v0.4s\n"
+    "ldr q23, [x13, x18]\n"
+    "fmla v7.4s, v28.4s, v10.4s\n"
+    "prfm pldl1keep, [x22, x17]\n"
+    "fmla v2.4s, v20.4s, v13.4s\n"
+    "ldr q25, [x24, x18]\n"
+    "fmla v6.4s, v28.4s, v11.4s\n"
+    "ldr q20, [%[wbptr]]\n"
+    "fmla v1.4s, v27.4s, v14.4s\n"
+    "add x13, x13, #16\n"
+    "fmla v7.4s, v27.4s, v17.4s\n"
+    "prfm pldl1keep, [x13, #64]\n"
+    "fmla v9.4s, v27.4s, v10.4s\n"
+    "add x24, x24, #16\n"
+    "fmla v6.4s, v27.4s, v12.4s\n"
+    "fmla v8.4s, v27.4s, v11.4s\n"
+    "fmla v5.4s, v27.4s, v16.4s\n"
+    "ldr q15, [%[wbptr], #16]\n"
+    "fmla v3.4s, v24.4s, v14.4s\n"
+    "ldr q27, [%[inptr0]]\n"
+    "fmla v9.4s, v24.4s, v17.4s\n"
+    "fmla v2.4s, v24.4s, v10.4s\n"
+    "fmla v6.4s, v24.4s, v13.4s\n"
+    "fmla v8.4s, v24.4s, v12.4s\n"
+    "fmla v5.4s, v24.4s, v0.4s\n"
+    "ldr q16, [%[wbptr], #64]\n"
+    "fmla v2.4s, v22.4s, v17.4s\n"
+    "ldr q24, [x25]\n"
+    "fmla v8.4s, v22.4s, v13.4s\n"
+    "ldr q22, [%[inptr0], %[input_col_stride1]]\n"
+    "fmla v7.4s, v21.4s, v14.4s\n"
+    "fmla v6.4s, v21.4s, v10.4s\n"
+    "fmla v5.4s, v21.4s, v11.4s\n"
+    "ldr q0, [%[wbptr], #32]\n"
+    "fmla v9.4s, v19.4s, v14.4s\n"
+    "ldr q21, [x22]\n"
+    "fmla v6.4s, v19.4s, v17.4s\n"
+    "fmla v8.4s, v19.4s, v10.4s\n"
+    "fmla v5.4s, v19.4s, v12.4s\n"
+    "ldr q11, [%[wbptr], #112]\n"
+    "fmla v2.4s, v26.4s, v14.4s\n"
+    "movi v29.16b, #0\n"
+    "fmla v8.4s, v26.4s, v17.4s\n"
+    "fmla v6.4s, v18.4s, v14.4s\n"
+    "fmla v5.4s, v26.4s, v13.4s\n"
+    "ldr q12, [%[wbptr], #80]\n"
+    "fmax v4.4s, v4.4s, v29.4s\n"
+    "ldr q19, [x25, %[input_col_stride1]]\n"
+    "fmla v8.4s, v23.4s, v14.4s\n"
+    "fmax v3.4s, v3.4s, v29.4s\n"
+    "str q4, [%[outptr0]]\n"
+    "fmla v5.4s, v18.4s, v10.4s\n"
+    "str q3, [%[outptr0], %[output_col_stride1]]\n"
+    "fmax v2.4s, v2.4s, v29.4s\n"
+    "fmax v1.4s, v1.4s, v29.4s\n"
+    "ldr q13, [%[wbptr], #48]\n"
+    "str q2, [%[outptr0], x19]\n"
+    "fmla v5.4s, v23.4s, v17.4s\n"
+    "str q1, [x21]\n"
+    "fmax v9.4s, v9.4s, v29.4s\n"
+    "fmax v8.4s, v8.4s, v29.4s\n"
+    "ldr q10, [%[wbptr], #128]\n"
+    "str q9, [x21, %[output_col_stride1]]\n"
+    "fmla v5.4s, v25.4s, v14.4s\n"
+    "str q8, [x21, x19]\n"
+    "fmax v7.4s, v7.4s, v29.4s\n"
+    "fmax v6.4s, v6.4s, v29.4s\n"
+    "ldr q17, [%[wbptr], #96]\n"
+    "str q7, [x14]\n"
+    "fmax v5.4s, v5.4s, v29.4s\n"
+    "str q6, [x14, %[output_col_stride1]]\n"
+    "mov v4.16b, v20.16b\n"
+    "str q5, [x14, x19]\n"
+    "mov v1.16b, v20.16b\n"
+    "mov v3.16b, v20.16b\n"
+    "ldr q14, [%[wbptr], #144]\n"
+    "mov v7.16b, v20.16b\n"
+    "ldr q31, [%[inptr0], x16]\n"
+    "mov v9.16b, v20.16b\n"
+    "ldr q28, [x13]\n"
+    "mov v2.16b, v20.16b\n"
+    "ldr q18, [x22, %[input_col_stride1]]\n"
+    "mov v6.16b, v20.16b\n"
+    "add %[outptr0], %[outptr0], #16\n"
+    "mov v8.16b, v20.16b\n"
+    "add x21, x21, #16\n"
+    "mov v5.16b, v20.16b\n"
+    "add x14, x14, #16\n"
+    "fmla v4.4s, v27.4s, v15.4s\n"
+    "fmla v4.4s, v24.4s, v16.4s\n"
+    "bne 2b\n"
+    "3:\n"
+    "fmla v1.4s, v24.4s, v15.4s\n"
+    "ldr q24, [x25, x16]\n"
+    "fmla v4.4s, v22.4s, v0.4s\n"
+    "ldr q29, [%[inptr0], x26]\n"
+    "fmla v3.4s, v22.4s, v15.4s\n"
+    "ldr q30, [x24]\n"
+    "fmla v1.4s, v21.4s, v16.4s\n"
+    "ldr q25, [x13, %[input_col_stride1]]\n"
+    "fmla v4.4s, v21.4s, v11.4s\n"
+    "prfm pldl1keep, [x25, x23]\n"
+    "fmla v7.4s, v21.4s, v15.4s\n"
+    "ldr q26, [x22, x16]\n"
+    "fmla v1.4s, v19.4s, v0.4s\n"
+    "prfm pldl1keep, [%[inptr0], x20]\n"
+    "fmla v4.4s, v19.4s, v12.4s\n"
+    "prfm pldl1keep, [x24, #64]\n"
+    "fmla v3.4s, v19.4s, v16.4s\n"
+    "prfm pldl1keep, [x13, x17]\n"
+    "fmla v9.4s, v19.4s, v15.4s\n"
+    "ldr q23, [x25, x26]\n"
+    "fmla v4.4s, v31.4s, v13.4s\n"
+    "prfm pldl1keep, [x22, x23]\n"
+    "fmla v3.4s, v31.4s, v0.4s\n"
+    "prfm pldl1keep, [x25, x20]\n"
+    "fmla v2.4s, v31.4s, v15.4s\n"
+    "ldr q20, [%[inptr0], x18]\n"
+    "fmla v1.4s, v28.4s, v11.4s\n"
+    "prfm pldl1keep, [%[inptr0], x15]\n"
+    "fmla v7.4s, v28.4s, v16.4s\n"
+    "ldr q28, [x24, %[input_col_stride1]]\n"
+    "fmla v4.4s, v18.4s, v10.4s\n"
+    "prfm pldl1keep, [x24, x17]\n"
+    "fmla v1.4s, v18.4s, v12.4s\n"
+    "prfm pldl1keep, [x13, x23]\n"
+    "fmla v3.4s, v18.4s, v11.4s\n"
+    "prfm pldl1keep, [x22, x20]\n"
+    "fmla v7.4s, v18.4s, v0.4s\n"
+    "prfm pldl1keep, [x25, x15]\n"
+    "fmla v9.4s, v18.4s, v16.4s\n"
+    "prfm pldl1keep, [x24, x23]\n"
+    "fmla v6.4s, v18.4s, v15.4s\n"
+    "ldr q27, [x13, x16]\n"
+    "fmla v4.4s, v24.4s, v17.4s\n"
+    "prfm pldl1keep, [x13, x20]\n"
+    "fmla v1.4s, v24.4s, v13.4s\n"
+    "prfm pldl1keep, [x22, x15]\n"
+    "fmla v3.4s, v24.4s, v12.4s\n"
+    "prfm pldl1keep, [x24, x20]\n"
+    "fmla v9.4s, v24.4s, v0.4s\n"
+    "prfm pldl1keep, [x13, x15]\n"
+    "fmla v2.4s, v24.4s, v16.4s\n"
+    "prfm pldl1keep, [x24, x15]\n"
+    "fmla v8.4s, v24.4s, v15.4s\n"
+    "ldr q24, [x22, x26]\n"
+    "fmla v3.4s, v29.4s, v13.4s\n"
+    "add %[wbptr], %[wbptr], #160\n"
+    "fmla v2.4s, v29.4s, v0.4s\n"
+    "ldr q22, [x25, x18]\n"
+    "fmla v7.4s, v30.4s, v11.4s\n"
+    "ldr q21, [x24, x16]\n"
+    "fmla v1.4s, v25.4s, v10.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v9.4s, v25.4s, v11.4s\n"
+    "add %[inptr0], %[inptr0], #16\n"
+    "fmla v7.4s, v25.4s, v12.4s\n"
+    "add x25, x25, #16\n"
+    "fmla v6.4s, v25.4s, v16.4s\n"
+    "ldr q19, [x13, x26]\n"
+    "fmla v4.4s, v26.4s, v14.4s\n"
+    "fmla v1.4s, v26.4s, v17.4s\n"
+    "fmla v3.4s, v26.4s, v10.4s\n"
+    "fmla v7.4s, v26.4s, v13.4s\n"
+    "fmla v9.4s, v26.4s, v12.4s\n"
+    "fmla v2.4s, v26.4s, v11.4s\n"
+    "fmla v6.4s, v26.4s, v0.4s\n"
+    "fmla v8.4s, v26.4s, v16.4s\n"
+    "fmla v5.4s, v26.4s, v15.4s\n"
+    "ldr q26, [x22, x18]\n"
+    "fmla v3.4s, v23.4s, v17.4s\n"
+    "ldr q18, [x24, x26]\n"
+    "fmla v9.4s, v23.4s, v13.4s\n"
+    "add x22, x22, #16\n"
+    "fmla v2.4s, v23.4s, v12.4s\n"
+    "fmla v8.4s, v23.4s, v0.4s\n"
+    "fmla v7.4s, v28.4s, v10.4s\n"
+    "ldr q23, [x13, x18]\n"
+    "fmla v6.4s, v28.4s, v11.4s\n"
+    "ldr q25, [x24, x18]\n"
+    "fmla v2.4s, v20.4s, v13.4s\n"
+    "add x13, x13, #16\n"
+    "fmla v1.4s, v27.4s, v14.4s\n"
+    "add x24, x24, #16\n"
+    "fmla v7.4s, v27.4s, v17.4s\n"
+    "fmla v9.4s, v27.4s, v10.4s\n"
+    "fmla v6.4s, v27.4s, v12.4s\n"
+    "fmla v8.4s, v27.4s, v11.4s\n"
+    "fmla v5.4s, v27.4s, v16.4s\n"
+    "fmla v3.4s, v24.4s, v14.4s\n"
+    "fmla v9.4s, v24.4s, v17.4s\n"
+    "fmla v2.4s, v24.4s, v10.4s\n"
+    "fmla v6.4s, v24.4s, v13.4s\n"
+    "fmla v8.4s, v24.4s, v12.4s\n"
+    "fmla v5.4s, v24.4s, v0.4s\n"
+    "fmla v7.4s, v21.4s, v14.4s\n"
+    "fmla v2.4s, v22.4s, v17.4s\n"
+    "fmla v9.4s, v19.4s, v14.4s\n"
+    "fmla v8.4s, v22.4s, v13.4s\n"
+    "fmla v6.4s, v21.4s, v10.4s\n"
+    "fmla v5.4s, v21.4s, v11.4s\n"
+    "movi v29.16b, #0\n"
+    "fmla v2.4s, v26.4s, v14.4s\n"
+    "fmla v6.4s, v19.4s, v17.4s\n"
+    "fmla v8.4s, v19.4s, v10.4s\n"
+    "fmla v5.4s, v19.4s, v12.4s\n"
+    "fmax v4.4s, v4.4s, v29.4s\n"
+    "fmax v3.4s, v3.4s, v29.4s\n"
+    "fmla v6.4s, v18.4s, v14.4s\n"
+    "fmax v2.4s, v2.4s, v29.4s\n"
+    "str q4, [%[outptr0]]\n"
+    "fmla v8.4s, v26.4s, v17.4s\n"
+    "str q3, [%[outptr0], %[output_col_stride1]]\n"
+    "fmla v5.4s, v26.4s, v13.4s\n"
+    "str q2, [%[outptr0], x19]\n"
+    "fmax v1.4s, v1.4s, v29.4s\n"
+    "fmla v8.4s, v23.4s, v14.4s\n"
+    "add %[outptr0], %[outptr0], #16\n"
+    "str q1, [x21]\n"
+    "fmla v5.4s, v18.4s, v10.4s\n"
+    "fmax v9.4s, v9.4s, v29.4s\n"
+    "fmax v7.4s, v7.4s, v29.4s\n"
+    "fmax v8.4s, v8.4s, v29.4s\n"
+    "fmax v6.4s, v6.4s, v29.4s\n"
+    "str q9, [x21, %[output_col_stride1]]\n"
+    "fmla v5.4s, v23.4s, v17.4s\n"
+    "str q8, [x21, x19]\n"
+    "str q7, [x14]\n"
+    "str q6, [x14, %[output_col_stride1]]\n"
+    "add x21, x21, #16\n"
+    "fmla v5.4s, v25.4s, v14.4s\n"
+    "fmax v5.4s, v5.4s, v29.4s\n"
+    "str q5, [x14, x19]\n"
+    "add x14, x14, #16\n"
+    "4:\n"
+    "cbz x27, 7f\n"
+    "ldr s20, [%[wbptr]]\n"
+    "mov v4.16b, v20.16b\n"
+    "ldr s15, [%[wbptr], #4]\n"
+    "mov v1.16b, v20.16b\n"
+    "ldr s0, [%[wbptr], #8]\n"
+    "mov v3.16b, v20.16b\n"
+    "ldr s13, [%[wbptr], #12]\n"
+    "mov v7.16b, v20.16b\n"
+    "ldr s16, [%[wbptr], #16]\n"
+    "mov v9.16b, v20.16b\n"
+    "ldr s12, [%[wbptr], #20]\n"
+    "mov v2.16b, v20.16b\n"
+    "ldr s17, [%[wbptr], #24]\n"
+    "mov v6.16b, v20.16b\n"
+    "ldr s11, [%[wbptr], #28]\n"
+    "mov v8.16b, v20.16b\n"
+    "ldr s10, [%[wbptr], #32]\n"
+    "mov v5.16b, v20.16b\n"
+    "ldr s14, [%[wbptr], #36]\n"
+    "ldr s27, [%[inptr0]]\n"
+    "subs x27, x27, #1\n"
+    "fmla v4.4s, v27.4s, v15.4s\n"
+    "ldr s24, [x25]\n"
+    "ldr s22, [%[inptr0], %[input_col_stride1]]\n"
+    "ldr s21, [x22]\n"
+    "ldr s19, [x25, %[input_col_stride1]]\n"
+    "ldr s31, [%[inptr0], x16]\n"
+    "fmla v4.4s, v24.4s, v16.4s\n"
+    "ldr s28, [x13]\n"
+    "ldr s18, [x22, %[input_col_stride1]]\n"
+    "prfm pldl1keep, [%[inptr0], #64]\n"
+    "prfm pldl1keep, [x25, #64]\n"
+    "prfm pldl1keep, [%[inptr0], x17]\n"
+    "prfm pldl1keep, [x22, #64]\n"
+    "prfm pldl1keep, [x25, x17]\n"
+    "prfm pldl1keep, [%[inptr0], x23]\n"
+    "prfm pldl1keep, [x13, #64]\n"
+    "prfm pldl1keep, [x22, x17]\n"
+    "beq 6f\n"
+    "5:\n"
+    "fmla v1.4s, v24.4s, v15.4s\n"
+    "ldr s24, [x25, x16]\n"
+    "fmla v4.4s, v22.4s, v0.4s\n"
+    "ldr s29, [%[inptr0], x26]\n"
+    "fmla v3.4s, v22.4s, v15.4s\n"
+    "ldr s30, [x24]\n"
+    "fmla v1.4s, v21.4s, v16.4s\n"
+    "ldr s25, [x13, %[input_col_stride1]]\n"
+    "fmla v4.4s, v21.4s, v11.4s\n"
+    "prfm pldl1keep, [x25, x23]\n"
+    "fmla v7.4s, v21.4s, v15.4s\n"
+    "ldr s26, [x22, x16]\n"
+    "fmla v1.4s, v19.4s, v0.4s\n"
+    "prfm pldl1keep, [%[inptr0], x20]\n"
+    "fmla v4.4s, v19.4s, v12.4s\n"
+    "prfm pldl1keep, [x24, #64]\n"
+    "fmla v3.4s, v19.4s, v16.4s\n"
+    "prfm pldl1keep, [x13, x17]\n"
+    "fmla v9.4s, v19.4s, v15.4s\n"
+    "ldr s23, [x25, x26]\n"
+    "fmla v4.4s, v31.4s, v13.4s\n"
+    "prfm pldl1keep, [x22, x23]\n"
+    "fmla v3.4s, v31.4s, v0.4s\n"
+    "prfm pldl1keep, [x25, x20]\n"
+    "fmla v2.4s, v31.4s, v15.4s\n"
+    "ldr s20, [%[inptr0], x18]\n"
+    "fmla v1.4s, v28.4s, v11.4s\n"
+    "prfm pldl1keep, [%[inptr0], x15]\n"
+    "fmla v7.4s, v28.4s, v16.4s\n"
+    "ldr s28, [x24, %[input_col_stride1]]\n"
+    "fmla v4.4s, v18.4s, v10.4s\n"
+    "prfm pldl1keep, [x24, x17]\n"
+    "fmla v1.4s, v18.4s, v12.4s\n"
+    "prfm pldl1keep, [x13, x23]\n"
+    "fmla v3.4s, v18.4s, v11.4s\n"
+    "prfm pldl1keep, [x22, x20]\n"
+    "fmla v7.4s, v18.4s, v0.4s\n"
+    "prfm pldl1keep, [x25, x15]\n"
+    "fmla v9.4s, v18.4s, v16.4s\n"
+    "prfm pldl1keep, [x24, x23]\n"
+    "fmla v6.4s, v18.4s, v15.4s\n"
+    "ldr s27, [x13, x16]\n"
+    "fmla v4.4s, v24.4s, v17.4s\n"
+    "prfm pldl1keep, [x13, x20]\n"
+    "fmla v1.4s, v24.4s, v13.4s\n"
+    "prfm pldl1keep, [x22, x15]\n"
+    "fmla v3.4s, v24.4s, v12.4s\n"
+    "prfm pldl1keep, [x24, x20]\n"
+    "fmla v9.4s, v24.4s, v0.4s\n"
+    "prfm pldl1keep, [x13, x15]\n"
+    "fmla v2.4s, v24.4s, v16.4s\n"
+    "prfm pldl1keep, [x24, x15]\n"
+    "fmla v8.4s, v24.4s, v15.4s\n"
+    "ldr s24, [x22, x26]\n"
+    "fmla v3.4s, v29.4s, v13.4s\n"
+    "add %[wbptr], %[wbptr], #40\n"
+    "fmla v2.4s, v29.4s, v0.4s\n"
+    "ldr s22, [x25, x18]\n"
+    "fmla v7.4s, v30.4s, v11.4s\n"
+    "ldr s21, [x24, x16]\n"
+    "fmla v1.4s, v25.4s, v10.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v9.4s, v25.4s, v11.4s\n"
+    "add %[inptr0], %[inptr0], #4\n"
+    "fmla v7.4s, v25.4s, v12.4s\n"
+    "prfm pldl1keep, [%[inptr0], #64]\n"
+    "fmla v6.4s, v25.4s, v16.4s\n"
+    "ldr s19, [x13, x26]\n"
+    "fmla v4.4s, v26.4s, v14.4s\n"
+    "prfm pldl1keep, [%[inptr0], x17]\n"
+    "fmla v1.4s, v26.4s, v17.4s\n"
+    "prfm pldl1keep, [%[inptr0], x23]\n"
+    "fmla v3.4s, v26.4s, v10.4s\n"
+    "add x25, x25, #4\n"
+    "fmla v7.4s, v26.4s, v13.4s\n"
+    "prfm pldl1keep, [x25, #64]\n"
+    "fmla v9.4s, v26.4s, v12.4s\n"
+    "prfm pldl1keep, [x25, x17]\n"
+    "fmla v2.4s, v26.4s, v11.4s\n"
+    "subs x27, x27, #1\n"
+    "fmla v6.4s, v26.4s, v0.4s\n"
+    "fmla v8.4s, v26.4s, v16.4s\n"
+    "fmla v5.4s, v26.4s, v15.4s\n"
+    "ldr s26, [x22, x18]\n"
+    "fmla v3.4s, v23.4s, v17.4s\n"
+    "ldr s18, [x24, x26]\n"
+    "fmla v9.4s, v23.4s, v13.4s\n"
+    "add x22, x22, #4\n"
+    "fmla v2.4s, v23.4s, v12.4s\n"
+    "prfm pldl1keep, [x22, #64]\n"
+    "fmla v8.4s, v23.4s, v0.4s\n"
+    "ldr s23, [x13, x18]\n"
+    "fmla v7.4s, v28.4s, v10.4s\n"
+    "prfm pldl1keep, [x22, x17]\n"
+    "fmla v2.4s, v20.4s, v13.4s\n"
+    "ldr s25, [x24, x18]\n"
+    "fmla v6.4s, v28.4s, v11.4s\n"
+    "ldr s20, [%[wbptr]]\n"
+    "fmla v1.4s, v27.4s, v14.4s\n"
+    "add x13, x13, #4\n"
+    "fmla v7.4s, v27.4s, v17.4s\n"
+    "prfm pldl1keep, [x13, #64]\n"
+    "fmla v9.4s, v27.4s, v10.4s\n"
+    "add x24, x24, #4\n"
+    "fmla v6.4s, v27.4s, v12.4s\n"
+    "fmla v8.4s, v27.4s, v11.4s\n"
+    "fmla v5.4s, v27.4s, v16.4s\n"
+    "ldr s15, [%[wbptr], #4]\n"
+    "fmla v3.4s, v24.4s, v14.4s\n"
+    "ldr s27, [%[inptr0]]\n"
+    "fmla v9.4s, v24.4s, v17.4s\n"
+    "fmla v2.4s, v24.4s, v10.4s\n"
+    "fmla v6.4s, v24.4s, v13.4s\n"
+    "fmla v8.4s, v24.4s, v12.4s\n"
+    "fmla v5.4s, v24.4s, v0.4s\n"
+    "ldr s16, [%[wbptr], #16]\n"
+    "fmla v2.4s, v22.4s, v17.4s\n"
+    "ldr s24, [x25]\n"
+    "fmla v8.4s, v22.4s, v13.4s\n"
+    "ldr s22, [%[inptr0], %[input_col_stride1]]\n"
+    "fmla v7.4s, v21.4s, v14.4s\n"
+    "fmla v6.4s, v21.4s, v10.4s\n"
+    "fmla v5.4s, v21.4s, v11.4s\n"
+    "ldr s0, [%[wbptr], #8]\n"
+    "fmla v9.4s, v19.4s, v14.4s\n"
+    "ldr s21, [x22]\n"
+    "fmla v6.4s, v19.4s, v17.4s\n"
+    "fmla v8.4s, v19.4s, v10.4s\n"
+    "fmla v5.4s, v19.4s, v12.4s\n"
+    "ldr s11, [%[wbptr], #28]\n"
+    "fmla v2.4s, v26.4s, v14.4s\n"
+    "movi v29.16b, #0\n"
+    "fmla v8.4s, v26.4s, v17.4s\n"
+    "fmla v6.4s, v18.4s, v14.4s\n"
+    "fmla v5.4s, v26.4s, v13.4s\n"
+    "ldr s12, [%[wbptr], #20]\n"
+    "fmax v4.4s, v4.4s, v29.4s\n"
+    "ldr s19, [x25, %[input_col_stride1]]\n"
+    "fmla v8.4s, v23.4s, v14.4s\n"
+    "fmax v3.4s, v3.4s, v29.4s\n"
+    "str s4, [%[outptr0]]\n"
+    "fmla v5.4s, v18.4s, v10.4s\n"
+    "str s3, [%[outptr0], %[output_col_stride1]]\n"
+    "fmax v2.4s, v2.4s, v29.4s\n"
+    "fmax v1.4s, v1.4s, v29.4s\n"
+    "ldr s13, [%[wbptr], #12]\n"
+    "str s2, [%[outptr0], x19]\n"
+    "fmla v5.4s, v23.4s, v17.4s\n"
+    "str s1, [x21]\n"
+    "fmax v9.4s, v9.4s, v29.4s\n"
+    "fmax v8.4s, v8.4s, v29.4s\n"
+    "ldr s10, [%[wbptr], #32]\n"
+    "str s9, [x21, %[output_col_stride1]]\n"
+    "fmla v5.4s, v25.4s, v14.4s\n"
+    "str s8, [x21, x19]\n"
+    "fmax v7.4s, v7.4s, v29.4s\n"
+    "fmax v6.4s, v6.4s, v29.4s\n"
+    "ldr s17, [%[wbptr], #24]\n"
+    "str s7, [x14]\n"
+    "fmax v5.4s, v5.4s, v29.4s\n"
+    "str s6, [x14, %[output_col_stride1]]\n"
+    "mov v4.16b, v20.16b\n"
+    "str s5, [x14, x19]\n"
+    "mov v1.16b, v20.16b\n"
+    "mov v3.16b, v20.16b\n"
+    "ldr s14, [%[wbptr], #36]\n"
+    "mov v7.16b, v20.16b\n"
+    "ldr s31, [%[inptr0], x16]\n"
+    "mov v9.16b, v20.16b\n"
+    "ldr s28, [x13]\n"
+    "mov v2.16b, v20.16b\n"
+    "ldr s18, [x22, %[input_col_stride1]]\n"
+    "mov v6.16b, v20.16b\n"
+    "add %[outptr0], %[outptr0], #4\n"
+    "mov v8.16b, v20.16b\n"
+    "add x21, x21, #4\n"
+    "mov v5.16b, v20.16b\n"
+    "add x14, x14, #4\n"
+    "fmla v4.4s, v27.4s, v15.4s\n"
+    "fmla v4.4s, v24.4s, v16.4s\n"
+    "bne 5b\n"
+    "6:\n"
+    "fmla v1.4s, v24.4s, v15.4s\n"
+    "ldr s24, [x25, x16]\n"
+    "fmla v4.4s, v22.4s, v0.4s\n"
+    "ldr s29, [%[inptr0], x26]\n"
+    "fmla v3.4s, v22.4s, v15.4s\n"
+    "ldr s30, [x24]\n"
+    "fmla v1.4s, v21.4s, v16.4s\n"
+    "ldr s25, [x13, %[input_col_stride1]]\n"
+    "fmla v4.4s, v21.4s, v11.4s\n"
+    "prfm pldl1keep, [x25, x23]\n"
+    "fmla v7.4s, v21.4s, v15.4s\n"
+    "ldr s26, [x22, x16]\n"
+    "fmla v1.4s, v19.4s, v0.4s\n"
+    "prfm pldl1keep, [%[inptr0], x20]\n"
+    "fmla v4.4s, v19.4s, v12.4s\n"
+    "prfm pldl1keep, [x24, #64]\n"
+    "fmla v3.4s, v19.4s, v16.4s\n"
+    "prfm pldl1keep, [x13, x17]\n"
+    "fmla v9.4s, v19.4s, v15.4s\n"
+    "ldr s23, [x25, x26]\n"
+    "fmla v4.4s, v31.4s, v13.4s\n"
+    "prfm pldl1keep, [x22, x23]\n"
+    "fmla v3.4s, v31.4s, v0.4s\n"
+    "prfm pldl1keep, [x25, x20]\n"
+    "fmla v2.4s, v31.4s, v15.4s\n"
+    "ldr s20, [%[inptr0], x18]\n"
+    "fmla v1.4s, v28.4s, v11.4s\n"
+    "prfm pldl1keep, [%[inptr0], x15]\n"
+    "fmla v7.4s, v28.4s, v16.4s\n"
+    "ldr s28, [x24, %[input_col_stride1]]\n"
+    "fmla v4.4s, v18.4s, v10.4s\n"
+    "prfm pldl1keep, [x24, x17]\n"
+    "fmla v1.4s, v18.4s, v12.4s\n"
+    "prfm pldl1keep, [x13, x23]\n"
+    "fmla v3.4s, v18.4s, v11.4s\n"
+    "prfm pldl1keep, [x22, x20]\n"
+    "fmla v7.4s, v18.4s, v0.4s\n"
+    "prfm pldl1keep, [x25, x15]\n"
+    "fmla v9.4s, v18.4s, v16.4s\n"
+    "prfm pldl1keep, [x24, x23]\n"
+    "fmla v6.4s, v18.4s, v15.4s\n"
+    "ldr s27, [x13, x16]\n"
+    "fmla v4.4s, v24.4s, v17.4s\n"
+    "prfm pldl1keep, [x13, x20]\n"
+    "fmla v1.4s, v24.4s, v13.4s\n"
+    "prfm pldl1keep, [x22, x15]\n"
+    "fmla v3.4s, v24.4s, v12.4s\n"
+    "prfm pldl1keep, [x24, x20]\n"
+    "fmla v9.4s, v24.4s, v0.4s\n"
+    "prfm pldl1keep, [x13, x15]\n"
+    "fmla v2.4s, v24.4s, v16.4s\n"
+    "prfm pldl1keep, [x24, x15]\n"
+    "fmla v8.4s, v24.4s, v15.4s\n"
+    "ldr s24, [x22, x26]\n"
+    "fmla v3.4s, v29.4s, v13.4s\n"
+    "add %[wbptr], %[wbptr], #40\n"
+    "fmla v2.4s, v29.4s, v0.4s\n"
+    "ldr s22, [x25, x18]\n"
+    "fmla v7.4s, v30.4s, v11.4s\n"
+    "ldr s21, [x24, x16]\n"
+    "fmla v1.4s, v25.4s, v10.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v9.4s, v25.4s, v11.4s\n"
+    "add %[inptr0], %[inptr0], #4\n"
+    "fmla v7.4s, v25.4s, v12.4s\n"
+    "add x25, x25, #4\n"
+    "fmla v6.4s, v25.4s, v16.4s\n"
+    "ldr s19, [x13, x26]\n"
+    "fmla v4.4s, v26.4s, v14.4s\n"
+    "fmla v1.4s, v26.4s, v17.4s\n"
+    "fmla v3.4s, v26.4s, v10.4s\n"
+    "fmla v7.4s, v26.4s, v13.4s\n"
+    "fmla v9.4s, v26.4s, v12.4s\n"
+    "fmla v2.4s, v26.4s, v11.4s\n"
+    "fmla v6.4s, v26.4s, v0.4s\n"
+    "fmla v8.4s, v26.4s, v16.4s\n"
+    "fmla v5.4s, v26.4s, v15.4s\n"
+    "ldr s26, [x22, x18]\n"
+    "fmla v3.4s, v23.4s, v17.4s\n"
+    "ldr s18, [x24, x26]\n"
+    "fmla v9.4s, v23.4s, v13.4s\n"
+    "add x22, x22, #4\n"
+    "fmla v2.4s, v23.4s, v12.4s\n"
+    "fmla v8.4s, v23.4s, v0.4s\n"
+    "fmla v7.4s, v28.4s, v10.4s\n"
+    "ldr s23, [x13, x18]\n"
+    "fmla v6.4s, v28.4s, v11.4s\n"
+    "ldr s25, [x24, x18]\n"
+    "fmla v2.4s, v20.4s, v13.4s\n"
+    "add x13, x13, #4\n"
+    "fmla v1.4s, v27.4s, v14.4s\n"
+    "add x24, x24, #4\n"
+    "fmla v7.4s, v27.4s, v17.4s\n"
+    "fmla v9.4s, v27.4s, v10.4s\n"
+    "fmla v6.4s, v27.4s, v12.4s\n"
+    "fmla v8.4s, v27.4s, v11.4s\n"
+    "fmla v5.4s, v27.4s, v16.4s\n"
+    "fmla v3.4s, v24.4s, v14.4s\n"
+    "fmla v9.4s, v24.4s, v17.4s\n"
+    "fmla v2.4s, v24.4s, v10.4s\n"
+    "fmla v6.4s, v24.4s, v13.4s\n"
+    "fmla v8.4s, v24.4s, v12.4s\n"
+    "fmla v5.4s, v24.4s, v0.4s\n"
+    "fmla v7.4s, v21.4s, v14.4s\n"
+    "fmla v2.4s, v22.4s, v17.4s\n"
+    "fmla v9.4s, v19.4s, v14.4s\n"
+    "fmla v8.4s, v22.4s, v13.4s\n"
+    "fmla v6.4s, v21.4s, v10.4s\n"
+    "fmla v5.4s, v21.4s, v11.4s\n"
+    "movi v29.16b, #0\n"
+    "fmla v2.4s, v26.4s, v14.4s\n"
+    "fmla v6.4s, v19.4s, v17.4s\n"
+    "fmla v8.4s, v19.4s, v10.4s\n"
+    "fmla v5.4s, v19.4s, v12.4s\n"
+    "fmax v4.4s, v4.4s, v29.4s\n"
+    "fmax v3.4s, v3.4s, v29.4s\n"
+    "fmla v6.4s, v18.4s, v14.4s\n"
+    "fmax v2.4s, v2.4s, v29.4s\n"
+    "str s4, [%[outptr0]]\n"
+    "fmla v8.4s, v26.4s, v17.4s\n"
+    "str s3, [%[outptr0], %[output_col_stride1]]\n"
+    "fmla v5.4s, v26.4s, v13.4s\n"
+    "str s2, [%[outptr0], x19]\n"
+    "fmax v1.4s, v1.4s, v29.4s\n"
+    "fmla v8.4s, v23.4s, v14.4s\n"
+    "add %[outptr0], %[outptr0], #4\n"
+    "str s1, [x21]\n"
+    "fmla v5.4s, v18.4s, v10.4s\n"
+    "fmax v9.4s, v9.4s, v29.4s\n"
+    "fmax v7.4s, v7.4s, v29.4s\n"
+    "fmax v8.4s, v8.4s, v29.4s\n"
+    "fmax v6.4s, v6.4s, v29.4s\n"
+    "str s9, [x21, %[output_col_stride1]]\n"
+    "fmla v5.4s, v23.4s, v17.4s\n"
+    "str s8, [x21, x19]\n"
+    "str s7, [x14]\n"
+    "str s6, [x14, %[output_col_stride1]]\n"
+    "add x21, x21, #4\n"
+    "fmla v5.4s, v25.4s, v14.4s\n"
+    "fmax v5.4s, v5.4s, v29.4s\n"
+    "str s5, [x14, x19]\n"
+    "add x14, x14, #4\n"
+    "7:\n"
+    : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input)
+    : [output_row_stride] "r" (output_row_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float))
+    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+  );
+}
 
-    asm volatile (
-        "qU22B .req q0\n" "qU23B .req q0\n" "qW22A .req q0\n"
-        "vU22B .req v0\n" "vU23B .req v0\n" "vW22A .req v0\n"
-        "qV12A .req q1\n" "qW11B .req q1\n"
-        "vV12A .req v1\n" "vW11B .req v1\n"
-        "qU41A .req q2\n" "qU32B .req q2\n" "qU33A .req q2\n" "qV13B .req q2\n"
-        "vU41A .req v2\n" "vU32B .req v2\n" "vU33A .req v2\n" "vV13B .req v2\n"
-        "qU42B .req q3\n" "qU13B .req q3\n" "qU44B .req q3\n" "qU55A .req q3\n"
-        "vU42B .req v3\n" "vU13B .req v3\n" "vU44B .req v3\n" "vU55A .req v3\n"
-        "qU34B .req q4\n" "qU15A .req q4\n" "qU42A .req q4\n" "qU44A .req q4\n" "qU12B .req q4\n"
-        "vU34B .req v4\n" "vU15A .req v4\n" "vU42A .req v4\n" "vU44A .req v4\n" "vU12B .req v4\n"
-        "qU33B .req q5\n" "qU52A .req q5\n" "qW23A .req q5\n"
-        "vU33B .req v5\n" "vU52A .req v5\n" "vW23A .req v5\n"
-        "qV31A .req q6\n" "qU13A .req q6\n" "qV12B .req q6\n"
-        "vV31A .req v6\n" "vU13A .req v6\n" "vV12B .req v6\n"
-        "qU35B .req q7\n" "qU51B .req q7\n" "qV11A .req q7\n" "qU53B .req q7\n"
-        "vU35B .req v7\n" "vU51B .req v7\n" "vV11A .req v7\n" "vU53B .req v7\n"
-        "qW21A .req q8\n" "qV22B .req q8\n"
-        "vW21A .req v8\n" "vV22B .req v8\n"
-        "qV33B .req q9\n" "qU14A .req q9\n" "qV23A .req q9\n" "qU25B .req q9\n"
-        "vV33B .req v9\n" "vU14A .req v9\n" "vV23A .req v9\n" "vU25B .req v9\n"
-        "qW21B .req q10\n" "qV32A .req q10\n" "qU35A .req q10\n"
-        "vW21B .req v10\n" "vV32A .req v10\n" "vU35A .req v10\n"
-        "qV11B .req q11\n" "qU15B .req q11\n" "qV33A .req q11\n"
-        "vV11B .req v11\n" "vU15B .req v11\n" "vV33A .req v11\n"
-        "qU11B .req q12\n" "qW23B .req q12\n" "qU45A .req q12\n"
-        "vU11B .req v12\n" "vW23B .req v12\n" "vU45A .req v12\n"
-        "qW11A .req q13\n" "qU45B .req q13\n" "qU52B .req q13\n"
-        "vW11A .req v13\n" "vU45B .req v13\n" "vU52B .req v13\n"
-        "qU55B .req q14\n" "qU25A .req q14\n" "qV21A .req q14\n"
-        "vU55B .req v14\n" "vU25A .req v14\n" "vV21A .req v14\n"
-        "qU53A .req q15\n" "qV21B .req q15\n" "qU31A .req q15\n"
-        "vU53A .req v15\n" "vV21B .req v15\n" "vU31A .req v15\n"
-        "qW13B .req q16\n" "qU23A .req q16\n"
-        "vW13B .req v16\n" "vU23A .req v16\n"
-        "qW33B .req q17\n" "qW33A .req q17\n"
-        "vW33B .req v17\n" "vW33A .req v17\n"
-        "qU24B .req q18\n" "qU32A .req q18\n" "qV31B .req q18\n" "qV13A .req q18\n"
-        "vU24B .req v18\n" "vU32A .req v18\n" "vV31B .req v18\n" "vV13A .req v18\n"
-        "qU31B .req q19\n" "qU11A .req q19\n" "qU54B .req q19\n" "qU43A .req q19\n"
-        "vU31B .req v19\n" "vU11A .req v19\n" "vU54B .req v19\n" "vU43A .req v19\n"
-        "qU24A .req q20\n" "qW12B .req q20\n" "qU54A .req q20\n"
-        "vU24A .req v20\n" "vW12B .req v20\n" "vU54A .req v20\n"
-        "qV23B .req q21\n" "qW12A .req q21\n"
-        "vV23B .req v21\n" "vW12A .req v21\n"
-        "qW32A .req q22\n" "qU43B .req q22\n"
-        "vW32A .req v22\n" "vU43B .req v22\n"
-        "qW31A .req q23\n" "qV32B .req q23\n"
-        "vW31A .req v23\n" "vV32B .req v23\n"
-        "qU22A .req q24\n" "qW31B .req q24\n"
-        "vU22A .req v24\n" "vW31B .req v24\n"
-        "qU21B .req q25\n" "qV22A .req q25\n"
-        "vU21B .req v25\n" "vV22A .req v25\n"
-        "qU34A .req q26\n" "qW22B .req q26\n" "qU12A .req q26\n"
-        "vU34A .req v26\n" "vW22B .req v26\n" "vU12A .req v26\n"
-        "qW13A .req q27\n" "qU51A .req q27\n"
-        "vW13A .req v27\n" "vU51A .req v27\n"
-        "qW32B .req q28\n"
-        "vW32B .req v28\n"
-        "qU41B .req q29\n" "qU14B .req q29\n"
-        "vU41B .req v29\n" "vU14B .req v29\n"
-        "qU21A .req q30\n"
-        "vU21A .req v30\n"
-
-        "uptr1 .req x0\n"
-        "uptr2 .req x1\n"
-        "uptr3 .req x2\n"
-        "uptr4 .req x3\n"
-
-        "u_col_stride1 .req %x[u_col_stride]\n"
-        "u_col_stride2 .req x4\n"
-        "u_col_stride3 .req x5\n"
-        "u_col_stride4 .req x6\n"
-
-        "wptr1 .req x7\n"
-        "wptr2 .req x8\n"
-        "w_col_stride1 .req %x[w_col_stride]\n"
-        "w_col_stride2 .req x9\n"
-
-        "vptr1 .req x10\n"
-        "vptr2 .req x11\n"
-        "v_col_stride1 .req %x[v_col_stride]\n"
-        "v_col_stride2 .req x12\n"
-
-        // Prepare strides and pointers
-        "add uptr1, %x[uptr0], %x[u_row_stride]\n"
-        "add uptr2,    uptr1 , %x[u_row_stride]\n"
-        "add uptr3,    uptr2 , %x[u_row_stride]\n"
-        "add uptr4,    uptr3 , %x[u_row_stride]\n"
-        "add u_col_stride2, u_col_stride1, u_col_stride1\n"
-        "add u_col_stride3, u_col_stride2, u_col_stride1\n"
-        "add u_col_stride4, u_col_stride3, u_col_stride1\n"
-
-        "add wptr1, %x[wptr0], %x[w_row_stride]\n"
-        "add wptr2,    wptr1 , %x[w_row_stride]\n"
-        "add w_col_stride2, w_col_stride1, w_col_stride1\n"
-
-        "add vptr1, %x[vptr0], %x[v_row_stride]\n"
-        "add vptr2,    vptr1 , %x[v_row_stride]\n"
-        "add v_col_stride2, v_col_stride1, v_col_stride1\n"
-
-        // Pre-load for A
-        "ldr qW13A, [%x[wptr0], w_col_stride2]\n"
-        "ldr qW23A, [wptr1, w_col_stride2]\n"
-        "ldr qW33A, [wptr2, w_col_stride2]\n"
-        "ldr qW12A, [%x[wptr0], w_col_stride1]\n"
-        "ldr qU15A, [%x[uptr0], u_col_stride4]\n"
-        "ldr qW22A, [wptr1, w_col_stride1]\n"
-        "ldr qU14A, [%x[uptr0], u_col_stride3]\n"
-        "ldr qW32A, [wptr2, w_col_stride1]\n"
-        "ldr qU13A, [%x[uptr0], u_col_stride2]\n"
-        "ldr qU25A, [uptr1, u_col_stride4]\n"
-        "ldr qU24A, [uptr1, u_col_stride3]\n"
-        "ldr qW11A, [%x[wptr0]], #0x10\n"
-        "ldr qU23A, [uptr1, u_col_stride2]\n"
-        "ldr qW21A, [wptr1], #0x10\n"
-        "ldr qW31A, [wptr2], #0x10\n"
-        "ldr qU34A, [uptr2, u_col_stride3]\n"
-        "ldr qU35A, [uptr2, u_col_stride4]\n"
-
-        // First part of A
-        "fmul vV13A.4s, vU15A.4s, vW13A.4s\n"
-        "ldr qU33A, [uptr2, u_col_stride2]\n"
-        "fmul vV12A.4s, vU14A.4s, vW13A.4s\n"
-        "cbz %x[n_iters], 2f\n"  // Jump to tail if not looping
-
-        "1:"  // Main loop, double unrolled
-        // A Part
-        "fmla vV13A.4s, vU14A.4s, vW12A.4s\n"
-        "ldr qU45A, [uptr3, u_col_stride4]\n"
-        "fmul vV11A.4s, vU13A.4s, vW13A.4s\n"
-        "fmla vV12A.4s, vU13A.4s, vW12A.4s\n"
-        "fmla vV13A.4s, vU13A.4s, vW11A.4s\n"
-        "ldr qU44A, [uptr3, u_col_stride3]\n"
-        "fmla vV13A.4s, vU25A.4s, vW23A.4s\n"
-        "fmul vV23A.4s, vU25A.4s, vW13A.4s\n"
-        "ldr qU43A, [uptr3, u_col_stride2]\n"
-        "fmla vV12A.4s, vU24A.4s, vW23A.4s\n"
-        "fmla vV13A.4s, vU24A.4s, vW22A.4s\n"
-        "fmul vV22A.4s, vU24A.4s, vW13A.4s\n"
-        "fmla vV23A.4s, vU24A.4s, vW12A.4s\n"
-        "ldr qU55A, [uptr4, u_col_stride4]\n"
-        "fmla vV11A.4s, vU23A.4s, vW23A.4s\n"
-        "fmla vV12A.4s, vU23A.4s, vW22A.4s\n"
-        "fmla vV13A.4s, vU23A.4s, vW21A.4s\n"
-        "fmul vV21A.4s, vU23A.4s, vW13A.4s\n"
-        "fmla vV22A.4s, vU23A.4s, vW12A.4s\n"
-        "fmla vV23A.4s, vU23A.4s, vW11A.4s\n"
-        "ldr qU54A, [uptr4, u_col_stride3]\n"
-        "fmla vV13A.4s, vU35A.4s, vW33A.4s\n"
-        "fmla vV23A.4s, vU35A.4s, vW23A.4s\n"
-        "fmul vV33A.4s, vU35A.4s, vW13A.4s\n"
-        "ldr qU53A, [uptr4, u_col_stride2]\n"
-        "fmla vV12A.4s, vU34A.4s, vW33A.4s\n"
-        "fmla vV13A.4s, vU34A.4s, vW32A.4s\n"
-        "fmla vV22A.4s, vU34A.4s, vW23A.4s\n"
-        "fmla vV23A.4s, vU34A.4s, vW22A.4s\n"
-        "fmul vV32A.4s, vU34A.4s, vW13A.4s\n"
-        "fmla vV33A.4s, vU34A.4s, vW12A.4s\n"
-        "ldr qU12A, [%x[uptr0], u_col_stride1]\n"
-        "fmla vV11A.4s, vU33A.4s, vW33A.4s\n"
-        "fmla vV12A.4s, vU33A.4s, vW32A.4s\n"
-        "fmla vV13A.4s, vU33A.4s, vW31A.4s\n"
-        "str qV13A, [%x[vptr0], v_col_stride2]\n"
-        "fmla vV21A.4s, vU33A.4s, vW23A.4s\n"
-        "fmla vV22A.4s, vU33A.4s, vW22A.4s\n"
-        "fmla vV23A.4s, vU33A.4s, vW21A.4s\n"
-        "fmul vV31A.4s, vU33A.4s, vW13A.4s\n"
-        "ldr qW13B, [%x[wptr0], w_col_stride2]\n"
-        "fmla vV32A.4s, vU33A.4s, vW12A.4s\n"
-        "fmla vV33A.4s, vU33A.4s, vW11A.4s\n"
-        "ldr qU22A, [uptr1, u_col_stride1]\n"
-        "fmla vV23A.4s, vU45A.4s, vW33A.4s\n"
-        "fmla vV33A.4s, vU45A.4s, vW23A.4s\n"
-        "ldr qU32A, [uptr2, u_col_stride1]\n"
-        "fmla vV22A.4s, vU44A.4s, vW33A.4s\n"
-        "fmla vV23A.4s, vU44A.4s, vW32A.4s\n"
-        "fmla vV32A.4s, vU44A.4s, vW23A.4s\n"
-        "fmla vV33A.4s, vU44A.4s, vW22A.4s\n"
-        "ldr qU42A, [uptr3, u_col_stride1]\n"
-        "fmla vV21A.4s, vU43A.4s, vW33A.4s\n"
-        "fmla vV22A.4s, vU43A.4s, vW32A.4s\n"
-        "fmla vV23A.4s, vU43A.4s, vW31A.4s\n"
-        "str qV23A, [vptr1, v_col_stride2]\n"
-        "fmla vV31A.4s, vU43A.4s, vW23A.4s\n"
-        "ldr qW23B, [wptr1, w_col_stride2]\n"
-        "fmla vV32A.4s, vU43A.4s, vW22A.4s\n"
-        "fmla vV33A.4s, vU43A.4s, vW21A.4s\n"
-        "ldr qU52A, [uptr4, u_col_stride1]\n"
-        "fmla vV33A.4s, vU55A.4s, vW33A.4s\n"
-        "ldr qU11A, [%x[uptr0]], #0x10\n"
-        "fmla vV32A.4s, vU54A.4s, vW33A.4s\n"
-        "fmla vV33A.4s, vU54A.4s, vW32A.4s\n"
-        "ldr qU21A, [uptr1], #0x10\n"
-        "fmla vV31A.4s, vU53A.4s, vW33A.4s\n"
-        "ldr qW33B, [wptr2, w_col_stride2]\n"
-        "fmla vV32A.4s, vU53A.4s, vW32A.4s\n"
-        "fmla vV33A.4s, vU53A.4s, vW31A.4s\n"
-        "str qV33A, [vptr2, v_col_stride2]\n"
-        "fmla vV11A.4s, vU12A.4s, vW12A.4s\n"
-        "ldr qU31A, [uptr2], #0x10\n"
-        "fmla vV12A.4s, vU12A.4s, vW11A.4s\n"
-        "ldr qU41A, [uptr3], #0x10\n"
-        "fmla vV11A.4s, vU22A.4s, vW22A.4s\n"
-        "ldr qU51A, [uptr4], #0x10\n"
-        "fmla vV12A.4s, vU22A.4s, vW21A.4s\n"
-        "ldr qW12B, [%x[wptr0], w_col_stride1]\n"
-        "fmla vV21A.4s, vU22A.4s, vW12A.4s\n"
-        "ldr qU15B, [%x[uptr0], u_col_stride4]\n"
-        "fmla vV22A.4s, vU22A.4s, vW11A.4s\n"
-        "ldr qW22B, [wptr1, w_col_stride1]\n"
-        "fmla vV11A.4s, vU32A.4s, vW32A.4s\n"
-        "ldr qU14B, [%x[uptr0], u_col_stride3]\n"
-        "fmla vV12A.4s, vU32A.4s, vW31A.4s\n"
-        "str qV12A, [%x[vptr0], v_col_stride1]\n"
-        "fmla vV21A.4s, vU32A.4s, vW22A.4s\n"
-        "ldr qW32B, [wptr2, w_col_stride1]\n"
-        "fmla vV22A.4s, vU32A.4s, vW21A.4s\n"
-        "ldr qU13B, [%x[uptr0], u_col_stride2]\n"
-        "fmla vV31A.4s, vU32A.4s, vW12A.4s\n"
-        "ldr qU25B, [uptr1, u_col_stride4]\n"
-        "fmla vV32A.4s, vU32A.4s, vW11A.4s\n"
-        "ldr qU24B, [uptr1, u_col_stride3]\n"
-        "fmla vV21A.4s, vU42A.4s, vW32A.4s\n"
-        "fmla vV22A.4s, vU42A.4s, vW31A.4s\n"
-        "str qV22A, [vptr1, v_col_stride1]\n"
-        "fmla vV31A.4s, vU42A.4s, vW22A.4s\n"
-        "fmla vV32A.4s, vU42A.4s, vW21A.4s\n"
-        "fmla vV31A.4s, vU52A.4s, vW32A.4s\n"
-        "fmla vV32A.4s, vU52A.4s, vW31A.4s\n"
-        "str qV32A, [vptr2, v_col_stride1]\n"
-        "fmla vV11A.4s, vU11A.4s, vW11A.4s\n"
-        "ldr qW11B, [%x[wptr0]], #0x10\n"
-        "fmla vV11A.4s, vU21A.4s, vW21A.4s\n"
-        "ldr qU23B, [uptr1, u_col_stride2]\n"
-        "fmla vV21A.4s, vU21A.4s, vW11A.4s\n"
-        "ldr qW21B, [wptr1], #0x10\n"
-        "fmla vV11A.4s, vU31A.4s, vW31A.4s\n"
-        "str qV11A, [%x[vptr0]], #0x10\n"
-        "fmla vV21A.4s, vU31A.4s, vW21A.4s\n"
-        "ldr qW31B, [wptr2], #0x10\n"
-        "fmla vV31A.4s, vU31A.4s, vW11A.4s\n"
-        "ldr qU34B, [uptr2, u_col_stride3]\n"
-        "fmla vV21A.4s, vU41A.4s, vW31A.4s\n"
-        "str qV21A, [vptr1], #0x10\n"
-        "fmla vV31A.4s, vU41A.4s, vW21A.4s\n"
-        "ldr qU35B, [uptr2, u_col_stride4]\n"
-        "fmla vV31A.4s, vU51A.4s, vW31A.4s\n"
-        "str qV31A, [vptr2], #0x10\n"
-
-        // B Part
-        "fmul vV13B.4s, vU15B.4s, vW13B.4s\n"
-        "ldr qU33B, [uptr2, u_col_stride2]\n"
-        "fmul vV12B.4s, vU14B.4s, vW13B.4s\n"
-        "fmla vV13B.4s, vU14B.4s, vW12B.4s\n"
-        "ldr qU45B, [uptr3, u_col_stride4]\n"
-        "fmul vV11B.4s, vU13B.4s, vW13B.4s\n"
-        "fmla vV12B.4s, vU13B.4s, vW12B.4s\n"
-        "fmla vV13B.4s, vU13B.4s, vW11B.4s\n"
-        "ldr qU44B, [uptr3, u_col_stride3]\n"
-        "fmla vV13B.4s, vU25B.4s, vW23B.4s\n"
-        "fmul vV23B.4s, vU25B.4s, vW13B.4s\n"
-        "ldr qU43B, [uptr3, u_col_stride2]\n"
-        "fmla vV12B.4s, vU24B.4s, vW23B.4s\n"
-        "fmla vV13B.4s, vU24B.4s, vW22B.4s\n"
-        "fmul vV22B.4s, vU24B.4s, vW13B.4s\n"
-        "fmla vV23B.4s, vU24B.4s, vW12B.4s\n"
-        "ldr qU55B, [uptr4, u_col_stride4]\n"
-        "fmla vV11B.4s, vU23B.4s, vW23B.4s\n"
-        "fmla vV12B.4s, vU23B.4s, vW22B.4s\n"
-        "fmla vV13B.4s, vU23B.4s, vW21B.4s\n"
-        "fmul vV21B.4s, vU23B.4s, vW13B.4s\n"
-        "fmla vV22B.4s, vU23B.4s, vW12B.4s\n"
-        "fmla vV23B.4s, vU23B.4s, vW11B.4s\n"
-        "ldr qU54B, [uptr4, u_col_stride3]\n"
-        "fmla vV13B.4s, vU35B.4s, vW33B.4s\n"
-        "fmla vV23B.4s, vU35B.4s, vW23B.4s\n"
-        "fmul vV33B.4s, vU35B.4s, vW13B.4s\n"
-        "ldr qU53B, [uptr4, u_col_stride2]\n"
-        "fmla vV12B.4s, vU34B.4s, vW33B.4s\n"
-        "fmla vV13B.4s, vU34B.4s, vW32B.4s\n"
-        "fmla vV22B.4s, vU34B.4s, vW23B.4s\n"
-        "fmla vV23B.4s, vU34B.4s, vW22B.4s\n"
-        "fmul vV32B.4s, vU34B.4s, vW13B.4s\n"
-        "fmla vV33B.4s, vU34B.4s, vW12B.4s\n"
-        "ldr qU12B, [%x[uptr0], u_col_stride1]\n"
-        "fmla vV11B.4s, vU33B.4s, vW33B.4s\n"
-        "fmla vV12B.4s, vU33B.4s, vW32B.4s\n"
-        "fmla vV13B.4s, vU33B.4s, vW31B.4s\n"
-        "str qV13B, [%x[vptr0], v_col_stride2]\n"
-        "fmla vV21B.4s, vU33B.4s, vW23B.4s\n"
-        "fmla vV22B.4s, vU33B.4s, vW22B.4s\n"
-        "fmla vV23B.4s, vU33B.4s, vW21B.4s\n"
-        "fmul vV31B.4s, vU33B.4s, vW13B.4s\n"
-        "ldr qW13A, [%x[wptr0], w_col_stride2]\n"
-        "fmla vV32B.4s, vU33B.4s, vW12B.4s\n"
-        "fmla vV33B.4s, vU33B.4s, vW11B.4s\n"
-        "ldr qU22B, [uptr1, u_col_stride1]\n"
-        "fmla vV23B.4s, vU45B.4s, vW33B.4s\n"
-        "fmla vV33B.4s, vU45B.4s, vW23B.4s\n"
-        "ldr qU32B, [uptr2, u_col_stride1]\n"
-        "fmla vV22B.4s, vU44B.4s, vW33B.4s\n"
-        "fmla vV23B.4s, vU44B.4s, vW32B.4s\n"
-        "fmla vV32B.4s, vU44B.4s, vW23B.4s\n"
-        "fmla vV33B.4s, vU44B.4s, vW22B.4s\n"
-        "ldr qU42B, [uptr3, u_col_stride1]\n"
-        "fmla vV21B.4s, vU43B.4s, vW33B.4s\n"
-        "fmla vV22B.4s, vU43B.4s, vW32B.4s\n"
-        "fmla vV23B.4s, vU43B.4s, vW31B.4s\n"
-        "str qV23B, [vptr1, v_col_stride2]\n"
-        "fmla vV31B.4s, vU43B.4s, vW23B.4s\n"
-        "ldr qW23A, [wptr1, w_col_stride2]\n"
-        "fmla vV32B.4s, vU43B.4s, vW22B.4s\n"
-        "fmla vV33B.4s, vU43B.4s, vW21B.4s\n"
-        "ldr qU52B, [uptr4, u_col_stride1]\n"
-        "fmla vV33B.4s, vU55B.4s, vW33B.4s\n"
-        "ldr qU11B, [%x[uptr0]], #0x10\n"
-        "fmla vV32B.4s, vU54B.4s, vW33B.4s\n"
-        "fmla vV33B.4s, vU54B.4s, vW32B.4s\n"
-        "ldr qU21B, [uptr1], #0x10\n"
-        "fmla vV31B.4s, vU53B.4s, vW33B.4s\n"
-        "ldr qW33A, [wptr2, w_col_stride2]\n"
-        "fmla vV32B.4s, vU53B.4s, vW32B.4s\n"
-        "fmla vV33B.4s, vU53B.4s, vW31B.4s\n"
-        "str qV33B, [vptr2, v_col_stride2]\n"
-        "fmla vV11B.4s, vU12B.4s, vW12B.4s\n"
-        "ldr qU31B, [uptr2], #0x10\n"
-        "fmla vV12B.4s, vU12B.4s, vW11B.4s\n"
-        "ldr qU41B, [uptr3], #0x10\n"
-        "fmla vV11B.4s, vU22B.4s, vW22B.4s\n"
-        "ldr qU51B, [uptr4], #0x10\n"
-        "fmla vV12B.4s, vU22B.4s, vW21B.4s\n"
-        "ldr qW12A, [%x[wptr0], w_col_stride1]\n"
-        "fmla vV21B.4s, vU22B.4s, vW12B.4s\n"
-        "ldr qU15A, [%x[uptr0], u_col_stride4]\n"
-        "fmla vV22B.4s, vU22B.4s, vW11B.4s\n"
-        "ldr qW22A, [wptr1, w_col_stride1]\n"
-        "fmla vV11B.4s, vU32B.4s, vW32B.4s\n"
-        "ldr qU14A, [%x[uptr0], u_col_stride3]\n"
-        "fmla vV12B.4s, vU32B.4s, vW31B.4s\n"
-        "str qV12B, [%x[vptr0], v_col_stride1]\n"
-        "fmla vV21B.4s, vU32B.4s, vW22B.4s\n"
-        "ldr qW32A, [wptr2, w_col_stride1]\n"
-        "fmla vV22B.4s, vU32B.4s, vW21B.4s\n"
-        "ldr qU13A, [%x[uptr0], u_col_stride2]\n"
-        "fmla vV31B.4s, vU32B.4s, vW12B.4s\n"
-        "ldr qU25A, [uptr1, u_col_stride4]\n"
-        "fmla vV32B.4s, vU32B.4s, vW11B.4s\n"
-        "ldr qU24A, [uptr1, u_col_stride3]\n"
-        "fmla vV21B.4s, vU42B.4s, vW32B.4s\n"
-        "fmla vV22B.4s, vU42B.4s, vW31B.4s\n"
-        "str qV22B, [vptr1, v_col_stride1]\n"
-        "fmla vV31B.4s, vU42B.4s, vW22B.4s\n"
-        "fmla vV32B.4s, vU42B.4s, vW21B.4s\n"
-        "fmla vV31B.4s, vU52B.4s, vW32B.4s\n"
-        "subs %x[n_iters], %x[n_iters], #1\n"
-        "fmla vV32B.4s, vU52B.4s, vW31B.4s\n"
-        "str qV32B, [vptr2, v_col_stride1]\n"
-        "fmla vV11B.4s, vU11B.4s, vW11B.4s\n"
-        "ldr qW11A, [%x[wptr0]], #0x10\n"
-        "fmla vV11B.4s, vU21B.4s, vW21B.4s\n"
-        "ldr qU23A, [uptr1, u_col_stride2]\n"
-        "fmla vV21B.4s, vU21B.4s, vW11B.4s\n"
-        "ldr qW21A, [wptr1], #0x10\n"
-        "fmla vV11B.4s, vU31B.4s, vW31B.4s\n"
-        "str qV11B, [%x[vptr0]], #0x10\n"
-        "fmla vV21B.4s, vU31B.4s, vW21B.4s\n"
-        "ldr qW31A, [wptr2], #0x10\n"
-        "fmla vV31B.4s, vU31B.4s, vW11B.4s\n"
-        "ldr qU34A, [uptr2, u_col_stride3]\n"
-        "fmla vV21B.4s, vU41B.4s, vW31B.4s\n"
-        "str qV21B, [vptr1], #0x10\n"
-        "fmla vV31B.4s, vU41B.4s, vW21B.4s\n"
-        "ldr qU35A, [uptr2, u_col_stride4]\n"
-        "fmla vV31B.4s, vU51B.4s, vW31B.4s\n"
-        "str qV31B, [vptr2], #0x10\n"
-
-        // First part of A
-        "fmul vV13A.4s, vU15A.4s, vW13A.4s\n"
-        "ldr qU33A, [uptr2, u_col_stride2]\n"
-        "fmul vV12A.4s, vU14A.4s, vW13A.4s\n"
-        "bne 1b\n"  // Loop
-
-        "2:"  // Tail dispatch
-        "cbnz %w[odd_tail], 3f\n"
-
-        // Even tail
-        // A Part
-        "fmla vV13A.4s, vU14A.4s, vW12A.4s\n"
-        "ldr qU45A, [uptr3, u_col_stride4]\n"
-        "fmul vV11A.4s, vU13A.4s, vW13A.4s\n"
-        "fmla vV12A.4s, vU13A.4s, vW12A.4s\n"
-        "fmla vV13A.4s, vU13A.4s, vW11A.4s\n"
-        "ldr qU44A, [uptr3, u_col_stride3]\n"
-        "fmla vV13A.4s, vU25A.4s, vW23A.4s\n"
-        "fmul vV23A.4s, vU25A.4s, vW13A.4s\n"
-        "ldr qU43A, [uptr3, u_col_stride2]\n"
-        "fmla vV12A.4s, vU24A.4s, vW23A.4s\n"
-        "fmla vV13A.4s, vU24A.4s, vW22A.4s\n"
-        "fmul vV22A.4s, vU24A.4s, vW13A.4s\n"
-        "fmla vV23A.4s, vU24A.4s, vW12A.4s\n"
-        "ldr qU55A, [uptr4, u_col_stride4]\n"
-        "fmla vV11A.4s, vU23A.4s, vW23A.4s\n"
-        "fmla vV12A.4s, vU23A.4s, vW22A.4s\n"
-        "fmla vV13A.4s, vU23A.4s, vW21A.4s\n"
-        "fmul vV21A.4s, vU23A.4s, vW13A.4s\n"
-        "fmla vV22A.4s, vU23A.4s, vW12A.4s\n"
-        "fmla vV23A.4s, vU23A.4s, vW11A.4s\n"
-        "ldr qU54A, [uptr4, u_col_stride3]\n"
-        "fmla vV13A.4s, vU35A.4s, vW33A.4s\n"
-        "fmla vV23A.4s, vU35A.4s, vW23A.4s\n"
-        "fmul vV33A.4s, vU35A.4s, vW13A.4s\n"
-        "ldr qU53A, [uptr4, u_col_stride2]\n"
-        "fmla vV12A.4s, vU34A.4s, vW33A.4s\n"
-        "fmla vV13A.4s, vU34A.4s, vW32A.4s\n"
-        "fmla vV22A.4s, vU34A.4s, vW23A.4s\n"
-        "fmla vV23A.4s, vU34A.4s, vW22A.4s\n"
-        "fmul vV32A.4s, vU34A.4s, vW13A.4s\n"
-        "fmla vV33A.4s, vU34A.4s, vW12A.4s\n"
-        "ldr qU12A, [%x[uptr0], u_col_stride1]\n"
-        "fmla vV11A.4s, vU33A.4s, vW33A.4s\n"
-        "fmla vV12A.4s, vU33A.4s, vW32A.4s\n"
-        "fmla vV13A.4s, vU33A.4s, vW31A.4s\n"
-        "str qV13A, [%x[vptr0], v_col_stride2]\n"
-        "fmla vV21A.4s, vU33A.4s, vW23A.4s\n"
-        "fmla vV22A.4s, vU33A.4s, vW22A.4s\n"
-        "fmla vV23A.4s, vU33A.4s, vW21A.4s\n"
-        "fmul vV31A.4s, vU33A.4s, vW13A.4s\n"
-        "ldr qW13B, [%x[wptr0], w_col_stride2]\n"
-        "fmla vV32A.4s, vU33A.4s, vW12A.4s\n"
-        "fmla vV33A.4s, vU33A.4s, vW11A.4s\n"
-        "ldr qU22A, [uptr1, u_col_stride1]\n"
-        "fmla vV23A.4s, vU45A.4s, vW33A.4s\n"
-        "fmla vV33A.4s, vU45A.4s, vW23A.4s\n"
-        "ldr qU32A, [uptr2, u_col_stride1]\n"
-        "fmla vV22A.4s, vU44A.4s, vW33A.4s\n"
-        "fmla vV23A.4s, vU44A.4s, vW32A.4s\n"
-        "fmla vV32A.4s, vU44A.4s, vW23A.4s\n"
-        "fmla vV33A.4s, vU44A.4s, vW22A.4s\n"
-        "ldr qU42A, [uptr3, u_col_stride1]\n"
-        "fmla vV21A.4s, vU43A.4s, vW33A.4s\n"
-        "fmla vV22A.4s, vU43A.4s, vW32A.4s\n"
-        "fmla vV23A.4s, vU43A.4s, vW31A.4s\n"
-        "str qV23A, [vptr1, v_col_stride2]\n"
-        "fmla vV31A.4s, vU43A.4s, vW23A.4s\n"
-        "ldr qW23B, [wptr1, w_col_stride2]\n"
-        "fmla vV32A.4s, vU43A.4s, vW22A.4s\n"
-        "fmla vV33A.4s, vU43A.4s, vW21A.4s\n"
-        "ldr qU52A, [uptr4, u_col_stride1]\n"
-        "fmla vV33A.4s, vU55A.4s, vW33A.4s\n"
-        "ldr qU11A, [%x[uptr0]], #0x10\n"
-        "fmla vV32A.4s, vU54A.4s, vW33A.4s\n"
-        "fmla vV33A.4s, vU54A.4s, vW32A.4s\n"
-        "ldr qU21A, [uptr1], #0x10\n"
-        "fmla vV31A.4s, vU53A.4s, vW33A.4s\n"
-        "ldr qW33B, [wptr2, w_col_stride2]\n"
-        "fmla vV32A.4s, vU53A.4s, vW32A.4s\n"
-        "fmla vV33A.4s, vU53A.4s, vW31A.4s\n"
-        "str qV33A, [vptr2, v_col_stride2]\n"
-        "fmla vV11A.4s, vU12A.4s, vW12A.4s\n"
-        "ldr qU31A, [uptr2], #0x10\n"
-        "fmla vV12A.4s, vU12A.4s, vW11A.4s\n"
-        "ldr qU41A, [uptr3], #0x10\n"
-        "fmla vV11A.4s, vU22A.4s, vW22A.4s\n"
-        "ldr qU51A, [uptr4], #0x10\n"
-        "fmla vV12A.4s, vU22A.4s, vW21A.4s\n"
-        "ldr qW12B, [%x[wptr0], w_col_stride1]\n"
-        "fmla vV21A.4s, vU22A.4s, vW12A.4s\n"
-        "ldr qU15B, [%x[uptr0], u_col_stride4]\n"
-        "fmla vV22A.4s, vU22A.4s, vW11A.4s\n"
-        "ldr qW22B, [wptr1, w_col_stride1]\n"
-        "fmla vV11A.4s, vU32A.4s, vW32A.4s\n"
-        "ldr qU14B, [%x[uptr0], u_col_stride3]\n"
-        "fmla vV12A.4s, vU32A.4s, vW31A.4s\n"
-        "str qV12A, [%x[vptr0], v_col_stride1]\n"
-        "fmla vV21A.4s, vU32A.4s, vW22A.4s\n"
-        "ldr qW32B, [wptr2, w_col_stride1]\n"
-        "fmla vV22A.4s, vU32A.4s, vW21A.4s\n"
-        "ldr qU13B, [%x[uptr0], u_col_stride2]\n"
-        "fmla vV31A.4s, vU32A.4s, vW12A.4s\n"
-        "ldr qU25B, [uptr1, u_col_stride4]\n"
-        "fmla vV32A.4s, vU32A.4s, vW11A.4s\n"
-        "ldr qU24B, [uptr1, u_col_stride3]\n"
-        "fmla vV21A.4s, vU42A.4s, vW32A.4s\n"
-        "fmla vV22A.4s, vU42A.4s, vW31A.4s\n"
-        "str qV22A, [vptr1, v_col_stride1]\n"
-        "fmla vV31A.4s, vU42A.4s, vW22A.4s\n"
-        "fmla vV32A.4s, vU42A.4s, vW21A.4s\n"
-        "fmla vV31A.4s, vU52A.4s, vW32A.4s\n"
-        "fmla vV32A.4s, vU52A.4s, vW31A.4s\n"
-        "str qV32A, [vptr2, v_col_stride1]\n"
-        "fmla vV11A.4s, vU11A.4s, vW11A.4s\n"
-        "ldr qW11B, [%x[wptr0]], #0x10\n"
-        "fmla vV11A.4s, vU21A.4s, vW21A.4s\n"
-        "ldr qU23B, [uptr1, u_col_stride2]\n"
-        "fmla vV21A.4s, vU21A.4s, vW11A.4s\n"
-        "ldr qW21B, [wptr1], #0x10\n"
-        "fmla vV11A.4s, vU31A.4s, vW31A.4s\n"
-        "str qV11A, [%x[vptr0]], #0x10\n"
-        "fmla vV21A.4s, vU31A.4s, vW21A.4s\n"
-        "ldr qW31B, [wptr2], #0x10\n"
-        "fmla vV31A.4s, vU31A.4s, vW11A.4s\n"
-        "ldr qU34B, [uptr2, u_col_stride3]\n"
-        "fmla vV21A.4s, vU41A.4s, vW31A.4s\n"
-        "str qV21A, [vptr1], #0x10\n"
-        "fmla vV31A.4s, vU41A.4s, vW21A.4s\n"
-        "ldr qU35B, [uptr2, u_col_stride4]\n"
-        "fmla vV31A.4s, vU51A.4s, vW31A.4s\n"
-        "str qV31A, [vptr2], #0x10\n"
-
-        // B Part
-        "fmul vV13B.4s, vU15B.4s, vW13B.4s\n"
-        "ldr qU33B, [uptr2, u_col_stride2]\n"
-        "fmul vV12B.4s, vU14B.4s, vW13B.4s\n"
-        "fmla vV13B.4s, vU14B.4s, vW12B.4s\n"
-        "ldr qU45B, [uptr3, u_col_stride4]\n"
-        "fmul vV11B.4s, vU13B.4s, vW13B.4s\n"
-        "fmla vV12B.4s, vU13B.4s, vW12B.4s\n"
-        "fmla vV13B.4s, vU13B.4s, vW11B.4s\n"
-        "ldr qU44B, [uptr3, u_col_stride3]\n"
-        "fmla vV13B.4s, vU25B.4s, vW23B.4s\n"
-        "fmul vV23B.4s, vU25B.4s, vW13B.4s\n"
-        "ldr qU43B, [uptr3, u_col_stride2]\n"
-        "fmla vV12B.4s, vU24B.4s, vW23B.4s\n"
-        "fmla vV13B.4s, vU24B.4s, vW22B.4s\n"
-        "fmul vV22B.4s, vU24B.4s, vW13B.4s\n"
-        "fmla vV23B.4s, vU24B.4s, vW12B.4s\n"
-        "ldr qU55B, [uptr4, u_col_stride4]\n"
-        "fmla vV11B.4s, vU23B.4s, vW23B.4s\n"
-        "fmla vV12B.4s, vU23B.4s, vW22B.4s\n"
-        "fmla vV13B.4s, vU23B.4s, vW21B.4s\n"
-        "fmul vV21B.4s, vU23B.4s, vW13B.4s\n"
-        "fmla vV22B.4s, vU23B.4s, vW12B.4s\n"
-        "fmla vV23B.4s, vU23B.4s, vW11B.4s\n"
-        "ldr qU54B, [uptr4, u_col_stride3]\n"
-        "fmla vV13B.4s, vU35B.4s, vW33B.4s\n"
-        "fmla vV23B.4s, vU35B.4s, vW23B.4s\n"
-        "fmul vV33B.4s, vU35B.4s, vW13B.4s\n"
-        "ldr qU53B, [uptr4, u_col_stride2]\n"
-        "fmla vV12B.4s, vU34B.4s, vW33B.4s\n"
-        "fmla vV13B.4s, vU34B.4s, vW32B.4s\n"
-        "fmla vV22B.4s, vU34B.4s, vW23B.4s\n"
-        "fmla vV23B.4s, vU34B.4s, vW22B.4s\n"
-        "fmul vV32B.4s, vU34B.4s, vW13B.4s\n"
-        "fmla vV33B.4s, vU34B.4s, vW12B.4s\n"
-        "ldr qU12B, [%x[uptr0], u_col_stride1]\n"
-        "fmla vV11B.4s, vU33B.4s, vW33B.4s\n"
-        "fmla vV12B.4s, vU33B.4s, vW32B.4s\n"
-        "fmla vV13B.4s, vU33B.4s, vW31B.4s\n"
-        "str qV13B, [%x[vptr0], v_col_stride2]\n"
-        "fmla vV21B.4s, vU33B.4s, vW23B.4s\n"
-        "fmla vV22B.4s, vU33B.4s, vW22B.4s\n"
-        "fmla vV23B.4s, vU33B.4s, vW21B.4s\n"
-        "fmul vV31B.4s, vU33B.4s, vW13B.4s\n"
-        "fmla vV32B.4s, vU33B.4s, vW12B.4s\n"
-        "fmla vV33B.4s, vU33B.4s, vW11B.4s\n"
-        "ldr qU22B, [uptr1, u_col_stride1]\n"
-        "fmla vV23B.4s, vU45B.4s, vW33B.4s\n"
-        "fmla vV33B.4s, vU45B.4s, vW23B.4s\n"
-        "ldr qU32B, [uptr2, u_col_stride1]\n"
-        "fmla vV22B.4s, vU44B.4s, vW33B.4s\n"
-        "fmla vV23B.4s, vU44B.4s, vW32B.4s\n"
-        "fmla vV32B.4s, vU44B.4s, vW23B.4s\n"
-        "fmla vV33B.4s, vU44B.4s, vW22B.4s\n"
-        "ldr qU42B, [uptr3, u_col_stride1]\n"
-        "fmla vV21B.4s, vU43B.4s, vW33B.4s\n"
-        "fmla vV22B.4s, vU43B.4s, vW32B.4s\n"
-        "fmla vV23B.4s, vU43B.4s, vW31B.4s\n"
-        "str qV23B, [vptr1, v_col_stride2]\n"
-        "fmla vV31B.4s, vU43B.4s, vW23B.4s\n"
-        "fmla vV32B.4s, vU43B.4s, vW22B.4s\n"
-        "fmla vV33B.4s, vU43B.4s, vW21B.4s\n"
-        "ldr qU52B, [uptr4, u_col_stride1]\n"
-        "fmla vV33B.4s, vU55B.4s, vW33B.4s\n"
-        "ldr qU11B, [%x[uptr0]], #0x10\n"
-        "fmla vV32B.4s, vU54B.4s, vW33B.4s\n"
-        "fmla vV33B.4s, vU54B.4s, vW32B.4s\n"
-        "ldr qU21B, [uptr1], #0x10\n"
-        "fmla vV31B.4s, vU53B.4s, vW33B.4s\n"
-        "fmla vV32B.4s, vU53B.4s, vW32B.4s\n"
-        "fmla vV33B.4s, vU53B.4s, vW31B.4s\n"
-        "str qV33B, [vptr2, v_col_stride2]\n"
-        "fmla vV11B.4s, vU12B.4s, vW12B.4s\n"
-        "ldr qU31B, [uptr2], #0x10\n"
-        "fmla vV12B.4s, vU12B.4s, vW11B.4s\n"
-        "ldr qU41B, [uptr3], #0x10\n"
-        "fmla vV11B.4s, vU22B.4s, vW22B.4s\n"
-        "ldr qU51B, [uptr4], #0x10\n"
-        "fmla vV12B.4s, vU22B.4s, vW21B.4s\n"
-        "fmla vV21B.4s, vU22B.4s, vW12B.4s\n"
-        "fmla vV22B.4s, vU22B.4s, vW11B.4s\n"
-        "fmla vV11B.4s, vU32B.4s, vW32B.4s\n"
-        "fmla vV12B.4s, vU32B.4s, vW31B.4s\n"
-        "str qV12B, [%x[vptr0], v_col_stride1]\n"
-        "fmla vV21B.4s, vU32B.4s, vW22B.4s\n"
-        "fmla vV22B.4s, vU32B.4s, vW21B.4s\n"
-        "fmla vV31B.4s, vU32B.4s, vW12B.4s\n"
-        "fmla vV32B.4s, vU32B.4s, vW11B.4s\n"
-        "fmla vV21B.4s, vU42B.4s, vW32B.4s\n"
-        "fmla vV22B.4s, vU42B.4s, vW31B.4s\n"
-        "str qV22B, [vptr1, v_col_stride1]\n"
-        "fmla vV31B.4s, vU42B.4s, vW22B.4s\n"
-        "fmla vV32B.4s, vU42B.4s, vW21B.4s\n"
-        "fmla vV31B.4s, vU52B.4s, vW32B.4s\n"
-        "subs %x[n_iters], %x[n_iters], #1\n"
-        "fmla vV32B.4s, vU52B.4s, vW31B.4s\n"
-        "str qV32B, [vptr2, v_col_stride1]\n"
-        "fmla vV11B.4s, vU11B.4s, vW11B.4s\n"
-        "fmla vV11B.4s, vU21B.4s, vW21B.4s\n"
-        "fmla vV21B.4s, vU21B.4s, vW11B.4s\n"
-        "fmla vV11B.4s, vU31B.4s, vW31B.4s\n"
-        "str qV11B, [%x[vptr0]], #0x10\n"
-        "fmla vV21B.4s, vU31B.4s, vW21B.4s\n"
-        "fmla vV31B.4s, vU31B.4s, vW11B.4s\n"
-        "fmla vV21B.4s, vU41B.4s, vW31B.4s\n"
-        "str qV21B, [vptr1], #0x10\n"
-        "fmla vV31B.4s, vU41B.4s, vW21B.4s\n"
-        "fmla vV31B.4s, vU51B.4s, vW31B.4s\n"
-        "str qV31B, [vptr2], #0x10\n"
-
-        "b 4f\n"  // Branch to end of method
-
-        "3:"  // Odd tail, finish off A
-        "fmla vV13A.4s, vU14A.4s, vW12A.4s\n"
-        "ldr qU45A, [uptr3, u_col_stride4]\n"
-        "fmul vV11A.4s, vU13A.4s, vW13A.4s\n"
-        "fmla vV12A.4s, vU13A.4s, vW12A.4s\n"
-        "fmla vV13A.4s, vU13A.4s, vW11A.4s\n"
-        "ldr qU44A, [uptr3, u_col_stride3]\n"
-        "fmla vV13A.4s, vU25A.4s, vW23A.4s\n"
-        "fmul vV23A.4s, vU25A.4s, vW13A.4s\n"
-        "ldr qU43A, [uptr3, u_col_stride2]\n"
-        "fmla vV12A.4s, vU24A.4s, vW23A.4s\n"
-        "fmla vV13A.4s, vU24A.4s, vW22A.4s\n"
-        "fmul vV22A.4s, vU24A.4s, vW13A.4s\n"
-        "fmla vV23A.4s, vU24A.4s, vW12A.4s\n"
-        "ldr qU55A, [uptr4, u_col_stride4]\n"
-        "fmla vV11A.4s, vU23A.4s, vW23A.4s\n"
-        "fmla vV12A.4s, vU23A.4s, vW22A.4s\n"
-        "fmla vV13A.4s, vU23A.4s, vW21A.4s\n"
-        "fmul vV21A.4s, vU23A.4s, vW13A.4s\n"
-        "fmla vV22A.4s, vU23A.4s, vW12A.4s\n"
-        "fmla vV23A.4s, vU23A.4s, vW11A.4s\n"
-        "ldr qU54A, [uptr4, u_col_stride3]\n"
-        "fmla vV13A.4s, vU35A.4s, vW33A.4s\n"
-        "fmla vV23A.4s, vU35A.4s, vW23A.4s\n"
-        "fmul vV33A.4s, vU35A.4s, vW13A.4s\n"
-        "ldr qU53A, [uptr4, u_col_stride2]\n"
-        "fmla vV12A.4s, vU34A.4s, vW33A.4s\n"
-        "fmla vV13A.4s, vU34A.4s, vW32A.4s\n"
-        "fmla vV22A.4s, vU34A.4s, vW23A.4s\n"
-        "fmla vV23A.4s, vU34A.4s, vW22A.4s\n"
-        "fmul vV32A.4s, vU34A.4s, vW13A.4s\n"
-        "fmla vV33A.4s, vU34A.4s, vW12A.4s\n"
-        "ldr qU12A, [%x[uptr0], u_col_stride1]\n"
-        "fmla vV11A.4s, vU33A.4s, vW33A.4s\n"
-        "fmla vV12A.4s, vU33A.4s, vW32A.4s\n"
-        "fmla vV13A.4s, vU33A.4s, vW31A.4s\n"
-        "str qV13A, [%x[vptr0], v_col_stride2]\n"
-        "fmla vV21A.4s, vU33A.4s, vW23A.4s\n"
-        "fmla vV22A.4s, vU33A.4s, vW22A.4s\n"
-        "fmla vV23A.4s, vU33A.4s, vW21A.4s\n"
-        "fmul vV31A.4s, vU33A.4s, vW13A.4s\n"
-        "fmla vV32A.4s, vU33A.4s, vW12A.4s\n"
-        "fmla vV33A.4s, vU33A.4s, vW11A.4s\n"
-        "ldr qU22A, [uptr1, u_col_stride1]\n"
-        "fmla vV23A.4s, vU45A.4s, vW33A.4s\n"
-        "fmla vV33A.4s, vU45A.4s, vW23A.4s\n"
-        "ldr qU32A, [uptr2, u_col_stride1]\n"
-        "fmla vV22A.4s, vU44A.4s, vW33A.4s\n"
-        "fmla vV23A.4s, vU44A.4s, vW32A.4s\n"
-        "fmla vV32A.4s, vU44A.4s, vW23A.4s\n"
-        "fmla vV33A.4s, vU44A.4s, vW22A.4s\n"
-        "ldr qU42A, [uptr3, u_col_stride1]\n"
-        "fmla vV21A.4s, vU43A.4s, vW33A.4s\n"
-        "fmla vV22A.4s, vU43A.4s, vW32A.4s\n"
-        "fmla vV23A.4s, vU43A.4s, vW31A.4s\n"
-        "str qV23A, [vptr1, v_col_stride2]\n"
-        "fmla vV31A.4s, vU43A.4s, vW23A.4s\n"
-        "fmla vV32A.4s, vU43A.4s, vW22A.4s\n"
-        "fmla vV33A.4s, vU43A.4s, vW21A.4s\n"
-        "ldr qU52A, [uptr4, u_col_stride1]\n"
-        "fmla vV33A.4s, vU55A.4s, vW33A.4s\n"
-        "ldr qU11A, [%x[uptr0]], #0x10\n"
-        "fmla vV32A.4s, vU54A.4s, vW33A.4s\n"
-        "fmla vV33A.4s, vU54A.4s, vW32A.4s\n"
-        "ldr qU21A, [uptr1], #0x10\n"
-        "fmla vV31A.4s, vU53A.4s, vW33A.4s\n"
-        "fmla vV32A.4s, vU53A.4s, vW32A.4s\n"
-        "fmla vV33A.4s, vU53A.4s, vW31A.4s\n"
-        "str qV33A, [vptr2, v_col_stride2]\n"
-        "fmla vV11A.4s, vU12A.4s, vW12A.4s\n"
-        "ldr qU31A, [uptr2], #0x10\n"
-        "fmla vV12A.4s, vU12A.4s, vW11A.4s\n"
-        "ldr qU41A, [uptr3], #0x10\n"
-        "fmla vV11A.4s, vU22A.4s, vW22A.4s\n"
-        "ldr qU51A, [uptr4], #0x10\n"
-        "fmla vV12A.4s, vU22A.4s, vW21A.4s\n"
-        "fmla vV21A.4s, vU22A.4s, vW12A.4s\n"
-        "fmla vV22A.4s, vU22A.4s, vW11A.4s\n"
-        "fmla vV11A.4s, vU32A.4s, vW32A.4s\n"
-        "fmla vV12A.4s, vU32A.4s, vW31A.4s\n"
-        "str qV12A, [%x[vptr0], v_col_stride1]\n"
-        "fmla vV21A.4s, vU32A.4s, vW22A.4s\n"
-        "fmla vV22A.4s, vU32A.4s, vW21A.4s\n"
-        "fmla vV31A.4s, vU32A.4s, vW12A.4s\n"
-        "fmla vV32A.4s, vU32A.4s, vW11A.4s\n"
-        "fmla vV21A.4s, vU42A.4s, vW32A.4s\n"
-        "fmla vV22A.4s, vU42A.4s, vW31A.4s\n"
-        "str qV22A, [vptr1, v_col_stride1]\n"
-        "fmla vV31A.4s, vU42A.4s, vW22A.4s\n"
-        "fmla vV32A.4s, vU42A.4s, vW21A.4s\n"
-        "fmla vV31A.4s, vU52A.4s, vW32A.4s\n"
-        "fmla vV32A.4s, vU52A.4s, vW31A.4s\n"
-        "str qV32A, [vptr2, v_col_stride1]\n"
-        "fmla vV11A.4s, vU11A.4s, vW11A.4s\n"
-        "fmla vV11A.4s, vU21A.4s, vW21A.4s\n"
-        "fmla vV21A.4s, vU21A.4s, vW11A.4s\n"
-        "fmla vV11A.4s, vU31A.4s, vW31A.4s\n"
-        "str qV11A, [%x[vptr0]], #0x10\n"
-        "fmla vV21A.4s, vU31A.4s, vW21A.4s\n"
-        "fmla vV31A.4s, vU31A.4s, vW11A.4s\n"
-        "fmla vV21A.4s, vU41A.4s, vW31A.4s\n"
-        "str qV21A, [vptr1], #0x10\n"
-        "fmla vV31A.4s, vU41A.4s, vW21A.4s\n"
-        "fmla vV31A.4s, vU51A.4s, vW31A.4s\n"
-        "str qV31A, [vptr2], #0x10\n"
-
-        "4:"  // End of method
-        ".unreq uptr1\n" ".unreq uptr2\n" ".unreq uptr3\n" ".unreq uptr4\n"
-        ".unreq u_col_stride1\n" ".unreq u_col_stride2\n"
-        ".unreq u_col_stride3\n" ".unreq u_col_stride4\n"
-        ".unreq wptr1\n" ".unreq wptr2\n"
-        ".unreq w_col_stride1\n" ".unreq w_col_stride2\n"
-        ".unreq vptr1\n" ".unreq vptr2\n"
-        ".unreq v_col_stride1\n" ".unreq v_col_stride2\n"
-
-        ".unreq qU22B\n" ".unreq qW13B\n" ".unreq qW13A\n" ".unreq qU51B\n"
-        ".unreq qU54B\n" ".unreq qU45A\n" ".unreq qU15A\n" ".unreq qU41B\n"
-        ".unreq qU24B\n" ".unreq qU21A\n"
-        ".unreq qV11B\n" ".unreq qU51A\n" ".unreq qU35A\n" ".unreq qU12A\n"
-        ".unreq qU42B\n" ".unreq qU44B\n" ".unreq qU13B\n" ".unreq qW33A\n"
-        ".unreq qV31B\n" ".unreq qV23A\n" ".unreq qU31A\n" ".unreq qU35B\n" ".unreq qU13A\n"
-        ".unreq qV23B\n" ".unreq qU11A\n" ".unreq qU25A\n" ".unreq qU43A\n" ".unreq qU52B\n"
-        ".unreq qU24A\n" ".unreq qU23B\n" ".unreq qV21A\n" ".unreq qV32B\n"
-        ".unreq qV33B\n" ".unreq qW11A\n" ".unreq qU31B\n"
-        ".unreq qW12B\n" ".unreq qU33A\n" ".unreq qU14A\n" ".unreq qU22A\n"
-        ".unreq qU25B\n" ".unreq qU53B\n" ".unreq qU42A\n" ".unreq qU44A\n"
-        ".unreq qU43B\n" ".unreq qW31A\n" ".unreq qU11B\n"
-        ".unreq qW11B\n" ".unreq qW32A\n"
-        ".unreq qU12B\n" ".unreq qU34B\n" ".unreq qW21A\n"
-        ".unreq qU14B\n" ".unreq qV21B\n" ".unreq qW22A\n"
-        ".unreq qW23B\n" ".unreq qW23A\n" ".unreq qU21B\n"
-        ".unreq qU32B\n" ".unreq qU34A\n" ".unreq qU45B\n" ".unreq qV31A\n"
-        ".unreq qW12A\n" ".unreq qU33B\n" ".unreq qU15B\n"
-        ".unreq qW33B\n" ".unreq qU54A\n" ".unreq qU23A\n"
-        ".unreq qW32B\n" ".unreq qV33A\n" ".unreq qW31B\n" ".unreq qV12A\n"
-        ".unreq qV12B\n" ".unreq qU41A\n" ".unreq qU53A\n"
-        ".unreq qV13A\n" ".unreq qU32A\n" ".unreq qW22B\n"
-        ".unreq qV22B\n" ".unreq qU52A\n" ".unreq qV13B\n" ".unreq qV32A\n"
-        ".unreq qU55A\n" ".unreq qU55B\n" ".unreq qV22A\n" ".unreq qW21B\n"
-        ".unreq qV11A\n"
-        ".unreq vU22B\n" ".unreq vW13B\n" ".unreq vW13A\n" ".unreq vU51B\n"
-        ".unreq vU54B\n" ".unreq vU45A\n" ".unreq vU15A\n" ".unreq vU41B\n"
-        ".unreq vU24B\n" ".unreq vU21A\n"
-        ".unreq vV11B\n" ".unreq vU51A\n" ".unreq vU35A\n" ".unreq vU12A\n"
-        ".unreq vU42B\n" ".unreq vU44B\n" ".unreq vU13B\n" ".unreq vW33A\n"
-        ".unreq vV31B\n" ".unreq vV23A\n" ".unreq vU31A\n" ".unreq vU35B\n" ".unreq vU13A\n"
-        ".unreq vV23B\n" ".unreq vU11A\n" ".unreq vU25A\n" ".unreq vU43A\n" ".unreq vU52B\n"
-        ".unreq vU24A\n" ".unreq vU23B\n" ".unreq vV21A\n" ".unreq vV32B\n"
-        ".unreq vV33B\n" ".unreq vW11A\n" ".unreq vU31B\n"
-        ".unreq vW12B\n" ".unreq vU33A\n" ".unreq vU14A\n" ".unreq vU22A\n"
-        ".unreq vU25B\n" ".unreq vU53B\n" ".unreq vU42A\n" ".unreq vU44A\n"
-        ".unreq vU43B\n" ".unreq vW31A\n" ".unreq vU11B\n"
-        ".unreq vW11B\n" ".unreq vW32A\n"
-        ".unreq vU12B\n" ".unreq vU34B\n" ".unreq vW21A\n"
-        ".unreq vU14B\n" ".unreq vV21B\n" ".unreq vW22A\n"
-        ".unreq vW23B\n" ".unreq vW23A\n" ".unreq vU21B\n"
-        ".unreq vU32B\n" ".unreq vU34A\n" ".unreq vU45B\n" ".unreq vV31A\n"
-        ".unreq vW12A\n" ".unreq vU33B\n" ".unreq vU15B\n"
-        ".unreq vW33B\n" ".unreq vU54A\n" ".unreq vU23A\n"
-        ".unreq vW32B\n" ".unreq vV33A\n" ".unreq vW31B\n" ".unreq vV12A\n"
-        ".unreq vV12B\n" ".unreq vU41A\n" ".unreq vU53A\n"
-        ".unreq vV13A\n" ".unreq vU32A\n" ".unreq vW22B\n"
-        ".unreq vV22B\n" ".unreq vU52A\n" ".unreq vV13B\n" ".unreq vV32A\n"
-        ".unreq vU55A\n" ".unreq vU55B\n" ".unreq vV22A\n" ".unreq vW21B\n"
-        ".unreq vV11A\n"
-        : [uptr0] "+r" (uptr0), [wptr0] "+r" (wptr0), [vptr0] "+r" (vptr0),
-          [n_iters] "+r" (n_iters)
-        : [u_row_stride] "r" (in_row_stride * sizeof(float)),
-          [u_col_stride] "r" (in_col_stride * sizeof(float)),
-          [w_row_stride] "r" (weight_row_stride * sizeof(float)),
-          [w_col_stride] "r" (weight_col_stride * sizeof(float)),
-          [v_row_stride] "r" (out_row_stride * sizeof(float)),
-          [v_col_stride] "r" (out_col_stride * sizeof(float)),
-          [odd_tail] "r" (odd_tail)
-        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
-          "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
-          "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0",
-          "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11",
-          "x12", "cc", "memory"
-    );
-  }
-  if (channels_remaining)
-  {
-    // Fall back on the unoptimised version to clean up the tail
-    ConvImpl::process_tile<false>(
-        channels_remaining,
-        wptr0, weight_row_stride, weight_col_stride,
-        uptr0, in_row_stride, in_col_stride,
-        vptr0, out_row_stride, out_col_stride,
-        0, 0, 0, 0, 0, 0
-    );
-  }
+template <>
+template <>
+void Conv::execute_tile<ActivationFunction::ReLU6>(
+  int n_channels,
+  const void *weight_bias_ptr,
+  const float *input,
+  const unsigned int input_row_stride,
+  const unsigned int input_col_stride,
+  float *output,
+  const unsigned int output_row_stride,
+  const unsigned int output_col_stride
+)
+{
+  __asm __volatile(
+    "add x17, %[inptr0], %[input_row_stride]\n"
+    "add x18, %[input_col_stride1], %[input_col_stride1]\n"
+    "add x25, %[outptr0], %[output_row_stride]\n"
+    "add x14, x17, %[input_row_stride]\n"
+    "add x22, x18, #64\n"
+    "add x15, x18, %[input_col_stride1]\n"
+    "add x21, x14, %[input_row_stride]\n"
+    "add x16, x15, #64\n"
+    "add x24, x15, %[input_col_stride1]\n"
+    "add x26, x21, %[input_row_stride]\n"
+    "add x23, x24, #64\n"
+    "add x13, x25, %[output_row_stride]\n"
+    "add x27, %[output_col_stride1], %[output_col_stride1]\n"
+    "and x19, %[n_channels], #3\n"
+    "lsr x20, %[n_channels], #2\n"
+    "cbz x20, 4f\n"
+    "1:\n"
+    "ldr q19, [%[wbptr]]\n"
+    "subs x20, x20, #1\n"
+    "mov v8.16b, v19.16b\n"
+    "ldr q17, [%[wbptr], #16]\n"
+    "mov v5.16b, v19.16b\n"
+    "ldr q16, [%[wbptr], #32]\n"
+    "mov v7.16b, v19.16b\n"
+    "ldr q15, [%[wbptr], #48]\n"
+    "mov v2.16b, v19.16b\n"
+    "ldr q14, [%[wbptr], #64]\n"
+    "mov v4.16b, v19.16b\n"
+    "ldr q13, [%[wbptr], #80]\n"
+    "mov v6.16b, v19.16b\n"
+    "ldr q12, [%[wbptr], #96]\n"
+    "mov v1.16b, v19.16b\n"
+    "ldr q11, [%[wbptr], #112]\n"
+    "mov v3.16b, v19.16b\n"
+    "ldr q10, [%[wbptr], #128]\n"
+    "mov v0.16b, v19.16b\n"
+    "ldr q9, [%[wbptr], #144]\n"
+    "ldr q25, [%[inptr0]]\n"
+    "ldr q27, [x17]\n"
+    "fmla v8.4s, v25.4s, v17.4s\n"
+    "ldr q26, [%[inptr0], %[input_col_stride1]]\n"
+    "ldr q20, [x14]\n"
+    "ldr q22, [x17, %[input_col_stride1]]\n"
+    "ldr q28, [%[inptr0], x18]\n"
+    "ldr q23, [x21]\n"
+    "fmla v8.4s, v27.4s, v14.4s\n"
+    "ldr q18, [x14, %[input_col_stride1]]\n"
+    "prfm pldl1keep, [%[inptr0], #64]\n"
+    "prfm pldl1keep, [x17, #64]\n"
+    "prfm pldl1keep, [%[inptr0], x28]\n"
+    "prfm pldl1keep, [x14, #64]\n"
+    "prfm pldl1keep, [x17, x28]\n"
+    "prfm pldl1keep, [%[inptr0], x22]\n"
+    "prfm pldl1keep, [x21, #64]\n"
+    "prfm pldl1keep, [x14, x28]\n"
+    "beq 3f\n"
+    "2:\n"
+    "fmla v5.4s, v27.4s, v17.4s\n"
+    "ldr q27, [x17, x18]\n"
+    "fmla v8.4s, v26.4s, v16.4s\n"
+    "ldr q30, [%[inptr0], x15]\n"
+    "fmla v7.4s, v26.4s, v17.4s\n"
+    "ldr q31, [x26]\n"
+    "fmla v5.4s, v20.4s, v14.4s\n"
+    "ldr q24, [x21, %[input_col_stride1]]\n"
+    "fmla v8.4s, v20.4s, v11.4s\n"
+    "prfm pldl1keep, [x17, x22]\n"
+    "fmla v2.4s, v20.4s, v17.4s\n"
+    "ldr q29, [x14, x18]\n"
+    "fmla v5.4s, v22.4s, v16.4s\n"
+    "prfm pldl1keep, [%[inptr0], x16]\n"
+    "fmla v8.4s, v22.4s, v13.4s\n"
+    "prfm pldl1keep, [x26, #64]\n"
+    "fmla v7.4s, v22.4s, v14.4s\n"
+    "prfm pldl1keep, [x21, x28]\n"
+    "fmla v4.4s, v22.4s, v17.4s\n"
+    "ldr q21, [x17, x15]\n"
+    "fmla v8.4s, v28.4s, v15.4s\n"
+    "prfm pldl1keep, [x14, x22]\n"
+    "fmla v7.4s, v28.4s, v16.4s\n"
+    "prfm pldl1keep, [x17, x16]\n"
+    "fmla v6.4s, v28.4s, v17.4s\n"
+    "ldr q19, [%[inptr0], x24]\n"
+    "fmla v5.4s, v23.4s, v11.4s\n"
+    "prfm pldl1keep, [%[inptr0], x23]\n"
+    "fmla v2.4s, v23.4s, v14.4s\n"
+    "ldr q28, [x26, %[input_col_stride1]]\n"
+    "fmla v8.4s, v18.4s, v10.4s\n"
+    "prfm pldl1keep, [x26, x28]\n"
+    "fmla v5.4s, v18.4s, v13.4s\n"
+    "prfm pldl1keep, [x21, x22]\n"
+    "fmla v7.4s, v18.4s, v11.4s\n"
+    "prfm pldl1keep, [x14, x16]\n"
+    "fmla v2.4s, v18.4s, v16.4s\n"
+    "prfm pldl1keep, [x17, x23]\n"
+    "fmla v4.4s, v18.4s, v14.4s\n"
+    "prfm pldl1keep, [x26, x22]\n"
+    "fmla v1.4s, v18.4s, v17.4s\n"
+    "ldr q25, [x21, x18]\n"
+    "fmla v8.4s, v27.4s, v12.4s\n"
+    "prfm pldl1keep, [x21, x16]\n"
+    "fmla v5.4s, v27.4s, v15.4s\n"
+    "prfm pldl1keep, [x14, x23]\n"
+    "fmla v7.4s, v27.4s, v13.4s\n"
+    "prfm pldl1keep, [x26, x16]\n"
+    "fmla v4.4s, v27.4s, v16.4s\n"
+    "prfm pldl1keep, [x21, x23]\n"
+    "fmla v6.4s, v27.4s, v14.4s\n"
+    "prfm pldl1keep, [x26, x23]\n"
+    "fmla v3.4s, v27.4s, v17.4s\n"
+    "ldr q27, [x14, x15]\n"
+    "fmla v7.4s, v30.4s, v15.4s\n"
+    "add %[wbptr], %[wbptr], #160\n"
+    "fmla v6.4s, v30.4s, v16.4s\n"
+    "ldr q26, [x17, x24]\n"
+    "fmla v2.4s, v31.4s, v11.4s\n"
+    "ldr q20, [x26, x18]\n"
+    "fmla v5.4s, v24.4s, v10.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v4.4s, v24.4s, v11.4s\n"
+    "add %[inptr0], %[inptr0], #16\n"
+    "fmla v2.4s, v24.4s, v13.4s\n"
+    "prfm pldl1keep, [%[inptr0], #64]\n"
+    "fmla v1.4s, v24.4s, v14.4s\n"
+    "ldr q18, [x21, x15]\n"
+    "fmla v8.4s, v29.4s, v9.4s\n"
+    "prfm pldl1keep, [%[inptr0], x28]\n"
+    "fmla v5.4s, v29.4s, v12.4s\n"
+    "prfm pldl1keep, [%[inptr0], x22]\n"
+    "fmla v7.4s, v29.4s, v10.4s\n"
+    "add x17, x17, #16\n"
+    "fmla v2.4s, v29.4s, v15.4s\n"
+    "prfm pldl1keep, [x17, #64]\n"
+    "fmla v4.4s, v29.4s, v13.4s\n"
+    "prfm pldl1keep, [x17, x28]\n"
+    "fmla v6.4s, v29.4s, v11.4s\n"
+    "subs x20, x20, #1\n"
+    "fmla v1.4s, v29.4s, v16.4s\n"
+    "fmla v3.4s, v29.4s, v14.4s\n"
+    "fmla v0.4s, v29.4s, v17.4s\n"
+    "ldr q22, [x14, x24]\n"
+    "fmla v7.4s, v21.4s, v12.4s\n"
+    "ldr q23, [x26, x15]\n"
+    "fmla v4.4s, v21.4s, v15.4s\n"
+    "add x14, x14, #16\n"
+    "fmla v6.4s, v21.4s, v13.4s\n"
+    "prfm pldl1keep, [x14, #64]\n"
+    "fmla v3.4s, v21.4s, v16.4s\n"
+    "ldr q24, [x21, x24]\n"
+    "fmla v2.4s, v28.4s, v10.4s\n"
+    "prfm pldl1keep, [x14, x28]\n"
+    "fmla v6.4s, v19.4s, v15.4s\n"
+    "ldr q21, [x26, x24]\n"
+    "fmla v1.4s, v28.4s, v11.4s\n"
+    "ldr q19, [%[wbptr]]\n"
+    "fmla v5.4s, v25.4s, v9.4s\n"
+    "add x21, x21, #16\n"
+    "fmla v2.4s, v25.4s, v12.4s\n"
+    "prfm pldl1keep, [x21, #64]\n"
+    "fmla v4.4s, v25.4s, v10.4s\n"
+    "add x26, x26, #16\n"
+    "fmla v1.4s, v25.4s, v13.4s\n"
+    "fmla v3.4s, v25.4s, v11.4s\n"
+    "fmla v0.4s, v25.4s, v14.4s\n"
+    "ldr q17, [%[wbptr], #16]\n"
+    "fmla v7.4s, v27.4s, v9.4s\n"
+    "ldr q25, [%[inptr0]]\n"
+    "fmla v4.4s, v27.4s, v12.4s\n"
+    "fmla v6.4s, v27.4s, v10.4s\n"
+    "fmla v1.4s, v27.4s, v15.4s\n"
+    "fmla v3.4s, v27.4s, v13.4s\n"
+    "fmla v0.4s, v27.4s, v16.4s\n"
+    "ldr q14, [%[wbptr], #64]\n"
+    "fmla v6.4s, v26.4s, v12.4s\n"
+    "ldr q27, [x17]\n"
+    "fmla v3.4s, v26.4s, v15.4s\n"
+    "ldr q26, [%[inptr0], %[input_col_stride1]]\n"
+    "fmla v2.4s, v20.4s, v9.4s\n"
+    "fmla v1.4s, v20.4s, v10.4s\n"
+    "fmla v0.4s, v20.4s, v11.4s\n"
+    "ldr q16, [%[wbptr], #32]\n"
+    "fmla v4.4s, v18.4s, v9.4s\n"
+    "ldr q20, [x14]\n"
+    "fmla v1.4s, v18.4s, v12.4s\n"
+    "fmla v3.4s, v18.4s, v10.4s\n"
+    "fmla v0.4s, v18.4s, v13.4s\n"
+    "ldr q11, [%[wbptr], #112]\n"
+    "fmla v6.4s, v22.4s, v9.4s\n"
+    "movi v30.16b, #0\n"
+    "fmla v3.4s, v22.4s, v12.4s\n"
+    "fmla v1.4s, v23.4s, v9.4s\n"
+    "fmla v0.4s, v22.4s, v15.4s\n"
+    "ldr q13, [%[wbptr], #80]\n"
+    "fmov v29.4s, #6.0\n"
+    "fmax v8.4s, v8.4s, v30.4s\n"
+    "fmla v3.4s, v24.4s, v9.4s\n"
+    "fmax v7.4s, v7.4s, v30.4s\n"
+    "fmla v0.4s, v23.4s, v10.4s\n"
+    "ldr q15, [%[wbptr], #48]\n"
+    "fmin v8.4s, v8.4s, v29.4s\n"
+    "ldr q22, [x17, %[input_col_stride1]]\n"
+    "fmin v7.4s, v7.4s, v29.4s\n"
+    "fmax v6.4s, v6.4s, v30.4s\n"
+    "str q8, [%[outptr0]]\n"
+    "fmla v0.4s, v24.4s, v12.4s\n"
+    "str q7, [%[outptr0], %[output_col_stride1]]\n"
+    "fmin v6.4s, v6.4s, v29.4s\n"
+    "fmax v5.4s, v5.4s, v30.4s\n"
+    "ldr q10, [%[wbptr], #128]\n"
+    "str q6, [%[outptr0], x27]\n"
+    "fmla v0.4s, v21.4s, v9.4s\n"
+    "fmin v5.4s, v5.4s, v29.4s\n"
+    "ldr q12, [%[wbptr], #96]\n"
+    "fmax v4.4s, v4.4s, v30.4s\n"
+    "ldr q28, [%[inptr0], x18]\n"
+    "str q5, [x25]\n"
+    "fmax v3.4s, v3.4s, v30.4s\n"
+    "fmin v4.4s, v4.4s, v29.4s\n"
+    "ldr q9, [%[wbptr], #144]\n"
+    "fmin v3.4s, v3.4s, v29.4s\n"
+    "ldr q23, [x21]\n"
+    "str q4, [x25, %[output_col_stride1]]\n"
+    "fmax v2.4s, v2.4s, v30.4s\n"
+    "str q3, [x25, x27]\n"
+    "fmax v1.4s, v1.4s, v30.4s\n"
+    "fmin v2.4s, v2.4s, v29.4s\n"
+    "ldr q18, [x14, %[input_col_stride1]]\n"
+    "fmin v1.4s, v1.4s, v29.4s\n"
+    "add %[outptr0], %[outptr0], #16\n"
+    "str q2, [x13]\n"
+    "fmax v0.4s, v0.4s, v30.4s\n"
+    "str q1, [x13, %[output_col_stride1]]\n"
+    "mov v8.16b, v19.16b\n"
+    "fmin v0.4s, v0.4s, v29.4s\n"
+    "add x25, x25, #16\n"
+    "mov v5.16b, v19.16b\n"
+    "mov v7.16b, v19.16b\n"
+    "str q0, [x13, x27]\n"
+    "mov v2.16b, v19.16b\n"
+    "mov v4.16b, v19.16b\n"
+    "add x13, x13, #16\n"
+    "mov v6.16b, v19.16b\n"
+    "mov v1.16b, v19.16b\n"
+    "mov v3.16b, v19.16b\n"
+    "mov v0.16b, v19.16b\n"
+    "fmla v8.4s, v25.4s, v17.4s\n"
+    "fmla v8.4s, v27.4s, v14.4s\n"
+    "bne 2b\n"
+    "3:\n"
+    "fmla v5.4s, v27.4s, v17.4s\n"
+    "ldr q27, [x17, x18]\n"
+    "fmla v8.4s, v26.4s, v16.4s\n"
+    "ldr q30, [%[inptr0], x15]\n"
+    "fmla v7.4s, v26.4s, v17.4s\n"
+    "ldr q31, [x26]\n"
+    "fmla v5.4s, v20.4s, v14.4s\n"
+    "ldr q24, [x21, %[input_col_stride1]]\n"
+    "fmla v8.4s, v20.4s, v11.4s\n"
+    "prfm pldl1keep, [x17, x22]\n"
+    "fmla v2.4s, v20.4s, v17.4s\n"
+    "ldr q29, [x14, x18]\n"
+    "fmla v5.4s, v22.4s, v16.4s\n"
+    "prfm pldl1keep, [%[inptr0], x16]\n"
+    "fmla v8.4s, v22.4s, v13.4s\n"
+    "prfm pldl1keep, [x26, #64]\n"
+    "fmla v7.4s, v22.4s, v14.4s\n"
+    "prfm pldl1keep, [x21, x28]\n"
+    "fmla v4.4s, v22.4s, v17.4s\n"
+    "ldr q21, [x17, x15]\n"
+    "fmla v8.4s, v28.4s, v15.4s\n"
+    "prfm pldl1keep, [x14, x22]\n"
+    "fmla v7.4s, v28.4s, v16.4s\n"
+    "prfm pldl1keep, [x17, x16]\n"
+    "fmla v6.4s, v28.4s, v17.4s\n"
+    "ldr q19, [%[inptr0], x24]\n"
+    "fmla v5.4s, v23.4s, v11.4s\n"
+    "prfm pldl1keep, [%[inptr0], x23]\n"
+    "fmla v2.4s, v23.4s, v14.4s\n"
+    "ldr q28, [x26, %[input_col_stride1]]\n"
+    "fmla v8.4s, v18.4s, v10.4s\n"
+    "prfm pldl1keep, [x26, x28]\n"
+    "fmla v5.4s, v18.4s, v13.4s\n"
+    "prfm pldl1keep, [x21, x22]\n"
+    "fmla v7.4s, v18.4s, v11.4s\n"
+    "prfm pldl1keep, [x14, x16]\n"
+    "fmla v2.4s, v18.4s, v16.4s\n"
+    "prfm pldl1keep, [x17, x23]\n"
+    "fmla v4.4s, v18.4s, v14.4s\n"
+    "prfm pldl1keep, [x26, x22]\n"
+    "fmla v1.4s, v18.4s, v17.4s\n"
+    "ldr q25, [x21, x18]\n"
+    "fmla v8.4s, v27.4s, v12.4s\n"
+    "prfm pldl1keep, [x21, x16]\n"
+    "fmla v5.4s, v27.4s, v15.4s\n"
+    "prfm pldl1keep, [x14, x23]\n"
+    "fmla v7.4s, v27.4s, v13.4s\n"
+    "prfm pldl1keep, [x26, x16]\n"
+    "fmla v4.4s, v27.4s, v16.4s\n"
+    "prfm pldl1keep, [x21, x23]\n"
+    "fmla v6.4s, v27.4s, v14.4s\n"
+    "prfm pldl1keep, [x26, x23]\n"
+    "fmla v3.4s, v27.4s, v17.4s\n"
+    "ldr q27, [x14, x15]\n"
+    "fmla v7.4s, v30.4s, v15.4s\n"
+    "add %[wbptr], %[wbptr], #160\n"
+    "fmla v6.4s, v30.4s, v16.4s\n"
+    "ldr q26, [x17, x24]\n"
+    "fmla v2.4s, v31.4s, v11.4s\n"
+    "ldr q20, [x26, x18]\n"
+    "fmla v5.4s, v24.4s, v10.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v4.4s, v24.4s, v11.4s\n"
+    "add %[inptr0], %[inptr0], #16\n"
+    "fmla v2.4s, v24.4s, v13.4s\n"
+    "add x17, x17, #16\n"
+    "fmla v1.4s, v24.4s, v14.4s\n"
+    "ldr q18, [x21, x15]\n"
+    "fmla v8.4s, v29.4s, v9.4s\n"
+    "fmla v5.4s, v29.4s, v12.4s\n"
+    "fmla v7.4s, v29.4s, v10.4s\n"
+    "fmla v2.4s, v29.4s, v15.4s\n"
+    "fmla v4.4s, v29.4s, v13.4s\n"
+    "fmla v6.4s, v29.4s, v11.4s\n"
+    "fmla v1.4s, v29.4s, v16.4s\n"
+    "fmla v3.4s, v29.4s, v14.4s\n"
+    "fmla v0.4s, v29.4s, v17.4s\n"
+    "ldr q22, [x14, x24]\n"
+    "fmla v7.4s, v21.4s, v12.4s\n"
+    "ldr q23, [x26, x15]\n"
+    "fmla v4.4s, v21.4s, v15.4s\n"
+    "add x14, x14, #16\n"
+    "fmla v6.4s, v21.4s, v13.4s\n"
+    "fmla v3.4s, v21.4s, v16.4s\n"
+    "fmla v2.4s, v28.4s, v10.4s\n"
+    "ldr q24, [x21, x24]\n"
+    "fmla v1.4s, v28.4s, v11.4s\n"
+    "ldr q21, [x26, x24]\n"
+    "fmla v6.4s, v19.4s, v15.4s\n"
+    "add x21, x21, #16\n"
+    "fmla v5.4s, v25.4s, v9.4s\n"
+    "add x26, x26, #16\n"
+    "fmla v2.4s, v25.4s, v12.4s\n"
+    "fmla v4.4s, v25.4s, v10.4s\n"
+    "fmla v1.4s, v25.4s, v13.4s\n"
+    "fmla v3.4s, v25.4s, v11.4s\n"
+    "fmla v0.4s, v25.4s, v14.4s\n"
+    "fmla v7.4s, v27.4s, v9.4s\n"
+    "fmla v4.4s, v27.4s, v12.4s\n"
+    "fmla v6.4s, v27.4s, v10.4s\n"
+    "fmla v1.4s, v27.4s, v15.4s\n"
+    "fmla v3.4s, v27.4s, v13.4s\n"
+    "fmla v0.4s, v27.4s, v16.4s\n"
+    "fmla v2.4s, v20.4s, v9.4s\n"
+    "fmla v6.4s, v26.4s, v12.4s\n"
+    "fmla v4.4s, v18.4s, v9.4s\n"
+    "fmla v3.4s, v26.4s, v15.4s\n"
+    "fmla v1.4s, v20.4s, v10.4s\n"
+    "fmla v0.4s, v20.4s, v11.4s\n"
+    "movi v30.16b, #0\n"
+    "fmla v6.4s, v22.4s, v9.4s\n"
+    "fmov v29.4s, #6.0\n"
+    "fmla v1.4s, v18.4s, v12.4s\n"
+    "fmla v3.4s, v18.4s, v10.4s\n"
+    "fmla v0.4s, v18.4s, v13.4s\n"
+    "fmax v8.4s, v8.4s, v30.4s\n"
+    "fmax v7.4s, v7.4s, v30.4s\n"
+    "fmax v6.4s, v6.4s, v30.4s\n"
+    "fmla v3.4s, v22.4s, v12.4s\n"
+    "fmla v1.4s, v23.4s, v9.4s\n"
+    "fmla v0.4s, v22.4s, v15.4s\n"
+    "fmin v8.4s, v8.4s, v29.4s\n"
+    "fmin v7.4s, v7.4s, v29.4s\n"
+    "fmin v6.4s, v6.4s, v29.4s\n"
+    "str q8, [%[outptr0]]\n"
+    "fmla v3.4s, v24.4s, v9.4s\n"
+    "str q7, [%[outptr0], %[output_col_stride1]]\n"
+    "fmla v0.4s, v23.4s, v10.4s\n"
+    "str q6, [%[outptr0], x27]\n"
+    "fmax v5.4s, v5.4s, v30.4s\n"
+    "fmax v4.4s, v4.4s, v30.4s\n"
+    "add %[outptr0], %[outptr0], #16\n"
+    "fmla v0.4s, v24.4s, v12.4s\n"
+    "fmin v5.4s, v5.4s, v29.4s\n"
+    "fmin v4.4s, v4.4s, v29.4s\n"
+    "fmax v3.4s, v3.4s, v30.4s\n"
+    "str q5, [x25]\n"
+    "fmax v2.4s, v2.4s, v30.4s\n"
+    "str q4, [x25, %[output_col_stride1]]\n"
+    "fmla v0.4s, v21.4s, v9.4s\n"
+    "fmin v3.4s, v3.4s, v29.4s\n"
+    "fmin v2.4s, v2.4s, v29.4s\n"
+    "fmax v1.4s, v1.4s, v30.4s\n"
+    "str q3, [x25, x27]\n"
+    "str q2, [x13]\n"
+    "fmin v1.4s, v1.4s, v29.4s\n"
+    "fmax v0.4s, v0.4s, v30.4s\n"
+    "add x25, x25, #16\n"
+    "str q1, [x13, %[output_col_stride1]]\n"
+    "fmin v0.4s, v0.4s, v29.4s\n"
+    "str q0, [x13, x27]\n"
+    "add x13, x13, #16\n"
+    "4:\n"
+    "cbz x19, 7f\n"
+    "ldr s19, [%[wbptr]]\n"
+    "mov v8.16b, v19.16b\n"
+    "ldr s17, [%[wbptr], #4]\n"
+    "mov v5.16b, v19.16b\n"
+    "ldr s16, [%[wbptr], #8]\n"
+    "mov v7.16b, v19.16b\n"
+    "ldr s15, [%[wbptr], #12]\n"
+    "mov v2.16b, v19.16b\n"
+    "ldr s14, [%[wbptr], #16]\n"
+    "mov v4.16b, v19.16b\n"
+    "ldr s13, [%[wbptr], #20]\n"
+    "mov v6.16b, v19.16b\n"
+    "ldr s12, [%[wbptr], #24]\n"
+    "mov v1.16b, v19.16b\n"
+    "ldr s11, [%[wbptr], #28]\n"
+    "mov v3.16b, v19.16b\n"
+    "ldr s10, [%[wbptr], #32]\n"
+    "mov v0.16b, v19.16b\n"
+    "ldr s9, [%[wbptr], #36]\n"
+    "ldr s25, [%[inptr0]]\n"
+    "subs x19, x19, #1\n"
+    "fmla v8.4s, v25.4s, v17.4s\n"
+    "ldr s27, [x17]\n"
+    "ldr s26, [%[inptr0], %[input_col_stride1]]\n"
+    "ldr s20, [x14]\n"
+    "ldr s22, [x17, %[input_col_stride1]]\n"
+    "ldr s28, [%[inptr0], x18]\n"
+    "fmla v8.4s, v27.4s, v14.4s\n"
+    "ldr s23, [x21]\n"
+    "ldr s18, [x14, %[input_col_stride1]]\n"
+    "prfm pldl1keep, [%[inptr0], #64]\n"
+    "prfm pldl1keep, [x17, #64]\n"
+    "prfm pldl1keep, [%[inptr0], x28]\n"
+    "prfm pldl1keep, [x14, #64]\n"
+    "prfm pldl1keep, [x17, x28]\n"
+    "prfm pldl1keep, [%[inptr0], x22]\n"
+    "prfm pldl1keep, [x21, #64]\n"
+    "prfm pldl1keep, [x14, x28]\n"
+    "beq 6f\n"
+    "5:\n"
+    "fmla v5.4s, v27.4s, v17.4s\n"
+    "ldr s27, [x17, x18]\n"
+    "fmla v8.4s, v26.4s, v16.4s\n"
+    "ldr s30, [%[inptr0], x15]\n"
+    "fmla v7.4s, v26.4s, v17.4s\n"
+    "ldr s31, [x26]\n"
+    "fmla v5.4s, v20.4s, v14.4s\n"
+    "ldr s24, [x21, %[input_col_stride1]]\n"
+    "fmla v8.4s, v20.4s, v11.4s\n"
+    "prfm pldl1keep, [x17, x22]\n"
+    "fmla v2.4s, v20.4s, v17.4s\n"
+    "ldr s29, [x14, x18]\n"
+    "fmla v5.4s, v22.4s, v16.4s\n"
+    "prfm pldl1keep, [%[inptr0], x16]\n"
+    "fmla v8.4s, v22.4s, v13.4s\n"
+    "prfm pldl1keep, [x26, #64]\n"
+    "fmla v7.4s, v22.4s, v14.4s\n"
+    "prfm pldl1keep, [x21, x28]\n"
+    "fmla v4.4s, v22.4s, v17.4s\n"
+    "ldr s21, [x17, x15]\n"
+    "fmla v8.4s, v28.4s, v15.4s\n"
+    "prfm pldl1keep, [x14, x22]\n"
+    "fmla v7.4s, v28.4s, v16.4s\n"
+    "prfm pldl1keep, [x17, x16]\n"
+    "fmla v6.4s, v28.4s, v17.4s\n"
+    "ldr s19, [%[inptr0], x24]\n"
+    "fmla v5.4s, v23.4s, v11.4s\n"
+    "prfm pldl1keep, [%[inptr0], x23]\n"
+    "fmla v2.4s, v23.4s, v14.4s\n"
+    "ldr s28, [x26, %[input_col_stride1]]\n"
+    "fmla v8.4s, v18.4s, v10.4s\n"
+    "prfm pldl1keep, [x26, x28]\n"
+    "fmla v5.4s, v18.4s, v13.4s\n"
+    "prfm pldl1keep, [x21, x22]\n"
+    "fmla v7.4s, v18.4s, v11.4s\n"
+    "prfm pldl1keep, [x14, x16]\n"
+    "fmla v2.4s, v18.4s, v16.4s\n"
+    "prfm pldl1keep, [x17, x23]\n"
+    "fmla v4.4s, v18.4s, v14.4s\n"
+    "prfm pldl1keep, [x26, x22]\n"
+    "fmla v1.4s, v18.4s, v17.4s\n"
+    "ldr s25, [x21, x18]\n"
+    "fmla v8.4s, v27.4s, v12.4s\n"
+    "prfm pldl1keep, [x21, x16]\n"
+    "fmla v5.4s, v27.4s, v15.4s\n"
+    "prfm pldl1keep, [x14, x23]\n"
+    "fmla v7.4s, v27.4s, v13.4s\n"
+    "prfm pldl1keep, [x26, x16]\n"
+    "fmla v4.4s, v27.4s, v16.4s\n"
+    "prfm pldl1keep, [x21, x23]\n"
+    "fmla v6.4s, v27.4s, v14.4s\n"
+    "prfm pldl1keep, [x26, x23]\n"
+    "fmla v3.4s, v27.4s, v17.4s\n"
+    "ldr s27, [x14, x15]\n"
+    "fmla v7.4s, v30.4s, v15.4s\n"
+    "add %[wbptr], %[wbptr], #40\n"
+    "fmla v6.4s, v30.4s, v16.4s\n"
+    "ldr s26, [x17, x24]\n"
+    "fmla v2.4s, v31.4s, v11.4s\n"
+    "ldr s20, [x26, x18]\n"
+    "fmla v5.4s, v24.4s, v10.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v4.4s, v24.4s, v11.4s\n"
+    "add %[inptr0], %[inptr0], #4\n"
+    "fmla v2.4s, v24.4s, v13.4s\n"
+    "prfm pldl1keep, [%[inptr0], #64]\n"
+    "fmla v1.4s, v24.4s, v14.4s\n"
+    "ldr s18, [x21, x15]\n"
+    "fmla v8.4s, v29.4s, v9.4s\n"
+    "prfm pldl1keep, [%[inptr0], x28]\n"
+    "fmla v5.4s, v29.4s, v12.4s\n"
+    "prfm pldl1keep, [%[inptr0], x22]\n"
+    "fmla v7.4s, v29.4s, v10.4s\n"
+    "add x17, x17, #4\n"
+    "fmla v2.4s, v29.4s, v15.4s\n"
+    "prfm pldl1keep, [x17, #64]\n"
+    "fmla v4.4s, v29.4s, v13.4s\n"
+    "prfm pldl1keep, [x17, x28]\n"
+    "fmla v6.4s, v29.4s, v11.4s\n"
+    "subs x19, x19, #1\n"
+    "fmla v1.4s, v29.4s, v16.4s\n"
+    "fmla v3.4s, v29.4s, v14.4s\n"
+    "fmla v0.4s, v29.4s, v17.4s\n"
+    "ldr s22, [x14, x24]\n"
+    "fmla v7.4s, v21.4s, v12.4s\n"
+    "ldr s23, [x26, x15]\n"
+    "fmla v4.4s, v21.4s, v15.4s\n"
+    "add x14, x14, #4\n"
+    "fmla v6.4s, v21.4s, v13.4s\n"
+    "prfm pldl1keep, [x14, #64]\n"
+    "fmla v3.4s, v21.4s, v16.4s\n"
+    "ldr s24, [x21, x24]\n"
+    "fmla v2.4s, v28.4s, v10.4s\n"
+    "prfm pldl1keep, [x14, x28]\n"
+    "fmla v6.4s, v19.4s, v15.4s\n"
+    "ldr s21, [x26, x24]\n"
+    "fmla v1.4s, v28.4s, v11.4s\n"
+    "ldr s19, [%[wbptr]]\n"
+    "fmla v5.4s, v25.4s, v9.4s\n"
+    "add x21, x21, #4\n"
+    "fmla v2.4s, v25.4s, v12.4s\n"
+    "prfm pldl1keep, [x21, #64]\n"
+    "fmla v4.4s, v25.4s, v10.4s\n"
+    "add x26, x26, #4\n"
+    "fmla v1.4s, v25.4s, v13.4s\n"
+    "fmla v3.4s, v25.4s, v11.4s\n"
+    "fmla v0.4s, v25.4s, v14.4s\n"
+    "ldr s17, [%[wbptr], #4]\n"
+    "fmla v7.4s, v27.4s, v9.4s\n"
+    "ldr s25, [%[inptr0]]\n"
+    "fmla v4.4s, v27.4s, v12.4s\n"
+    "fmla v6.4s, v27.4s, v10.4s\n"
+    "fmla v1.4s, v27.4s, v15.4s\n"
+    "fmla v3.4s, v27.4s, v13.4s\n"
+    "fmla v0.4s, v27.4s, v16.4s\n"
+    "ldr s14, [%[wbptr], #16]\n"
+    "fmla v6.4s, v26.4s, v12.4s\n"
+    "ldr s27, [x17]\n"
+    "fmla v3.4s, v26.4s, v15.4s\n"
+    "ldr s26, [%[inptr0], %[input_col_stride1]]\n"
+    "fmla v2.4s, v20.4s, v9.4s\n"
+    "fmla v1.4s, v20.4s, v10.4s\n"
+    "fmla v0.4s, v20.4s, v11.4s\n"
+    "ldr s16, [%[wbptr], #8]\n"
+    "fmla v4.4s, v18.4s, v9.4s\n"
+    "ldr s20, [x14]\n"
+    "fmla v1.4s, v18.4s, v12.4s\n"
+    "fmla v3.4s, v18.4s, v10.4s\n"
+    "fmla v0.4s, v18.4s, v13.4s\n"
+    "ldr s11, [%[wbptr], #28]\n"
+    "fmla v6.4s, v22.4s, v9.4s\n"
+    "movi v30.16b, #0\n"
+    "fmla v3.4s, v22.4s, v12.4s\n"
+    "fmla v1.4s, v23.4s, v9.4s\n"
+    "fmla v0.4s, v22.4s, v15.4s\n"
+    "ldr s13, [%[wbptr], #20]\n"
+    "fmov v29.4s, #6.0\n"
+    "fmax v8.4s, v8.4s, v30.4s\n"
+    "fmla v3.4s, v24.4s, v9.4s\n"
+    "fmax v7.4s, v7.4s, v30.4s\n"
+    "fmla v0.4s, v23.4s, v10.4s\n"
+    "ldr s15, [%[wbptr], #12]\n"
+    "fmin v8.4s, v8.4s, v29.4s\n"
+    "ldr s22, [x17, %[input_col_stride1]]\n"
+    "fmin v7.4s, v7.4s, v29.4s\n"
+    "fmax v6.4s, v6.4s, v30.4s\n"
+    "str s8, [%[outptr0]]\n"
+    "fmla v0.4s, v24.4s, v12.4s\n"
+    "str s7, [%[outptr0], %[output_col_stride1]]\n"
+    "fmin v6.4s, v6.4s, v29.4s\n"
+    "fmax v5.4s, v5.4s, v30.4s\n"
+    "ldr s10, [%[wbptr], #32]\n"
+    "str s6, [%[outptr0], x27]\n"
+    "fmla v0.4s, v21.4s, v9.4s\n"
+    "fmin v5.4s, v5.4s, v29.4s\n"
+    "ldr s12, [%[wbptr], #24]\n"
+    "fmax v4.4s, v4.4s, v30.4s\n"
+    "ldr s28, [%[inptr0], x18]\n"
+    "str s5, [x25]\n"
+    "fmax v3.4s, v3.4s, v30.4s\n"
+    "fmin v4.4s, v4.4s, v29.4s\n"
+    "ldr s9, [%[wbptr], #36]\n"
+    "fmin v3.4s, v3.4s, v29.4s\n"
+    "ldr s23, [x21]\n"
+    "str s4, [x25, %[output_col_stride1]]\n"
+    "fmax v2.4s, v2.4s, v30.4s\n"
+    "str s3, [x25, x27]\n"
+    "fmax v1.4s, v1.4s, v30.4s\n"
+    "fmin v2.4s, v2.4s, v29.4s\n"
+    "ldr s18, [x14, %[input_col_stride1]]\n"
+    "fmin v1.4s, v1.4s, v29.4s\n"
+    "add %[outptr0], %[outptr0], #4\n"
+    "str s2, [x13]\n"
+    "fmax v0.4s, v0.4s, v30.4s\n"
+    "str s1, [x13, %[output_col_stride1]]\n"
+    "mov v8.16b, v19.16b\n"
+    "fmin v0.4s, v0.4s, v29.4s\n"
+    "add x25, x25, #4\n"
+    "mov v5.16b, v19.16b\n"
+    "mov v7.16b, v19.16b\n"
+    "str s0, [x13, x27]\n"
+    "mov v2.16b, v19.16b\n"
+    "mov v4.16b, v19.16b\n"
+    "add x13, x13, #4\n"
+    "mov v6.16b, v19.16b\n"
+    "mov v1.16b, v19.16b\n"
+    "mov v3.16b, v19.16b\n"
+    "mov v0.16b, v19.16b\n"
+    "fmla v8.4s, v25.4s, v17.4s\n"
+    "fmla v8.4s, v27.4s, v14.4s\n"
+    "bne 5b\n"
+    "6:\n"
+    "fmla v5.4s, v27.4s, v17.4s\n"
+    "ldr s27, [x17, x18]\n"
+    "fmla v8.4s, v26.4s, v16.4s\n"
+    "ldr s30, [%[inptr0], x15]\n"
+    "fmla v7.4s, v26.4s, v17.4s\n"
+    "ldr s31, [x26]\n"
+    "fmla v5.4s, v20.4s, v14.4s\n"
+    "ldr s24, [x21, %[input_col_stride1]]\n"
+    "fmla v8.4s, v20.4s, v11.4s\n"
+    "prfm pldl1keep, [x17, x22]\n"
+    "fmla v2.4s, v20.4s, v17.4s\n"
+    "ldr s29, [x14, x18]\n"
+    "fmla v5.4s, v22.4s, v16.4s\n"
+    "prfm pldl1keep, [%[inptr0], x16]\n"
+    "fmla v8.4s, v22.4s, v13.4s\n"
+    "prfm pldl1keep, [x26, #64]\n"
+    "fmla v7.4s, v22.4s, v14.4s\n"
+    "prfm pldl1keep, [x21, x28]\n"
+    "fmla v4.4s, v22.4s, v17.4s\n"
+    "ldr s21, [x17, x15]\n"
+    "fmla v8.4s, v28.4s, v15.4s\n"
+    "prfm pldl1keep, [x14, x22]\n"
+    "fmla v7.4s, v28.4s, v16.4s\n"
+    "prfm pldl1keep, [x17, x16]\n"
+    "fmla v6.4s, v28.4s, v17.4s\n"
+    "ldr s19, [%[inptr0], x24]\n"
+    "fmla v5.4s, v23.4s, v11.4s\n"
+    "prfm pldl1keep, [%[inptr0], x23]\n"
+    "fmla v2.4s, v23.4s, v14.4s\n"
+    "ldr s28, [x26, %[input_col_stride1]]\n"
+    "fmla v8.4s, v18.4s, v10.4s\n"
+    "prfm pldl1keep, [x26, x28]\n"
+    "fmla v5.4s, v18.4s, v13.4s\n"
+    "prfm pldl1keep, [x21, x22]\n"
+    "fmla v7.4s, v18.4s, v11.4s\n"
+    "prfm pldl1keep, [x14, x16]\n"
+    "fmla v2.4s, v18.4s, v16.4s\n"
+    "prfm pldl1keep, [x17, x23]\n"
+    "fmla v4.4s, v18.4s, v14.4s\n"
+    "prfm pldl1keep, [x26, x22]\n"
+    "fmla v1.4s, v18.4s, v17.4s\n"
+    "ldr s25, [x21, x18]\n"
+    "fmla v8.4s, v27.4s, v12.4s\n"
+    "prfm pldl1keep, [x21, x16]\n"
+    "fmla v5.4s, v27.4s, v15.4s\n"
+    "prfm pldl1keep, [x14, x23]\n"
+    "fmla v7.4s, v27.4s, v13.4s\n"
+    "prfm pldl1keep, [x26, x16]\n"
+    "fmla v4.4s, v27.4s, v16.4s\n"
+    "prfm pldl1keep, [x21, x23]\n"
+    "fmla v6.4s, v27.4s, v14.4s\n"
+    "prfm pldl1keep, [x26, x23]\n"
+    "fmla v3.4s, v27.4s, v17.4s\n"
+    "ldr s27, [x14, x15]\n"
+    "fmla v7.4s, v30.4s, v15.4s\n"
+    "add %[wbptr], %[wbptr], #40\n"
+    "fmla v6.4s, v30.4s, v16.4s\n"
+    "ldr s26, [x17, x24]\n"
+    "fmla v2.4s, v31.4s, v11.4s\n"
+    "ldr s20, [x26, x18]\n"
+    "fmla v5.4s, v24.4s, v10.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v4.4s, v24.4s, v11.4s\n"
+    "add %[inptr0], %[inptr0], #4\n"
+    "fmla v2.4s, v24.4s, v13.4s\n"
+    "add x17, x17, #4\n"
+    "fmla v1.4s, v24.4s, v14.4s\n"
+    "ldr s18, [x21, x15]\n"
+    "fmla v8.4s, v29.4s, v9.4s\n"
+    "fmla v5.4s, v29.4s, v12.4s\n"
+    "fmla v7.4s, v29.4s, v10.4s\n"
+    "fmla v2.4s, v29.4s, v15.4s\n"
+    "fmla v4.4s, v29.4s, v13.4s\n"
+    "fmla v6.4s, v29.4s, v11.4s\n"
+    "fmla v1.4s, v29.4s, v16.4s\n"
+    "fmla v3.4s, v29.4s, v14.4s\n"
+    "fmla v0.4s, v29.4s, v17.4s\n"
+    "ldr s22, [x14, x24]\n"
+    "fmla v7.4s, v21.4s, v12.4s\n"
+    "ldr s23, [x26, x15]\n"
+    "fmla v4.4s, v21.4s, v15.4s\n"
+    "add x14, x14, #4\n"
+    "fmla v6.4s, v21.4s, v13.4s\n"
+    "fmla v3.4s, v21.4s, v16.4s\n"
+    "fmla v2.4s, v28.4s, v10.4s\n"
+    "ldr s24, [x21, x24]\n"
+    "fmla v1.4s, v28.4s, v11.4s\n"
+    "ldr s21, [x26, x24]\n"
+    "fmla v6.4s, v19.4s, v15.4s\n"
+    "add x21, x21, #4\n"
+    "fmla v5.4s, v25.4s, v9.4s\n"
+    "add x26, x26, #4\n"
+    "fmla v2.4s, v25.4s, v12.4s\n"
+    "fmla v4.4s, v25.4s, v10.4s\n"
+    "fmla v1.4s, v25.4s, v13.4s\n"
+    "fmla v3.4s, v25.4s, v11.4s\n"
+    "fmla v0.4s, v25.4s, v14.4s\n"
+    "fmla v7.4s, v27.4s, v9.4s\n"
+    "fmla v4.4s, v27.4s, v12.4s\n"
+    "fmla v6.4s, v27.4s, v10.4s\n"
+    "fmla v1.4s, v27.4s, v15.4s\n"
+    "fmla v3.4s, v27.4s, v13.4s\n"
+    "fmla v0.4s, v27.4s, v16.4s\n"
+    "fmla v2.4s, v20.4s, v9.4s\n"
+    "fmla v6.4s, v26.4s, v12.4s\n"
+    "fmla v4.4s, v18.4s, v9.4s\n"
+    "fmla v3.4s, v26.4s, v15.4s\n"
+    "fmla v1.4s, v20.4s, v10.4s\n"
+    "fmla v0.4s, v20.4s, v11.4s\n"
+    "movi v30.16b, #0\n"
+    "fmla v6.4s, v22.4s, v9.4s\n"
+    "fmov v29.4s, #6.0\n"
+    "fmla v1.4s, v18.4s, v12.4s\n"
+    "fmla v3.4s, v18.4s, v10.4s\n"
+    "fmla v0.4s, v18.4s, v13.4s\n"
+    "fmax v8.4s, v8.4s, v30.4s\n"
+    "fmax v7.4s, v7.4s, v30.4s\n"
+    "fmax v6.4s, v6.4s, v30.4s\n"
+    "fmla v3.4s, v22.4s, v12.4s\n"
+    "fmla v1.4s, v23.4s, v9.4s\n"
+    "fmla v0.4s, v22.4s, v15.4s\n"
+    "fmin v8.4s, v8.4s, v29.4s\n"
+    "fmin v7.4s, v7.4s, v29.4s\n"
+    "fmin v6.4s, v6.4s, v29.4s\n"
+    "str s8, [%[outptr0]]\n"
+    "fmla v3.4s, v24.4s, v9.4s\n"
+    "str s7, [%[outptr0], %[output_col_stride1]]\n"
+    "fmla v0.4s, v23.4s, v10.4s\n"
+    "str s6, [%[outptr0], x27]\n"
+    "fmax v5.4s, v5.4s, v30.4s\n"
+    "fmax v4.4s, v4.4s, v30.4s\n"
+    "add %[outptr0], %[outptr0], #4\n"
+    "fmla v0.4s, v24.4s, v12.4s\n"
+    "fmin v5.4s, v5.4s, v29.4s\n"
+    "fmin v4.4s, v4.4s, v29.4s\n"
+    "fmax v3.4s, v3.4s, v30.4s\n"
+    "str s5, [x25]\n"
+    "fmax v2.4s, v2.4s, v30.4s\n"
+    "str s4, [x25, %[output_col_stride1]]\n"
+    "fmla v0.4s, v21.4s, v9.4s\n"
+    "fmin v3.4s, v3.4s, v29.4s\n"
+    "fmin v2.4s, v2.4s, v29.4s\n"
+    "fmax v1.4s, v1.4s, v30.4s\n"
+    "str s3, [x25, x27]\n"
+    "str s2, [x13]\n"
+    "fmin v1.4s, v1.4s, v29.4s\n"
+    "fmax v0.4s, v0.4s, v30.4s\n"
+    "add x25, x25, #4\n"
+    "str s1, [x13, %[output_col_stride1]]\n"
+    "fmin v0.4s, v0.4s, v29.4s\n"
+    "str s0, [x13, x27]\n"
+    "add x13, x13, #4\n"
+    "7:\n"
+    : [inptr0] "+r" (input), [outptr0] "+r" (output), [wbptr] "+r" (weight_bias_ptr)
+    : [input_col_stride1] "r" (input_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels)
+    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+  );
 }
 
 #endif  // __aarch64__
 
-template <>
-const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
+template class DepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float, float>;
 
-template <>
-const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
-  ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
-  ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
-  },
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
-  },
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
-
-template class DepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float>;
 }  // namespace depthwise
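The refactor above replaces the large table of padding-specialised process_tile<...> instantiations with one hand-scheduled execute_tile body per activation function, and the DepthwiseConvolution template gains a third value-type parameter (the new instantiation reads <..., float, float, float> where the old one had two floats). What follows is a minimal, portable sketch of that dispatch pattern, not the library's implementation: the names Activation, TileKernel and run_tile are illustrative, and the scalar loop stands in for the inline-asm convolution.

#include <algorithm>

enum class Activation { None, ReLU, ReLU6 };

struct TileKernel
{
  // One statically specialised body per activation, mirroring
  // Conv::execute_tile<ActivationFunction::ReLU6> in the diff above.
  template <Activation A>
  static void execute_tile(int n_channels, const float *in, float *out)
  {
    for (int i = 0; i < n_channels; i++)
    {
      float v = in[i];                // stand-in for the accumulated convolution sum
      if (A != Activation::None)
        v = std::max(v, 0.0f);        // "fmax vN.4s, vN.4s, v30.4s" in the asm
      if (A == Activation::ReLU6)
        v = std::min(v, 6.0f);        // "fmin vN.4s, vN.4s, v29.4s" in the asm
      out[i] = v;
    }
  }

  // The activation is chosen once per kernel launch, outside the hot loop,
  // so each tile runs a branch-free specialisation.
  static void run_tile(Activation act, int n, const float *in, float *out)
  {
    switch (act)
    {
      case Activation::ReLU:  execute_tile<Activation::ReLU>(n, in, out);  break;
      case Activation::ReLU6: execute_tile<Activation::ReLU6>(n, in, out); break;
      default:                execute_tile<Activation::None>(n, in, out);  break;
    }
  }
};

Resolving the activation at compile time is what lets the asm bodies fuse the clamp directly into the store sequence instead of testing a flag per element.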
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp16_fp16.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp16_fp16.cpp
new file mode 100644
index 0000000..8348692
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp16_fp16.cpp
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "impl_fp16_fp16.hpp"
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+namespace depthwise
+{
+template class DepthwiseConvolution<3, 3, 3, 3, 2, 2, float16_t, float16_t, float16_t>;
+}  // namespace depthwise
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
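The new file above adds FP16 support as its own translation unit containing nothing but an explicit instantiation, guarded by the ACLE feature macro so it compiles to nothing on targets without half-precision vector arithmetic. A hedged sketch of that pattern, assuming arm_neon.h provides float16_t on qualifying targets; Kernel is an illustrative stand-in, not the library's class:

#include <cstddef>

template <int KernelSize, int Stride, typename T>
struct Kernel
{
  // Placeholder body; the real instantiations pull their implementations
  // from headers such as impl_fp16_fp16.hpp / impl_fp32_fp32.hpp.
  static void run(const T *in, T *out, std::size_t n)
  {
    for (std::size_t i = 0; i < n; i++) { out[i] = in[i]; }
  }
};

// fp32 is always available.
template struct Kernel<3, 2, float>;

#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include <arm_neon.h>  // defines float16_t when FP16 vector arithmetic is available
template struct Kernel<3, 2, float16_t>;
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC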
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
index c7113d0..adc6969 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,599 +25,745 @@
 
 namespace depthwise
 {
-using Conv = DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float>;
-using ConvImpl = DepthwiseConvolutionImpl<3, 3, 3, 3, 2, 2, float, float>;
+
+using namespace neon_convolution_kernels;
+using Conv = DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>;
 
 #ifdef __aarch64__
-
 template <>
 template <>
-void ConvImpl::process_tile<true, 0, 0, 0, 0, 0, 0>(
-  const int n_channels,
-  const float* const weights,
-  const int weight_row_stride,
-  const int weight_col_stride,
-  const float* const inptr,
-  const int in_row_stride,
-  const int in_col_stride,
-  float* const outptr,
-  const int out_row_stride,
-  const int out_col_stride,
-  const int, const int, const int, const int, const int, const int, const int, const int
+void Conv::execute_tile<ActivationFunction::None>(
+  int n_channels,
+  const void* weight_bias_ptr,
+  const float* input,
+  const unsigned int input_row_stride,
+  const unsigned int input_col_stride,
+  float* output,
+  const unsigned int output_row_stride,
+  const unsigned int output_col_stride
 )
 {
-  // Copy pointers
-  const float *uptr0 = inptr;
-  const float *wptr0 = weights;
-  float *vptr0 = outptr;
-
-  int channels_remaining = n_channels;
-  if (channels_remaining >= 4)
-  {
-    // Process blocks of 4 channels at a time
-    int n_iters = channels_remaining / 4 - 1;
-    channels_remaining %= 4;
-
-    asm volatile(
-        // Prepare aliases
-        "qW13 .req q0\n" "vW13 .req v0\n"
-        "qU15 .req q1\n" "qU73 .req q1\n" "qU45 .req q1\n" "qU14 .req q1\n"
-        "vU15 .req v1\n" "vU73 .req v1\n" "vU45 .req v1\n" "vU14 .req v1\n"
-        "qU62 .req q2\n" "qV12 .req q2\n" "vU62 .req v2\n" "vV12 .req v2\n"
-        "qU51 .req q3\n" "qU43 .req q3\n" "qU55 .req q3\n"
-        "vU51 .req v3\n" "vU43 .req v3\n" "vU55 .req v3\n"
-        "qU77 .req q4\n" "qV13 .req q4\n" "qV31 .req q4\n" "qU44 .req q4\n"
-        "vU77 .req v4\n" "vV13 .req v4\n" "vV31 .req v4\n" "vU44 .req v4\n"
-        "qV33 .req q5\n" "qU46 .req q5\n" "qU11 .req q5\n" "qU37 .req q5\n"
-        "vV33 .req v5\n" "vU46 .req v5\n" "vU11 .req v5\n" "vU37 .req v5\n"
-        "qU56 .req q6\n" "qU25 .req q6\n" "qU32 .req q6\n"
-        "vU56 .req v6\n" "vU25 .req v6\n" "vU32 .req v6\n"
-        "qU72 .req q7\n" "qV22 .req q7\n" "vU72 .req v7\n" "vV22 .req v7\n"
-        "qU67 .req q8\n" "qU61 .req q8\n" "qU13 .req q8\n"
-        "vU67 .req v8\n" "vU61 .req v8\n" "vU13 .req v8\n"
-        "qU74 .req q9\n" "qU34 .req q9\n" "qU17 .req q9\n" "qU66 .req q9\n"
-        "vU74 .req v9\n" "vU34 .req v9\n" "vU17 .req v9\n" "vU66 .req v9\n"
-        "qU33 .req q10\n" "qU57 .req q10\n" "qU21 .req q10\n"
-        "vU33 .req v10\n" "vU57 .req v10\n" "vU21 .req v10\n" "qW23 .req q11\n"
-        "vW23 .req v11\n" "qU42 .req q12\n" "qV23 .req q12\n" "qU23 .req q12\n"
-        "vU42 .req v12\n" "vV23 .req v12\n" "vU23 .req v12\n"
-        "qW33 .req q13\n" "vW33 .req v13\n"
-        "qU76 .req q14\n" "qU47 .req q14\n" "qU64 .req q14\n" "qU41 .req q14\n"
-        "vU76 .req v14\n" "vU47 .req v14\n" "vU64 .req v14\n" "vU41 .req v14\n"
-        "qU52 .req q15\n" "qU54 .req q15\n" "qU75 .req q15\n" "qU26 .req q15\n"
-        "vU52 .req v15\n" "vU54 .req v15\n" "vU75 .req v15\n" "vU26 .req v15\n"
-        "qU53 .req q16\n" "qU27 .req q16\n" "vU53 .req v16\n" "vU27 .req v16\n"
-        "qV21 .req q17\n" "qU65 .req q17\n" "vV21 .req v17\n" "vU65 .req v17\n"
-        "qU31 .req q18\n" "qU24 .req q18\n" "qU36 .req q18\n"
-        "vU31 .req v18\n" "vU24 .req v18\n" "vU36 .req v18\n" "qU22 .req q19\n"
-        "vU22 .req v19\n" "qU35 .req q20\n" "qU63 .req q20\n"
-        "vU35 .req v20\n" "vU63 .req v20\n" "qW12 .req q21\n"
-        "vW12 .req v21\n" "qV32 .req q22\n" "qU16 .req q22\n"
-        "vV32 .req v22\n" "vU16 .req v22\n" "qW11 .req q23\n" "vW11 .req v23\n"
-        "qU12 .req q24\n" "vU12 .req v24\n" "qW31 .req q25\n" "vW31 .req v25\n"
-        "qW22 .req q26\n" "vW22 .req v26\n" "qU71 .req q27\n" "vU71 .req v27\n"
-        "qV11 .req q28\n" "vV11 .req v28\n" "qW21 .req q29\n" "vW21 .req v29\n"
-        "qW32 .req q30\n" "vW32 .req v30\n"
-
-        "uptr1 .req x0\n"
-        "uptr2 .req x1\n"
-        "uptr3 .req x2\n"
-        "uptr4 .req x3\n"
-        "uptr5 .req x4\n"
-        "uptr6 .req x5\n"
-        "u_col_stride1 .req %x[u_col_stride]\n"
-        "u_col_stride2 .req  x6\n"
-        "u_col_stride3 .req  x7\n"
-        "u_col_stride4 .req  x8\n"
-        "u_col_stride5 .req  x9\n"
-        "u_col_stride6 .req x10\n"
-        "wptr1 .req x11\n"
-        "wptr2 .req x12\n"
-        "w_col_stride1 .req %x[w_col_stride]\n"
-        "w_col_stride2 .req x13\n"
-        "vptr1 .req x14\n"
-        "vptr2 .req x15\n"
-        "v_col_stride1 .req %x[v_col_stride]\n"
-        "v_col_stride2 .req x16\n"
-
-        // Prepare strides and pointers
-        "add uptr1, %x[uptr0], %x[u_row_stride]\n"
-        "add uptr2,    uptr1 , %x[u_row_stride]\n"
-        "add uptr3,    uptr2 , %x[u_row_stride]\n"
-        "add uptr4,    uptr3 , %x[u_row_stride]\n"
-        "add uptr5,    uptr4 , %x[u_row_stride]\n"
-        "add uptr6,    uptr5 , %x[u_row_stride]\n"
-        "add u_col_stride2, u_col_stride1, u_col_stride1\n"
-        "add u_col_stride3, u_col_stride2, u_col_stride1\n"
-        "add u_col_stride4, u_col_stride3, u_col_stride1\n"
-        "add u_col_stride5, u_col_stride4, u_col_stride1\n"
-        "add u_col_stride6, u_col_stride5, u_col_stride1\n"
-
-        "add wptr1, %x[wptr0], %x[w_row_stride]\n"
-        "add wptr2,    wptr1 , %x[w_row_stride]\n"
-        "add w_col_stride2, w_col_stride1, w_col_stride1\n"
-
-        "add vptr1, %x[vptr0], %x[v_row_stride]\n"
-        "add vptr2,    vptr1 , %x[v_row_stride]\n"
-        "add v_col_stride2, v_col_stride1, v_col_stride1\n"
-
-        // Prepare for first iteration
-        "ldr qW13, [%x[wptr0], w_col_stride2]\n"
-        "ldr qW23, [wptr1, w_col_stride2]\n"
-        "ldr qW33, [wptr2, w_col_stride2]\n"
-        "ldr qW12, [%x[wptr0], w_col_stride1]\n"
-        "ldr qW22, [wptr1, w_col_stride1]\n"
-        "ldr qW32, [wptr2, w_col_stride1]\n"
-        "ldr qW11, [%x[wptr0]], #0x10\n"
-        "ldr qW21, [wptr1], #0x10\n"
-        "ldr qU17, [%x[uptr0], u_col_stride6]\n"
-        "ldr qU15, [%x[uptr0], u_col_stride4]\n"
-        "ldr qU16, [%x[uptr0], u_col_stride5]\n"
-        "ldr qU37, [uptr2, u_col_stride6]\n"
-        "ldr qU35, [uptr2, u_col_stride4]\n"
-        "ldr qU36, [uptr2, u_col_stride5]\n"
-        "ldr qU27, [uptr1, u_col_stride6]\n"
-        "ldr qU25, [uptr1, u_col_stride4]\n"
-        "fmul vV13.4s, vU17.4s, vW13.4s\n"
-        "fmul vV12.4s, vU15.4s, vW13.4s\n"
-        "fmla vV13.4s, vU15.4s, vW11.4s\n"
-        "ldr qW31, [wptr2], #0x10\n"
-        "fmla vV13.4s, vU16.4s, vW12.4s\n"
-        "ldr qU26, [uptr1, u_col_stride5]\n"
-        "fmla vV13.4s, vU37.4s, vW33.4s\n"
-        "ldr qU47, [uptr3, u_col_stride6]\n"
-        "fmul vV23.4s, vU37.4s, vW13.4s\n"
-        "ldr qU45, [uptr3, u_col_stride4]\n"
-        "fmla vV12.4s, vU35.4s, vW33.4s\n"
-        "ldr qU46, [uptr3, u_col_stride5]\n"
-        "fmla vV13.4s, vU35.4s, vW31.4s\n"
-        "ldr qU67, [uptr5, u_col_stride6]\n"
-        "fmul vV22.4s, vU35.4s, vW13.4s\n"
-        "cbz %x[n_iters], 2f\n"  // Jump to tail if no iterations
-
-        "1:"  // Loop body
-        "fmla vV23.4s, vU35.4s, vW11.4s\n"
-        "ldr qU65, [uptr5, u_col_stride4]\n"
-        "fmla vV13.4s, vU36.4s, vW32.4s\n"
-        "fmla vV23.4s, vU36.4s, vW12.4s\n"
-        "ldr qU66, [uptr5, u_col_stride5]\n"
-        "fmla vV13.4s, vU27.4s, vW23.4s\n"
-        "ldr qU57, [uptr4, u_col_stride6]\n"
-        "fmla vV12.4s, vU25.4s, vW23.4s\n"
-        "ldr qU55, [uptr4, u_col_stride4]\n"
-        "fmla vV13.4s, vU25.4s, vW21.4s\n"
-        "ldr qU56, [uptr4, u_col_stride5]\n"
-        "fmla vV13.4s, vU26.4s, vW22.4s\n"
-        "str qV13, [%x[vptr0], v_col_stride2]\n"
-        "fmla vV23.4s, vU47.4s, vW23.4s\n"
-        "ldr qU77, [uptr6, u_col_stride6]\n"
-        "fmla vV22.4s, vU45.4s, vW23.4s\n"
-        "fmla vV23.4s, vU45.4s, vW21.4s\n"
-        "ldr qU75, [uptr6, u_col_stride4]\n"
-        "fmla vV23.4s, vU46.4s, vW22.4s\n"
-        "ldr qU76, [uptr6, u_col_stride5]\n"
-        "fmul vV33.4s, vU67.4s, vW23.4s\n"
-        "ldr qU14, [%x[uptr0], u_col_stride3]\n"
-        "fmul vV32.4s, vU65.4s, vW23.4s\n"
-        "fmla vV33.4s, vU65.4s, vW21.4s\n"
-        "ldr qU13, [%x[uptr0], u_col_stride2]\n"
-        "fmla vV33.4s, vU66.4s, vW22.4s\n"
-        "ldr qU34, [uptr2, u_col_stride3]\n"
-        "fmla vV23.4s, vU57.4s, vW33.4s\n"
-        "fmla vV33.4s, vU57.4s, vW13.4s\n"
-        "ldr qU33, [uptr2, u_col_stride2]\n"
-        "fmla vV22.4s, vU55.4s, vW33.4s\n"
-        "fmla vV23.4s, vU55.4s, vW31.4s\n"
-        "fmla vV32.4s, vU55.4s, vW13.4s\n"
-        "fmla vV33.4s, vU55.4s, vW11.4s\n"
-        "ldr qU24, [uptr1, u_col_stride3]\n"
-        "fmla vV23.4s, vU56.4s, vW32.4s\n"
-        "str qV23, [vptr1, v_col_stride2]\n"
-        "fmla vV33.4s, vU56.4s, vW12.4s\n"
-        "ldr qU23, [uptr1, u_col_stride2]\n"
-        "fmla vV33.4s, vU77.4s, vW33.4s\n"
-        "ldr qU44, [uptr3, u_col_stride3]\n"
-        "fmla vV32.4s, vU75.4s, vW33.4s\n"
-        "fmla vV33.4s, vU75.4s, vW31.4s\n"
-        "ldr qU43, [uptr3, u_col_stride2]\n"
-        "fmla vV33.4s, vU76.4s, vW32.4s\n"
-        "str qV33, [vptr2, v_col_stride2]\n"
-        "ldr qU64, [uptr5, u_col_stride3]\n"
-        "fmla vV12.4s, vU14.4s, vW12.4s\n"
-        "ldr qU63, [uptr5, u_col_stride2]\n"
-        "fmul vV11.4s, vU13.4s, vW13.4s\n"
-        "fmla vV12.4s, vU13.4s, vW11.4s\n"
-        "ldr qU54, [uptr4, u_col_stride3]\n"
-        "fmla vV12.4s, vU34.4s, vW32.4s\n"
-        "fmla vV22.4s, vU34.4s, vW12.4s\n"
-        "ldr qU53, [uptr4, u_col_stride2]\n"
-        "fmla vV11.4s, vU33.4s, vW33.4s\n"
-        "ldr qU74, [uptr6, u_col_stride3]\n"
-        "fmla vV12.4s, vU33.4s, vW31.4s\n"
-        "ldr qU73, [uptr6, u_col_stride2]\n"
-        "fmul vV21.4s, vU33.4s, vW13.4s\n"
-        "ldr qU12, [%x[uptr0], u_col_stride1]\n"
-        "fmla vV22.4s, vU33.4s, vW11.4s\n"
-        "ldr qU11, [%x[uptr0]], #0x10\n"
-        "fmla vV12.4s, vU24.4s, vW22.4s\n"
-        "ldr qU32, [uptr2, u_col_stride1]\n"
-        "fmla vV11.4s, vU23.4s, vW23.4s\n"
-        "ldr qU31, [uptr2], #0x10\n"
-        "fmla vV12.4s, vU23.4s, vW21.4s\n"
-        "str qV12, [%x[vptr0], v_col_stride1]\n"
-        "fmla vV22.4s, vU44.4s, vW22.4s\n"
-        "ldr qU22, [uptr1, u_col_stride1]\n"
-        "fmla vV21.4s, vU43.4s, vW23.4s\n"
-        "ldr qU21, [uptr1], #0x10\n"
-        "fmla vV22.4s, vU43.4s, vW21.4s\n"
-        "ldr qU42, [uptr3, u_col_stride1]\n"
-        "fmla vV32.4s, vU64.4s, vW22.4s\n"
-        "ldr qU41, [uptr3], #0x10\n"
-        "fmul vV31.4s, vU63.4s, vW23.4s\n"
-        "ldr qW23, [wptr1, w_col_stride2]\n"
-        "fmla vV32.4s, vU63.4s, vW21.4s\n"
-        "ldr qU62, [uptr5, u_col_stride1]\n"
-        "fmla vV22.4s, vU54.4s, vW32.4s\n"
-        "ldr qU61, [uptr5], #0x10\n"
-        "fmla vV32.4s, vU54.4s, vW12.4s\n"
-        "ldr qU52, [uptr4, u_col_stride1]\n"
-        "fmla vV21.4s, vU53.4s, vW33.4s\n"
-        "ldr qU51, [uptr4], #0x10\n"
-        "fmla vV22.4s, vU53.4s, vW31.4s\n"
-        "str qV22, [vptr1, v_col_stride1]\n"
-        "fmla vV31.4s, vU53.4s, vW13.4s\n"
-        "ldr qW13, [%x[wptr0], w_col_stride2]\n"
-        "fmla vV32.4s, vU53.4s, vW11.4s\n"
-        "ldr qU72, [uptr6, u_col_stride1]\n"
-        "fmla vV32.4s, vU74.4s, vW32.4s\n"
-        "ldr qU71, [uptr6], #0x10\n"
-        "fmla vV31.4s, vU73.4s, vW33.4s\n"
-        "ldr qW33, [wptr2, w_col_stride2]\n"
-        "fmla vV32.4s, vU73.4s, vW31.4s\n"
-        "str qV32, [vptr2, v_col_stride1]\n"
-        "fmla vV11.4s, vU12.4s, vW12.4s\n"
-        "ldr qU17, [%x[uptr0], u_col_stride6]\n"
-        "fmla vV11.4s, vU11.4s, vW11.4s\n"
-        "ldr qU15, [%x[uptr0], u_col_stride4]\n"
-        "fmla vV11.4s, vU32.4s, vW32.4s\n"
-        "ldr qU16, [%x[uptr0], u_col_stride5]\n"
-        "fmla vV21.4s, vU32.4s, vW12.4s\n"
-        "ldr qU37, [uptr2, u_col_stride6]\n"
-        "fmla vV11.4s, vU31.4s, vW31.4s\n"
-        "ldr qU35, [uptr2, u_col_stride4]\n"
-        "fmla vV21.4s, vU31.4s, vW11.4s\n"
-        "ldr qU36, [uptr2, u_col_stride5]\n"
-        "fmla vV11.4s, vU22.4s, vW22.4s\n"
-        "ldr qU27, [uptr1, u_col_stride6]\n"
-        "fmla vV11.4s, vU21.4s, vW21.4s\n"
-        "str qV11, [%x[vptr0]], #0x10\n"
-        "fmla vV21.4s, vU42.4s, vW22.4s\n"
-        "ldr qU25, [uptr1, u_col_stride4]\n"
-        "fmla vV21.4s, vU41.4s, vW21.4s\n"
-        "fmla vV31.4s, vU62.4s, vW22.4s\n"
-        "ldr qW22, [wptr1, w_col_stride1]\n"
-        "fmla vV31.4s, vU61.4s, vW21.4s\n"
-        "ldr qW21, [wptr1], #0x10\n"
-        "fmla vV21.4s, vU52.4s, vW32.4s\n"
-        "fmla vV31.4s, vU52.4s, vW12.4s\n"
-        "ldr qW12, [%x[wptr0], w_col_stride1]\n"
-        "fmla vV21.4s, vU51.4s, vW31.4s\n"
-        "str qV21, [vptr1], #0x10\n"
-        "fmla vV31.4s, vU51.4s, vW11.4s\n"
-        "ldr qW11, [%x[wptr0]], #0x10\n"
-        "fmla vV31.4s, vU72.4s, vW32.4s\n"
-        "ldr qW32, [wptr2, w_col_stride1]\n"
-        "fmla vV31.4s, vU71.4s, vW31.4s\n"
-        "str qV31, [vptr2], #0x10\n"
-        "fmul vV13.4s, vU17.4s, vW13.4s\n"
-        "fmul vV12.4s, vU15.4s, vW13.4s\n"
-        "subs %x[n_iters], %x[n_iters], #1\n"
-        "fmla vV13.4s, vU15.4s, vW11.4s\n"
-        "ldr qW31, [wptr2], #0x10\n"
-        "fmla vV13.4s, vU16.4s, vW12.4s\n"
-        "ldr qU26, [uptr1, u_col_stride5]\n"
-        "fmla vV13.4s, vU37.4s, vW33.4s\n"
-        "ldr qU47, [uptr3, u_col_stride6]\n"
-        "fmul vV23.4s, vU37.4s, vW13.4s\n"
-        "ldr qU45, [uptr3, u_col_stride4]\n"
-        "fmla vV12.4s, vU35.4s, vW33.4s\n"
-        "ldr qU46, [uptr3, u_col_stride5]\n"
-        "fmla vV13.4s, vU35.4s, vW31.4s\n"
-        "ldr qU67, [uptr5, u_col_stride6]\n"
-        "fmul vV22.4s, vU35.4s, vW13.4s\n"
-        "bne 1b\n"
-
-        "2:"  // Tail iteration
-        "fmla vV23.4s, vU35.4s, vW11.4s\n"
-        "ldr qU65, [uptr5, u_col_stride4]\n"
-        "fmla vV13.4s, vU36.4s, vW32.4s\n"
-        "fmla vV23.4s, vU36.4s, vW12.4s\n"
-        "ldr qU66, [uptr5, u_col_stride5]\n"
-        "fmla vV13.4s, vU27.4s, vW23.4s\n"
-        "ldr qU57, [uptr4, u_col_stride6]\n"
-        "fmla vV12.4s, vU25.4s, vW23.4s\n"
-        "ldr qU55, [uptr4, u_col_stride4]\n"
-        "fmla vV13.4s, vU25.4s, vW21.4s\n"
-        "ldr qU56, [uptr4, u_col_stride5]\n"
-        "fmla vV13.4s, vU26.4s, vW22.4s\n"
-        "str qV13, [%x[vptr0], v_col_stride2]\n"
-        "fmla vV23.4s, vU47.4s, vW23.4s\n"
-        "ldr qU77, [uptr6, u_col_stride6]\n"
-        "fmla vV22.4s, vU45.4s, vW23.4s\n"
-        "fmla vV23.4s, vU45.4s, vW21.4s\n"
-        "ldr qU75, [uptr6, u_col_stride4]\n"
-        "fmla vV23.4s, vU46.4s, vW22.4s\n"
-        "ldr qU76, [uptr6, u_col_stride5]\n"
-        "fmul vV33.4s, vU67.4s, vW23.4s\n"
-        "ldr qU14, [%x[uptr0], u_col_stride3]\n"
-        "fmul vV32.4s, vU65.4s, vW23.4s\n"
-        "fmla vV33.4s, vU65.4s, vW21.4s\n"
-        "ldr qU13, [%x[uptr0], u_col_stride2]\n"
-        "fmla vV33.4s, vU66.4s, vW22.4s\n"
-        "ldr qU34, [uptr2, u_col_stride3]\n"
-        "fmla vV23.4s, vU57.4s, vW33.4s\n"
-        "fmla vV33.4s, vU57.4s, vW13.4s\n"
-        "ldr qU33, [uptr2, u_col_stride2]\n"
-        "fmla vV22.4s, vU55.4s, vW33.4s\n"
-        "fmla vV23.4s, vU55.4s, vW31.4s\n"
-        "fmla vV32.4s, vU55.4s, vW13.4s\n"
-        "fmla vV33.4s, vU55.4s, vW11.4s\n"
-        "ldr qU24, [uptr1, u_col_stride3]\n"
-        "fmla vV23.4s, vU56.4s, vW32.4s\n"
-        "str qV23, [vptr1, v_col_stride2]\n"
-        "fmla vV33.4s, vU56.4s, vW12.4s\n"
-        "ldr qU23, [uptr1, u_col_stride2]\n"
-        "fmla vV33.4s, vU77.4s, vW33.4s\n"
-        "ldr qU44, [uptr3, u_col_stride3]\n"
-        "fmla vV32.4s, vU75.4s, vW33.4s\n"
-        "fmla vV33.4s, vU75.4s, vW31.4s\n"
-        "ldr qU43, [uptr3, u_col_stride2]\n"
-        "fmla vV33.4s, vU76.4s, vW32.4s\n"
-        "str qV33, [vptr2, v_col_stride2]\n"
-        "ldr qU64, [uptr5, u_col_stride3]\n"
-        "fmla vV12.4s, vU14.4s, vW12.4s\n"
-        "ldr qU63, [uptr5, u_col_stride2]\n"
-        "fmul vV11.4s, vU13.4s, vW13.4s\n"
-        "fmla vV12.4s, vU13.4s, vW11.4s\n"
-        "ldr qU54, [uptr4, u_col_stride3]\n"
-        "fmla vV12.4s, vU34.4s, vW32.4s\n"
-        "fmla vV22.4s, vU34.4s, vW12.4s\n"
-        "ldr qU53, [uptr4, u_col_stride2]\n"
-        "fmla vV11.4s, vU33.4s, vW33.4s\n"
-        "ldr qU74, [uptr6, u_col_stride3]\n"
-        "fmla vV12.4s, vU33.4s, vW31.4s\n"
-        "ldr qU73, [uptr6, u_col_stride2]\n"
-        "fmul vV21.4s, vU33.4s, vW13.4s\n"
-        "ldr qU12, [%x[uptr0], u_col_stride1]\n"
-        "fmla vV22.4s, vU33.4s, vW11.4s\n"
-        "ldr qU11, [%x[uptr0]], #0x10\n"
-        "fmla vV12.4s, vU24.4s, vW22.4s\n"
-        "ldr qU32, [uptr2, u_col_stride1]\n"
-        "fmla vV11.4s, vU23.4s, vW23.4s\n"
-        "ldr qU31, [uptr2], #0x10\n"
-        "fmla vV12.4s, vU23.4s, vW21.4s\n"
-        "str qV12, [%x[vptr0], v_col_stride1]\n"
-        "fmla vV22.4s, vU44.4s, vW22.4s\n"
-        "ldr qU22, [uptr1, u_col_stride1]\n"
-        "fmla vV21.4s, vU43.4s, vW23.4s\n"
-        "ldr qU21, [uptr1], #0x10\n"
-        "fmla vV22.4s, vU43.4s, vW21.4s\n"
-        "ldr qU42, [uptr3, u_col_stride1]\n"
-        "fmla vV32.4s, vU64.4s, vW22.4s\n"
-        "ldr qU41, [uptr3], #0x10\n"
-        "fmul vV31.4s, vU63.4s, vW23.4s\n"
-        "fmla vV32.4s, vU63.4s, vW21.4s\n"
-        "ldr qU62, [uptr5, u_col_stride1]\n"
-        "fmla vV22.4s, vU54.4s, vW32.4s\n"
-        "ldr qU61, [uptr5], #0x10\n"
-        "fmla vV32.4s, vU54.4s, vW12.4s\n"
-        "ldr qU52, [uptr4, u_col_stride1]\n"
-        "fmla vV21.4s, vU53.4s, vW33.4s\n"
-        "ldr qU51, [uptr4], #0x10\n"
-        "fmla vV22.4s, vU53.4s, vW31.4s\n"
-        "str qV22, [vptr1, v_col_stride1]\n"
-        "fmla vV31.4s, vU53.4s, vW13.4s\n"
-        "fmla vV32.4s, vU53.4s, vW11.4s\n"
-        "ldr qU72, [uptr6, u_col_stride1]\n"
-        "fmla vV32.4s, vU74.4s, vW32.4s\n"
-        "ldr qU71, [uptr6], #0x10\n"
-        "fmla vV31.4s, vU73.4s, vW33.4s\n"
-        "fmla vV32.4s, vU73.4s, vW31.4s\n"
-        "str qV32, [vptr2, v_col_stride1]\n"
-        "fmla vV11.4s, vU12.4s, vW12.4s\n"
-        "fmla vV11.4s, vU11.4s, vW11.4s\n"
-        "fmla vV11.4s, vU32.4s, vW32.4s\n"
-        "fmla vV21.4s, vU32.4s, vW12.4s\n"
-        "fmla vV11.4s, vU31.4s, vW31.4s\n"
-        "fmla vV21.4s, vU31.4s, vW11.4s\n"
-        "fmla vV11.4s, vU22.4s, vW22.4s\n"
-        "fmla vV11.4s, vU21.4s, vW21.4s\n"
-        "str qV11, [%x[vptr0]], #0x10\n"
-        "fmla vV21.4s, vU42.4s, vW22.4s\n"
-        "fmla vV21.4s, vU41.4s, vW21.4s\n"
-        "fmla vV31.4s, vU62.4s, vW22.4s\n"
-        "fmla vV31.4s, vU61.4s, vW21.4s\n"
-        "fmla vV21.4s, vU52.4s, vW32.4s\n"
-        "fmla vV31.4s, vU52.4s, vW12.4s\n"
-        "fmla vV21.4s, vU51.4s, vW31.4s\n"
-        "str qV21, [vptr1], #0x10\n"
-        "fmla vV31.4s, vU51.4s, vW11.4s\n"
-        "fmla vV31.4s, vU72.4s, vW32.4s\n"
-        "fmla vV31.4s, vU71.4s, vW31.4s\n"
-        "str qV31, [vptr2], #0x10\n"
-
-        // Clear aliases
-        ".unreq uptr1\n" ".unreq uptr2\n" ".unreq uptr3\n" ".unreq uptr4\n"
-        ".unreq uptr5\n" ".unreq uptr6\n"
-        ".unreq u_col_stride1\n" ".unreq u_col_stride2\n" ".unreq u_col_stride3\n"
-        ".unreq u_col_stride4\n" ".unreq u_col_stride5\n" ".unreq u_col_stride6\n"
-        ".unreq wptr1\n" ".unreq wptr2\n"
-        ".unreq w_col_stride1\n" ".unreq w_col_stride2\n"
-        ".unreq vptr1\n" ".unreq vptr2\n"
-        ".unreq v_col_stride1\n" ".unreq v_col_stride2\n"
-        ".unreq qU15\n" ".unreq qU73\n" ".unreq qU45\n" ".unreq qU14\n"
-        ".unreq qW13\n" ".unreq qU62\n" ".unreq qV12\n"
-        ".unreq qU51\n" ".unreq qU43\n" ".unreq qU55\n"
-        ".unreq qU77\n" ".unreq qV13\n" ".unreq qV31\n" ".unreq qU44\n"
-        ".unreq qV33\n" ".unreq qU46\n" ".unreq qU11\n" ".unreq qU37\n"
-        ".unreq qU56\n" ".unreq qU25\n" ".unreq qU32\n"
-        ".unreq qU72\n" ".unreq qV22\n"
-        ".unreq qU67\n" ".unreq qU61\n" ".unreq qU13\n" ".unreq qW33\n"
-        ".unreq qU74\n" ".unreq qU34\n" ".unreq qU17\n" ".unreq qU66\n"
-        ".unreq qU33\n" ".unreq qU57\n" ".unreq qU21\n"
-        ".unreq qW23\n" ".unreq qU42\n" ".unreq qV23\n" ".unreq qU23\n"
-        ".unreq qU76\n" ".unreq qU47\n" ".unreq qU64\n" ".unreq qU41\n"
-        ".unreq qU52\n" ".unreq qU54\n" ".unreq qU75\n" ".unreq qU26\n"
-        ".unreq qU53\n" ".unreq qU27\n"
-        ".unreq qV21\n" ".unreq qU65\n"
-        ".unreq qU31\n" ".unreq qU24\n" ".unreq qU36\n" ".unreq qU22\n"
-        ".unreq qU35\n" ".unreq qU63\n" ".unreq qW12\n"
-        ".unreq qV32\n" ".unreq qU16\n" ".unreq qW11\n" ".unreq qU12\n"
-        ".unreq qW31\n" ".unreq qW22\n" ".unreq qU71\n" ".unreq qV11\n"
-        ".unreq qW21\n" ".unreq qW32\n" ".unreq vW13\n"
-        ".unreq vU15\n" ".unreq vU73\n" ".unreq vU45\n" ".unreq vU14\n"
-        ".unreq vU62\n" ".unreq vV12\n"
-        ".unreq vU51\n" ".unreq vU43\n" ".unreq vU55\n"
-        ".unreq vU77\n" ".unreq vV13\n" ".unreq vV31\n" ".unreq vU44\n"
-        ".unreq vV33\n" ".unreq vU46\n" ".unreq vU11\n" ".unreq vU37\n"
-        ".unreq vU56\n" ".unreq vU25\n" ".unreq vU32\n"
-        ".unreq vU72\n" ".unreq vV22\n" ".unreq vW21\n" ".unreq vW32\n"
-        ".unreq vU67\n" ".unreq vU61\n" ".unreq vU13\n"
-        ".unreq vU74\n" ".unreq vU34\n" ".unreq vU17\n" ".unreq vU66\n"
-        ".unreq vU33\n" ".unreq vU57\n" ".unreq vU21\n" ".unreq vW23\n"
-        ".unreq vU42\n" ".unreq vV23\n" ".unreq vU23\n" ".unreq vW33\n"
-        ".unreq vU76\n" ".unreq vU47\n" ".unreq vU64\n" ".unreq vU41\n"
-        ".unreq vU52\n" ".unreq vU54\n" ".unreq vU75\n" ".unreq vU26\n"
-        ".unreq vU53\n" ".unreq vU27\n" ".unreq vV21\n" ".unreq vU65\n"
-        ".unreq vU31\n" ".unreq vU24\n" ".unreq vU36\n" ".unreq vU22\n"
-        ".unreq vU35\n" ".unreq vU63\n" ".unreq vW12\n"
-        ".unreq vV32\n" ".unreq vU16\n" ".unreq vW11\n" ".unreq vU12\n"
-        ".unreq vW31\n" ".unreq vW22\n" ".unreq vU71\n" ".unreq vV11\n"
-        : [uptr0] "+r" (uptr0), [wptr0] "+r" (wptr0), [vptr0] "+r" (vptr0),
-          [n_iters] "+r" (n_iters)
-        : [u_row_stride] "r" (in_row_stride * sizeof(float)),
-          [u_col_stride] "r" (in_col_stride * sizeof(float)),
-          [w_row_stride] "r" (weight_row_stride * sizeof(float)),
-          [w_col_stride] "r" (weight_col_stride * sizeof(float)),
-          [v_row_stride] "r" (out_row_stride * sizeof(float)),
-          [v_col_stride] "r" (out_col_stride * sizeof(float))
-        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
-          "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
-          "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0",
-          "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11",
-          "x12", "x13", "x14", "x15", "x16", "cc", "memory"
-    );
-  }
-  if (channels_remaining)
-  {
-    // Fall back on the unoptimised version to clean up the tail
-    ConvImpl::process_tile<false>(
-        channels_remaining,
-        wptr0, weight_row_stride, weight_col_stride,
-        uptr0, in_row_stride, in_col_stride,
-        vptr0, out_row_stride, out_col_stride,
-        0, 0, 0, 0, 0, 0
-    );
-  }
+  __asm __volatile(
+    "add x15, %[inptr0], %[input_row_stride]\n"
+    "add x26, %[input_col_stride1], %[input_col_stride1]\n"
+    "add x21, %[outptr0], %[output_row_stride]\n"
+    "add x16, x15, %[input_row_stride]\n"
+    "add x27, x26, %[input_col_stride1]\n"
+    "add x22, x21, %[output_row_stride]\n"
+    "add x17, x16, %[input_row_stride]\n"
+    "add x28, x27, %[input_col_stride1]\n"
+    "add x23, %[output_col_stride1], %[output_col_stride1]\n"
+    "add x18, x17, %[input_row_stride]\n"
+    "add x13, x28, %[input_col_stride1]\n"
+    "and x24, %[n_channels], #3\n"
+    "add x19, x18, %[input_row_stride]\n"
+    "add x14, x13, %[input_col_stride1]\n"
+    "lsr x25, %[n_channels], #2\n"
+    "add x20, x19, %[input_row_stride]\n"
+    "cbz x25, 4f\n"
+    "1:\n"
+    "ldr q27, [%[wbptr]]\n"
+    "subs x25, x25, #1\n"
+    "mov v17.16b, v27.16b\n"
+    "ldr q6, [%[wbptr], #16]\n"
+    "mov v16.16b, v27.16b\n"
+    "ldr q14, [%[wbptr], #32]\n"
+    "mov v15.16b, v27.16b\n"
+    "ldr q13, [%[wbptr], #48]\n"
+    "mov v2.16b, v27.16b\n"
+    "ldr q12, [%[wbptr], #64]\n"
+    "mov v4.16b, v27.16b\n"
+    "ldr q11, [%[wbptr], #80]\n"
+    "mov v5.16b, v27.16b\n"
+    "ldr q10, [%[wbptr], #96]\n"
+    "mov v1.16b, v27.16b\n"
+    "ldr q9, [%[wbptr], #112]\n"
+    "mov v3.16b, v27.16b\n"
+    "ldr q8, [%[wbptr], #128]\n"
+    "mov v0.16b, v27.16b\n"
+    "ldr q7, [%[wbptr], #144]\n"
+    "ldr q29, [%[inptr0]]\n"
+    "ldr q28, [x15]\n"
+    "ldr q26, [%[inptr0], %[input_col_stride1]]\n"
+    "ldr q22, [x16]\n"
+    "ldr q20, [x15, %[input_col_stride1]]\n"
+    "ldr q19, [%[inptr0], x26]\n"
+    "ldr q30, [x17]\n"
+    "ldr q18, [x16, %[input_col_stride1]]\n"
+    "beq 3f\n"
+    "2:\n"
+    "fmla v17.4s, v29.4s, v6.4s\n"
+    "ldr q21, [x15, x26]\n"
+    "fmla v16.4s, v22.4s, v6.4s\n"
+    "ldr q27, [%[inptr0], x27]\n"
+    "fmla v15.4s, v19.4s, v6.4s\n"
+    "add %[wbptr], %[wbptr], #160\n"
+    "fmla v17.4s, v28.4s, v12.4s\n"
+    "ldr q25, [x18]\n"
+    "fmla v16.4s, v30.4s, v12.4s\n"
+    "ldr q24, [x17, %[input_col_stride1]]\n"
+    "fmla v15.4s, v21.4s, v12.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v17.4s, v26.4s, v14.4s\n"
+    "ldr q23, [x16, x26]\n"
+    "fmla v16.4s, v18.4s, v14.4s\n"
+    "subs x25, x25, #1\n"
+    "fmla v15.4s, v27.4s, v14.4s\n"
+    "ldr q26, [x15, x27]\n"
+    "fmla v17.4s, v22.4s, v9.4s\n"
+    "ldr q22, [%[inptr0], x28]\n"
+    "fmla v16.4s, v25.4s, v9.4s\n"
+    "fmla v2.4s, v25.4s, v6.4s\n"
+    "fmla v15.4s, v23.4s, v9.4s\n"
+    "ldr q30, [x19]\n"
+    "fmla v17.4s, v20.4s, v11.4s\n"
+    "ldr q29, [x18, %[input_col_stride1]]\n"
+    "fmla v16.4s, v24.4s, v11.4s\n"
+    "ldr q28, [x17, x26]\n"
+    "fmla v4.4s, v23.4s, v6.4s\n"
+    "fmla v15.4s, v26.4s, v11.4s\n"
+    "fmla v17.4s, v19.4s, v13.4s\n"
+    "ldr q24, [x16, x27]\n"
+    "fmla v16.4s, v23.4s, v13.4s\n"
+    "ldr q25, [x15, x28]\n"
+    "fmla v15.4s, v22.4s, v13.4s\n"
+    "fmla v5.4s, v22.4s, v6.4s\n"
+    "fmla v17.4s, v18.4s, v8.4s\n"
+    "ldr q19, [%[inptr0], x13]\n"
+    "fmla v2.4s, v30.4s, v12.4s\n"
+    "ldr q18, [x20]\n"
+    "fmla v16.4s, v29.4s, v8.4s\n"
+    "ldr q22, [x19, %[input_col_stride1]]\n"
+    "fmla v17.4s, v21.4s, v10.4s\n"
+    "ldr q26, [x18, x26]\n"
+    "fmla v2.4s, v29.4s, v14.4s\n"
+    "ldr q20, [x17, x27]\n"
+    "fmla v16.4s, v28.4s, v10.4s\n"
+    "fmla v4.4s, v28.4s, v12.4s\n"
+    "fmla v17.4s, v23.4s, v7.4s\n"
+    "ldr q27, [x16, x28]\n"
+    "fmla v15.4s, v24.4s, v8.4s\n"
+    "ldr q30, [x15, x13]\n"
+    "fmla v4.4s, v24.4s, v14.4s\n"
+    "ldr q24, [%[inptr0], x14]\n"
+    "str q17, [%[outptr0]]\n"
+    "fmla v5.4s, v25.4s, v12.4s\n"
+    "fmla v15.4s, v25.4s, v10.4s\n"
+    "ldr q28, [x20, %[input_col_stride1]]\n"
+    "fmla v2.4s, v18.4s, v9.4s\n"
+    "ldr q17, [x19, x26]\n"
+    "fmla v5.4s, v19.4s, v14.4s\n"
+    "ldr q18, [x18, x27]\n"
+    "fmla v16.4s, v26.4s, v7.4s\n"
+    "ldr q25, [x17, x28]\n"
+    "fmla v2.4s, v22.4s, v11.4s\n"
+    "ldr q22, [x16, x13]\n"
+    "fmla v4.4s, v26.4s, v9.4s\n"
+    "add %[inptr0], %[inptr0], #16\n"
+    "str q16, [x21]\n"
+    "fmla v1.4s, v26.4s, v6.4s\n"
+    "fmla v2.4s, v26.4s, v13.4s\n"
+    "ldr q21, [x15, x14]\n"
+    "fmla v4.4s, v20.4s, v11.4s\n"
+    "ldr q23, [x20, x26]\n"
+    "fmla v15.4s, v27.4s, v7.4s\n"
+    "ldr q19, [x19, x27]\n"
+    "fmla v5.4s, v27.4s, v9.4s\n"
+    "add x15, x15, #16\n"
+    "fmla v4.4s, v27.4s, v13.4s\n"
+    "fmla v3.4s, v27.4s, v6.4s\n"
+    "str q15, [%[outptr0], %[output_col_stride1]]\n"
+    "fmla v2.4s, v28.4s, v8.4s\n"
+    "fmla v5.4s, v30.4s, v11.4s\n"
+    "ldr q29, [x18, x28]\n"
+    "fmla v1.4s, v17.4s, v12.4s\n"
+    "ldr q27, [x17, x13]\n"
+    "fmla v2.4s, v17.4s, v10.4s\n"
+    "ldr q28, [x16, x14]\n"
+    "fmla v5.4s, v24.4s, v13.4s\n"
+    "ldr q26, [x20, x27]\n"
+    "fmla v4.4s, v18.4s, v8.4s\n"
+    "ldr q20, [x19, x28]\n"
+    "fmla v1.4s, v18.4s, v14.4s\n"
+    "ldr q17, [x18, x13]\n"
+    "fmla v3.4s, v25.4s, v12.4s\n"
+    "ldr q18, [x17, x14]\n"
+    "fmla v4.4s, v25.4s, v10.4s\n"
+    "ldr q16, [x20, x28]\n"
+    "fmla v5.4s, v22.4s, v8.4s\n"
+    "add x16, x16, #16\n"
+    "fmla v3.4s, v22.4s, v14.4s\n"
+    "ldr q15, [x19, x13]\n"
+    "fmla v2.4s, v23.4s, v7.4s\n"
+    "add x17, x17, #16\n"
+    "fmla v5.4s, v21.4s, v10.4s\n"
+    "ldr q21, [x18, x14]\n"
+    "fmla v1.4s, v23.4s, v9.4s\n"
+    "ldr q23, [x20, x13]\n"
+    "str q2, [x22]\n"
+    "fmla v4.4s, v29.4s, v7.4s\n"
+    "fmla v3.4s, v29.4s, v9.4s\n"
+    "ldr q24, [x19, x14]\n"
+    "fmla v1.4s, v19.4s, v11.4s\n"
+    "ldr q25, [x20, x14]\n"
+    "str q4, [x21, %[output_col_stride1]]\n"
+    "fmla v0.4s, v29.4s, v6.4s\n"
+    "fmla v3.4s, v27.4s, v11.4s\n"
+    "ldr q27, [%[wbptr]]\n"
+    "fmla v1.4s, v29.4s, v13.4s\n"
+    "ldr q29, [%[inptr0]]\n"
+    "fmla v5.4s, v28.4s, v7.4s\n"
+    "ldr q6, [%[wbptr], #16]\n"
+    "fmla v3.4s, v28.4s, v13.4s\n"
+    "ldr q28, [x15]\n"
+    "fmla v1.4s, v26.4s, v8.4s\n"
+    "ldr q26, [%[inptr0], %[input_col_stride1]]\n"
+    "str q5, [%[outptr0], x23]\n"
+    "fmla v0.4s, v20.4s, v12.4s\n"
+    "fmla v3.4s, v17.4s, v8.4s\n"
+    "ldr q22, [x16]\n"
+    "fmla v1.4s, v20.4s, v10.4s\n"
+    "ldr q20, [x15, %[input_col_stride1]]\n"
+    "fmla v0.4s, v17.4s, v14.4s\n"
+    "ldr q12, [%[wbptr], #64]\n"
+    "fmla v3.4s, v18.4s, v10.4s\n"
+    "ldr q19, [%[inptr0], x26]\n"
+    "fmla v1.4s, v16.4s, v7.4s\n"
+    "ldr q30, [x17]\n"
+    "fmla v0.4s, v16.4s, v9.4s\n"
+    "ldr q14, [%[wbptr], #32]\n"
+    "fmla v3.4s, v21.4s, v7.4s\n"
+    "ldr q18, [x16, %[input_col_stride1]]\n"
+    "str q1, [x22, %[output_col_stride1]]\n"
+    "mov v17.16b, v27.16b\n"
+    "fmla v0.4s, v15.4s, v11.4s\n"
+    "ldr q9, [%[wbptr], #112]\n"
+    "str q3, [x21, x23]\n"
+    "mov v16.16b, v27.16b\n"
+    "mov v15.16b, v27.16b\n"
+    "add x18, x18, #16\n"
+    "fmla v0.4s, v21.4s, v13.4s\n"
+    "ldr q11, [%[wbptr], #80]\n"
+    "mov v2.16b, v27.16b\n"
+    "add x19, x19, #16\n"
+    "mov v4.16b, v27.16b\n"
+    "add x20, x20, #16\n"
+    "fmla v0.4s, v23.4s, v8.4s\n"
+    "ldr q13, [%[wbptr], #48]\n"
+    "mov v5.16b, v27.16b\n"
+    "add %[outptr0], %[outptr0], #16\n"
+    "mov v1.16b, v27.16b\n"
+    "add x21, x21, #16\n"
+    "fmla v0.4s, v24.4s, v10.4s\n"
+    "ldr q8, [%[wbptr], #128]\n"
+    "mov v3.16b, v27.16b\n"
+    "fmla v0.4s, v25.4s, v7.4s\n"
+    "ldr q10, [%[wbptr], #96]\n"
+    "str q0, [x22, x23]\n"
+    "mov v0.16b, v27.16b\n"
+    "ldr q7, [%[wbptr], #144]\n"
+    "add x22, x22, #16\n"
+    "bne 2b\n"
+    "3:\n"
+    "fmla v17.4s, v29.4s, v6.4s\n"
+    "ldr q21, [x15, x26]\n"
+    "fmla v16.4s, v22.4s, v6.4s\n"
+    "ldr q27, [%[inptr0], x27]\n"
+    "fmla v15.4s, v19.4s, v6.4s\n"
+    "add %[wbptr], %[wbptr], #160\n"
+    "fmla v17.4s, v28.4s, v12.4s\n"
+    "ldr q25, [x18]\n"
+    "fmla v16.4s, v30.4s, v12.4s\n"
+    "ldr q24, [x17, %[input_col_stride1]]\n"
+    "fmla v15.4s, v21.4s, v12.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v17.4s, v26.4s, v14.4s\n"
+    "ldr q23, [x16, x26]\n"
+    "fmla v16.4s, v18.4s, v14.4s\n"
+    "fmla v2.4s, v25.4s, v6.4s\n"
+    "fmla v15.4s, v27.4s, v14.4s\n"
+    "ldr q26, [x15, x27]\n"
+    "fmla v17.4s, v22.4s, v9.4s\n"
+    "ldr q22, [%[inptr0], x28]\n"
+    "fmla v16.4s, v25.4s, v9.4s\n"
+    "ldr q30, [x19]\n"
+    "fmla v15.4s, v23.4s, v9.4s\n"
+    "fmla v4.4s, v23.4s, v6.4s\n"
+    "fmla v17.4s, v20.4s, v11.4s\n"
+    "ldr q29, [x18, %[input_col_stride1]]\n"
+    "fmla v16.4s, v24.4s, v11.4s\n"
+    "ldr q28, [x17, x26]\n"
+    "fmla v15.4s, v26.4s, v11.4s\n"
+    "ldr q24, [x16, x27]\n"
+    "fmla v17.4s, v19.4s, v13.4s\n"
+    "ldr q25, [x15, x28]\n"
+    "fmla v16.4s, v23.4s, v13.4s\n"
+    "fmla v5.4s, v22.4s, v6.4s\n"
+    "fmla v15.4s, v22.4s, v13.4s\n"
+    "ldr q19, [%[inptr0], x13]\n"
+    "fmla v17.4s, v18.4s, v8.4s\n"
+    "ldr q18, [x20]\n"
+    "fmla v2.4s, v30.4s, v12.4s\n"
+    "ldr q22, [x19, %[input_col_stride1]]\n"
+    "fmla v16.4s, v29.4s, v8.4s\n"
+    "fmla v4.4s, v28.4s, v12.4s\n"
+    "fmla v17.4s, v21.4s, v10.4s\n"
+    "ldr q26, [x18, x26]\n"
+    "fmla v2.4s, v29.4s, v14.4s\n"
+    "ldr q20, [x17, x27]\n"
+    "fmla v16.4s, v28.4s, v10.4s\n"
+    "ldr q27, [x16, x28]\n"
+    "fmla v17.4s, v23.4s, v7.4s\n"
+    "ldr q30, [x15, x13]\n"
+    "fmla v15.4s, v24.4s, v8.4s\n"
+    "fmla v4.4s, v24.4s, v14.4s\n"
+    "fmla v5.4s, v25.4s, v12.4s\n"
+    "ldr q24, [%[inptr0], x14]\n"
+    "str q17, [%[outptr0]]\n"
+    "fmla v2.4s, v18.4s, v9.4s\n"
+    "fmla v15.4s, v25.4s, v10.4s\n"
+    "ldr q28, [x20, %[input_col_stride1]]\n"
+    "fmla v5.4s, v19.4s, v14.4s\n"
+    "ldr q17, [x19, x26]\n"
+    "fmla v2.4s, v22.4s, v11.4s\n"
+    "ldr q18, [x18, x27]\n"
+    "fmla v16.4s, v26.4s, v7.4s\n"
+    "ldr q25, [x17, x28]\n"
+    "fmla v4.4s, v26.4s, v9.4s\n"
+    "ldr q22, [x16, x13]\n"
+    "fmla v2.4s, v26.4s, v13.4s\n"
+    "add %[inptr0], %[inptr0], #16\n"
+    "str q16, [x21]\n"
+    "fmla v1.4s, v26.4s, v6.4s\n"
+    "fmla v4.4s, v20.4s, v11.4s\n"
+    "ldr q21, [x15, x14]\n"
+    "fmla v15.4s, v27.4s, v7.4s\n"
+    "ldr q23, [x20, x26]\n"
+    "fmla v5.4s, v27.4s, v9.4s\n"
+    "ldr q19, [x19, x27]\n"
+    "fmla v4.4s, v27.4s, v13.4s\n"
+    "add x15, x15, #16\n"
+    "str q15, [%[outptr0], %[output_col_stride1]]\n"
+    "fmla v3.4s, v27.4s, v6.4s\n"
+    "fmla v5.4s, v30.4s, v11.4s\n"
+    "ldr q29, [x18, x28]\n"
+    "fmla v2.4s, v28.4s, v8.4s\n"
+    "ldr q27, [x17, x13]\n"
+    "fmla v1.4s, v17.4s, v12.4s\n"
+    "ldr q28, [x16, x14]\n"
+    "fmla v5.4s, v24.4s, v13.4s\n"
+    "ldr q26, [x20, x27]\n"
+    "fmla v2.4s, v17.4s, v10.4s\n"
+    "ldr q20, [x19, x28]\n"
+    "fmla v4.4s, v18.4s, v8.4s\n"
+    "ldr q17, [x18, x13]\n"
+    "fmla v1.4s, v18.4s, v14.4s\n"
+    "ldr q18, [x17, x14]\n"
+    "fmla v3.4s, v25.4s, v12.4s\n"
+    "add x16, x16, #16\n"
+    "fmla v4.4s, v25.4s, v10.4s\n"
+    "ldr q16, [x20, x28]\n"
+    "fmla v5.4s, v22.4s, v8.4s\n"
+    "add x17, x17, #16\n"
+    "fmla v3.4s, v22.4s, v14.4s\n"
+    "ldr q15, [x19, x13]\n"
+    "fmla v2.4s, v23.4s, v7.4s\n"
+    "fmla v1.4s, v23.4s, v9.4s\n"
+    "fmla v5.4s, v21.4s, v10.4s\n"
+    "ldr q21, [x18, x14]\n"
+    "fmla v4.4s, v29.4s, v7.4s\n"
+    "ldr q23, [x20, x13]\n"
+    "str q2, [x22]\n"
+    "fmla v1.4s, v19.4s, v11.4s\n"
+    "fmla v3.4s, v29.4s, v9.4s\n"
+    "ldr q24, [x19, x14]\n"
+    "str q4, [x21, %[output_col_stride1]]\n"
+    "fmla v0.4s, v29.4s, v6.4s\n"
+    "fmla v1.4s, v29.4s, v13.4s\n"
+    "ldr q25, [x20, x14]\n"
+    "fmla v3.4s, v27.4s, v11.4s\n"
+    "add x18, x18, #16\n"
+    "fmla v5.4s, v28.4s, v7.4s\n"
+    "add x19, x19, #16\n"
+    "fmla v1.4s, v26.4s, v8.4s\n"
+    "add x20, x20, #16\n"
+    "fmla v3.4s, v28.4s, v13.4s\n"
+    "fmla v0.4s, v20.4s, v12.4s\n"
+    "str q5, [%[outptr0], x23]\n"
+    "fmla v1.4s, v20.4s, v10.4s\n"
+    "fmla v3.4s, v17.4s, v8.4s\n"
+    "add %[outptr0], %[outptr0], #16\n"
+    "fmla v0.4s, v17.4s, v14.4s\n"
+    "fmla v1.4s, v16.4s, v7.4s\n"
+    "fmla v3.4s, v18.4s, v10.4s\n"
+    "fmla v0.4s, v16.4s, v9.4s\n"
+    "str q1, [x22, %[output_col_stride1]]\n"
+    "fmla v3.4s, v21.4s, v7.4s\n"
+    "fmla v0.4s, v15.4s, v11.4s\n"
+    "str q3, [x21, x23]\n"
+    "fmla v0.4s, v21.4s, v13.4s\n"
+    "add x21, x21, #16\n"
+    "fmla v0.4s, v23.4s, v8.4s\n"
+    "fmla v0.4s, v24.4s, v10.4s\n"
+    "fmla v0.4s, v25.4s, v7.4s\n"
+    "str q0, [x22, x23]\n"
+    "add x22, x22, #16\n"
+    "4:\n"
+    "cbz x24, 7f\n"
+    "ldr s27, [%[wbptr]]\n"
+    "mov v17.16b, v27.16b\n"
+    "ldr s6, [%[wbptr], #4]\n"
+    "mov v16.16b, v27.16b\n"
+    "ldr s14, [%[wbptr], #8]\n"
+    "mov v15.16b, v27.16b\n"
+    "ldr s13, [%[wbptr], #12]\n"
+    "mov v2.16b, v27.16b\n"
+    "ldr s12, [%[wbptr], #16]\n"
+    "mov v4.16b, v27.16b\n"
+    "ldr s11, [%[wbptr], #20]\n"
+    "mov v5.16b, v27.16b\n"
+    "ldr s10, [%[wbptr], #24]\n"
+    "mov v1.16b, v27.16b\n"
+    "ldr s9, [%[wbptr], #28]\n"
+    "mov v3.16b, v27.16b\n"
+    "ldr s8, [%[wbptr], #32]\n"
+    "mov v0.16b, v27.16b\n"
+    "ldr s7, [%[wbptr], #36]\n"
+    "ldr s29, [%[inptr0]]\n"
+    "subs x24, x24, #1\n"
+    "ldr s28, [x15]\n"
+    "ldr s26, [%[inptr0], %[input_col_stride1]]\n"
+    "ldr s22, [x16]\n"
+    "ldr s20, [x15, %[input_col_stride1]]\n"
+    "ldr s19, [%[inptr0], x26]\n"
+    "ldr s30, [x17]\n"
+    "ldr s18, [x16, %[input_col_stride1]]\n"
+    "beq 6f\n"
+    "5:\n"
+    "fmla v17.4s, v29.4s, v6.4s\n"
+    "ldr s21, [x15, x26]\n"
+    "fmla v16.4s, v22.4s, v6.4s\n"
+    "ldr s27, [%[inptr0], x27]\n"
+    "fmla v15.4s, v19.4s, v6.4s\n"
+    "add %[wbptr], %[wbptr], #40\n"
+    "fmla v17.4s, v28.4s, v12.4s\n"
+    "ldr s25, [x18]\n"
+    "fmla v16.4s, v30.4s, v12.4s\n"
+    "ldr s24, [x17, %[input_col_stride1]]\n"
+    "fmla v15.4s, v21.4s, v12.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v17.4s, v26.4s, v14.4s\n"
+    "ldr s23, [x16, x26]\n"
+    "fmla v16.4s, v18.4s, v14.4s\n"
+    "subs x24, x24, #1\n"
+    "fmla v15.4s, v27.4s, v14.4s\n"
+    "ldr s26, [x15, x27]\n"
+    "fmla v17.4s, v22.4s, v9.4s\n"
+    "ldr s22, [%[inptr0], x28]\n"
+    "fmla v16.4s, v25.4s, v9.4s\n"
+    "fmla v2.4s, v25.4s, v6.4s\n"
+    "fmla v15.4s, v23.4s, v9.4s\n"
+    "ldr s30, [x19]\n"
+    "fmla v17.4s, v20.4s, v11.4s\n"
+    "ldr s29, [x18, %[input_col_stride1]]\n"
+    "fmla v16.4s, v24.4s, v11.4s\n"
+    "ldr s28, [x17, x26]\n"
+    "fmla v4.4s, v23.4s, v6.4s\n"
+    "fmla v15.4s, v26.4s, v11.4s\n"
+    "fmla v17.4s, v19.4s, v13.4s\n"
+    "ldr s24, [x16, x27]\n"
+    "fmla v16.4s, v23.4s, v13.4s\n"
+    "ldr s25, [x15, x28]\n"
+    "fmla v15.4s, v22.4s, v13.4s\n"
+    "fmla v5.4s, v22.4s, v6.4s\n"
+    "fmla v17.4s, v18.4s, v8.4s\n"
+    "ldr s19, [%[inptr0], x13]\n"
+    "fmla v2.4s, v30.4s, v12.4s\n"
+    "ldr s18, [x20]\n"
+    "fmla v16.4s, v29.4s, v8.4s\n"
+    "ldr s22, [x19, %[input_col_stride1]]\n"
+    "fmla v17.4s, v21.4s, v10.4s\n"
+    "ldr s26, [x18, x26]\n"
+    "fmla v2.4s, v29.4s, v14.4s\n"
+    "ldr s20, [x17, x27]\n"
+    "fmla v16.4s, v28.4s, v10.4s\n"
+    "fmla v4.4s, v28.4s, v12.4s\n"
+    "fmla v17.4s, v23.4s, v7.4s\n"
+    "ldr s27, [x16, x28]\n"
+    "fmla v15.4s, v24.4s, v8.4s\n"
+    "ldr s30, [x15, x13]\n"
+    "fmla v4.4s, v24.4s, v14.4s\n"
+    "ldr s24, [%[inptr0], x14]\n"
+    "str s17, [%[outptr0]]\n"
+    "fmla v5.4s, v25.4s, v12.4s\n"
+    "fmla v15.4s, v25.4s, v10.4s\n"
+    "ldr s28, [x20, %[input_col_stride1]]\n"
+    "fmla v2.4s, v18.4s, v9.4s\n"
+    "ldr s17, [x19, x26]\n"
+    "fmla v5.4s, v19.4s, v14.4s\n"
+    "ldr s18, [x18, x27]\n"
+    "fmla v16.4s, v26.4s, v7.4s\n"
+    "ldr s25, [x17, x28]\n"
+    "fmla v2.4s, v22.4s, v11.4s\n"
+    "ldr s22, [x16, x13]\n"
+    "fmla v4.4s, v26.4s, v9.4s\n"
+    "add %[inptr0], %[inptr0], #4\n"
+    "str s16, [x21]\n"
+    "fmla v1.4s, v26.4s, v6.4s\n"
+    "fmla v2.4s, v26.4s, v13.4s\n"
+    "ldr s21, [x15, x14]\n"
+    "fmla v4.4s, v20.4s, v11.4s\n"
+    "ldr s23, [x20, x26]\n"
+    "fmla v15.4s, v27.4s, v7.4s\n"
+    "ldr s19, [x19, x27]\n"
+    "fmla v5.4s, v27.4s, v9.4s\n"
+    "add x15, x15, #4\n"
+    "fmla v4.4s, v27.4s, v13.4s\n"
+    "fmla v3.4s, v27.4s, v6.4s\n"
+    "str s15, [%[outptr0], %[output_col_stride1]]\n"
+    "fmla v2.4s, v28.4s, v8.4s\n"
+    "fmla v5.4s, v30.4s, v11.4s\n"
+    "ldr s29, [x18, x28]\n"
+    "fmla v1.4s, v17.4s, v12.4s\n"
+    "ldr s27, [x17, x13]\n"
+    "fmla v2.4s, v17.4s, v10.4s\n"
+    "ldr s28, [x16, x14]\n"
+    "fmla v5.4s, v24.4s, v13.4s\n"
+    "ldr s26, [x20, x27]\n"
+    "fmla v4.4s, v18.4s, v8.4s\n"
+    "ldr s20, [x19, x28]\n"
+    "fmla v1.4s, v18.4s, v14.4s\n"
+    "ldr s17, [x18, x13]\n"
+    "fmla v3.4s, v25.4s, v12.4s\n"
+    "ldr s18, [x17, x14]\n"
+    "fmla v4.4s, v25.4s, v10.4s\n"
+    "ldr s16, [x20, x28]\n"
+    "fmla v5.4s, v22.4s, v8.4s\n"
+    "add x16, x16, #4\n"
+    "fmla v3.4s, v22.4s, v14.4s\n"
+    "ldr s15, [x19, x13]\n"
+    "fmla v2.4s, v23.4s, v7.4s\n"
+    "add x17, x17, #4\n"
+    "fmla v5.4s, v21.4s, v10.4s\n"
+    "ldr s21, [x18, x14]\n"
+    "fmla v1.4s, v23.4s, v9.4s\n"
+    "ldr s23, [x20, x13]\n"
+    "str s2, [x22]\n"
+    "fmla v4.4s, v29.4s, v7.4s\n"
+    "fmla v3.4s, v29.4s, v9.4s\n"
+    "ldr s24, [x19, x14]\n"
+    "fmla v1.4s, v19.4s, v11.4s\n"
+    "ldr s25, [x20, x14]\n"
+    "str s4, [x21, %[output_col_stride1]]\n"
+    "fmla v0.4s, v29.4s, v6.4s\n"
+    "fmla v3.4s, v27.4s, v11.4s\n"
+    "ldr s27, [%[wbptr]]\n"
+    "fmla v1.4s, v29.4s, v13.4s\n"
+    "ldr s29, [%[inptr0]]\n"
+    "fmla v5.4s, v28.4s, v7.4s\n"
+    "ldr s6, [%[wbptr], #4]\n"
+    "fmla v3.4s, v28.4s, v13.4s\n"
+    "ldr s28, [x15]\n"
+    "fmla v1.4s, v26.4s, v8.4s\n"
+    "ldr s26, [%[inptr0], %[input_col_stride1]]\n"
+    "str s5, [%[outptr0], x23]\n"
+    "fmla v0.4s, v20.4s, v12.4s\n"
+    "fmla v3.4s, v17.4s, v8.4s\n"
+    "ldr s22, [x16]\n"
+    "fmla v1.4s, v20.4s, v10.4s\n"
+    "ldr s20, [x15, %[input_col_stride1]]\n"
+    "fmla v0.4s, v17.4s, v14.4s\n"
+    "ldr s12, [%[wbptr], #16]\n"
+    "fmla v3.4s, v18.4s, v10.4s\n"
+    "ldr s19, [%[inptr0], x26]\n"
+    "fmla v1.4s, v16.4s, v7.4s\n"
+    "ldr s30, [x17]\n"
+    "fmla v0.4s, v16.4s, v9.4s\n"
+    "ldr s14, [%[wbptr], #8]\n"
+    "fmla v3.4s, v21.4s, v7.4s\n"
+    "ldr s18, [x16, %[input_col_stride1]]\n"
+    "str s1, [x22, %[output_col_stride1]]\n"
+    "mov v17.16b, v27.16b\n"
+    "fmla v0.4s, v15.4s, v11.4s\n"
+    "ldr s9, [%[wbptr], #28]\n"
+    "str s3, [x21, x23]\n"
+    "mov v16.16b, v27.16b\n"
+    "mov v15.16b, v27.16b\n"
+    "add x18, x18, #4\n"
+    "fmla v0.4s, v21.4s, v13.4s\n"
+    "ldr s11, [%[wbptr], #20]\n"
+    "mov v2.16b, v27.16b\n"
+    "add x19, x19, #4\n"
+    "mov v4.16b, v27.16b\n"
+    "add x20, x20, #4\n"
+    "fmla v0.4s, v23.4s, v8.4s\n"
+    "ldr s13, [%[wbptr], #12]\n"
+    "mov v5.16b, v27.16b\n"
+    "add %[outptr0], %[outptr0], #4\n"
+    "mov v1.16b, v27.16b\n"
+    "add x21, x21, #4\n"
+    "fmla v0.4s, v24.4s, v10.4s\n"
+    "ldr s8, [%[wbptr], #32]\n"
+    "mov v3.16b, v27.16b\n"
+    "fmla v0.4s, v25.4s, v7.4s\n"
+    "ldr s10, [%[wbptr], #24]\n"
+    "str s0, [x22, x23]\n"
+    "mov v0.16b, v27.16b\n"
+    "ldr s7, [%[wbptr], #36]\n"
+    "add x22, x22, #4\n"
+    "bne 5b\n"
+    "6:\n"
+    "fmla v17.4s, v29.4s, v6.4s\n"
+    "ldr s21, [x15, x26]\n"
+    "fmla v16.4s, v22.4s, v6.4s\n"
+    "ldr s27, [%[inptr0], x27]\n"
+    "fmla v15.4s, v19.4s, v6.4s\n"
+    "add %[wbptr], %[wbptr], #40\n"
+    "fmla v17.4s, v28.4s, v12.4s\n"
+    "ldr s25, [x18]\n"
+    "fmla v16.4s, v30.4s, v12.4s\n"
+    "ldr s24, [x17, %[input_col_stride1]]\n"
+    "fmla v15.4s, v21.4s, v12.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v17.4s, v26.4s, v14.4s\n"
+    "ldr s23, [x16, x26]\n"
+    "fmla v16.4s, v18.4s, v14.4s\n"
+    "fmla v2.4s, v25.4s, v6.4s\n"
+    "fmla v15.4s, v27.4s, v14.4s\n"
+    "ldr s26, [x15, x27]\n"
+    "fmla v17.4s, v22.4s, v9.4s\n"
+    "ldr s22, [%[inptr0], x28]\n"
+    "fmla v16.4s, v25.4s, v9.4s\n"
+    "ldr s30, [x19]\n"
+    "fmla v15.4s, v23.4s, v9.4s\n"
+    "fmla v4.4s, v23.4s, v6.4s\n"
+    "fmla v17.4s, v20.4s, v11.4s\n"
+    "ldr s29, [x18, %[input_col_stride1]]\n"
+    "fmla v16.4s, v24.4s, v11.4s\n"
+    "ldr s28, [x17, x26]\n"
+    "fmla v15.4s, v26.4s, v11.4s\n"
+    "ldr s24, [x16, x27]\n"
+    "fmla v17.4s, v19.4s, v13.4s\n"
+    "ldr s25, [x15, x28]\n"
+    "fmla v16.4s, v23.4s, v13.4s\n"
+    "fmla v5.4s, v22.4s, v6.4s\n"
+    "fmla v15.4s, v22.4s, v13.4s\n"
+    "ldr s19, [%[inptr0], x13]\n"
+    "fmla v17.4s, v18.4s, v8.4s\n"
+    "ldr s18, [x20]\n"
+    "fmla v2.4s, v30.4s, v12.4s\n"
+    "ldr s22, [x19, %[input_col_stride1]]\n"
+    "fmla v16.4s, v29.4s, v8.4s\n"
+    "fmla v4.4s, v28.4s, v12.4s\n"
+    "fmla v17.4s, v21.4s, v10.4s\n"
+    "ldr s26, [x18, x26]\n"
+    "fmla v2.4s, v29.4s, v14.4s\n"
+    "ldr s20, [x17, x27]\n"
+    "fmla v16.4s, v28.4s, v10.4s\n"
+    "ldr s27, [x16, x28]\n"
+    "fmla v17.4s, v23.4s, v7.4s\n"
+    "ldr s30, [x15, x13]\n"
+    "fmla v15.4s, v24.4s, v8.4s\n"
+    "fmla v4.4s, v24.4s, v14.4s\n"
+    "fmla v5.4s, v25.4s, v12.4s\n"
+    "ldr s24, [%[inptr0], x14]\n"
+    "str s17, [%[outptr0]]\n"
+    "fmla v2.4s, v18.4s, v9.4s\n"
+    "fmla v15.4s, v25.4s, v10.4s\n"
+    "ldr s28, [x20, %[input_col_stride1]]\n"
+    "fmla v5.4s, v19.4s, v14.4s\n"
+    "ldr s17, [x19, x26]\n"
+    "fmla v2.4s, v22.4s, v11.4s\n"
+    "ldr s18, [x18, x27]\n"
+    "fmla v16.4s, v26.4s, v7.4s\n"
+    "ldr s25, [x17, x28]\n"
+    "fmla v4.4s, v26.4s, v9.4s\n"
+    "ldr s22, [x16, x13]\n"
+    "fmla v2.4s, v26.4s, v13.4s\n"
+    "add %[inptr0], %[inptr0], #4\n"
+    "str s16, [x21]\n"
+    "fmla v1.4s, v26.4s, v6.4s\n"
+    "fmla v4.4s, v20.4s, v11.4s\n"
+    "ldr s21, [x15, x14]\n"
+    "fmla v15.4s, v27.4s, v7.4s\n"
+    "ldr s23, [x20, x26]\n"
+    "fmla v5.4s, v27.4s, v9.4s\n"
+    "ldr s19, [x19, x27]\n"
+    "fmla v4.4s, v27.4s, v13.4s\n"
+    "add x15, x15, #4\n"
+    "str s15, [%[outptr0], %[output_col_stride1]]\n"
+    "fmla v3.4s, v27.4s, v6.4s\n"
+    "fmla v5.4s, v30.4s, v11.4s\n"
+    "ldr s29, [x18, x28]\n"
+    "fmla v2.4s, v28.4s, v8.4s\n"
+    "ldr s27, [x17, x13]\n"
+    "fmla v1.4s, v17.4s, v12.4s\n"
+    "ldr s28, [x16, x14]\n"
+    "fmla v5.4s, v24.4s, v13.4s\n"
+    "ldr s26, [x20, x27]\n"
+    "fmla v2.4s, v17.4s, v10.4s\n"
+    "ldr s20, [x19, x28]\n"
+    "fmla v4.4s, v18.4s, v8.4s\n"
+    "ldr s17, [x18, x13]\n"
+    "fmla v1.4s, v18.4s, v14.4s\n"
+    "ldr s18, [x17, x14]\n"
+    "fmla v3.4s, v25.4s, v12.4s\n"
+    "add x16, x16, #4\n"
+    "fmla v4.4s, v25.4s, v10.4s\n"
+    "ldr s16, [x20, x28]\n"
+    "fmla v5.4s, v22.4s, v8.4s\n"
+    "add x17, x17, #4\n"
+    "fmla v3.4s, v22.4s, v14.4s\n"
+    "ldr s15, [x19, x13]\n"
+    "fmla v2.4s, v23.4s, v7.4s\n"
+    "fmla v1.4s, v23.4s, v9.4s\n"
+    "fmla v5.4s, v21.4s, v10.4s\n"
+    "ldr s21, [x18, x14]\n"
+    "fmla v4.4s, v29.4s, v7.4s\n"
+    "ldr s23, [x20, x13]\n"
+    "str s2, [x22]\n"
+    "fmla v1.4s, v19.4s, v11.4s\n"
+    "fmla v3.4s, v29.4s, v9.4s\n"
+    "ldr s24, [x19, x14]\n"
+    "str s4, [x21, %[output_col_stride1]]\n"
+    "fmla v0.4s, v29.4s, v6.4s\n"
+    "fmla v1.4s, v29.4s, v13.4s\n"
+    "ldr s25, [x20, x14]\n"
+    "fmla v3.4s, v27.4s, v11.4s\n"
+    "add x18, x18, #4\n"
+    "fmla v5.4s, v28.4s, v7.4s\n"
+    "add x19, x19, #4\n"
+    "fmla v1.4s, v26.4s, v8.4s\n"
+    "add x20, x20, #4\n"
+    "fmla v3.4s, v28.4s, v13.4s\n"
+    "fmla v0.4s, v20.4s, v12.4s\n"
+    "str s5, [%[outptr0], x23]\n"
+    "fmla v1.4s, v20.4s, v10.4s\n"
+    "fmla v3.4s, v17.4s, v8.4s\n"
+    "add %[outptr0], %[outptr0], #4\n"
+    "fmla v0.4s, v17.4s, v14.4s\n"
+    "fmla v1.4s, v16.4s, v7.4s\n"
+    "fmla v3.4s, v18.4s, v10.4s\n"
+    "fmla v0.4s, v16.4s, v9.4s\n"
+    "str s1, [x22, %[output_col_stride1]]\n"
+    "fmla v3.4s, v21.4s, v7.4s\n"
+    "fmla v0.4s, v15.4s, v11.4s\n"
+    "str s3, [x21, x23]\n"
+    "fmla v0.4s, v21.4s, v13.4s\n"
+    "add x21, x21, #4\n"
+    "fmla v0.4s, v23.4s, v8.4s\n"
+    "fmla v0.4s, v24.4s, v10.4s\n"
+    "fmla v0.4s, v25.4s, v7.4s\n"
+    "str s0, [x22, x23]\n"
+    "add x22, x22, #4\n"
+    "7:\n"
+    : [wbptr] "+r" (weight_bias_ptr), [inptr0] "+r" (input), [outptr0] "+r" (output)
+    : [n_channels] "r" ((long long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float))
+    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x13", "x14", "memory"
+  );
 }
-
 #endif  // __aarch64__
 
-template <>
-const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
+template class DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>;
 
-template <>
-const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
-  ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
-  ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
-  ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
-  ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 5, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 5, 0, 1, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 5, 0, 2, 0>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 6, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 6, 0, 1, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 6, 0, 2, 0>,
-  },
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 1>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 2>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 1>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 2>,
-  },
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
-
-template class DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float>;
 }  // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp16_fp16.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp16_fp16.cpp
deleted file mode 100644
index 33b55df..0000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp16_fp16.cpp
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "impl_fp16_fp16.hpp"
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-namespace depthwise
-{
-using Conv = DepthwiseConvolution<4, 4, 3, 3, 1, 1, float16_t, float16_t>;
-using ConvImpl = DepthwiseConvolutionImpl<4, 4, 3, 3, 1, 1, float16_t, float16_t>;
-
-template <>
-const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
-
-template <>
-const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
-        ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
-        ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
-        {
-                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 0, 3, 0>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 1, 0, 3, 0>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 2, 0, 3, 0>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 3, 0, 3, 0>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 4, 0, 3, 0>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 5, 0, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 5, 0, 1, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 5, 0, 2, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 5, 0, 3, 0>,
-        },
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
-        {
-                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 3>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 3>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 3>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 3>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 3>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 1>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 2>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 3>,
-        },
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
-
-template class DepthwiseConvolution<4, 4, 3, 3, 1, 1, float16_t, float16_t>;
-}  // namespace depthwise
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
index c36c24e..a583615 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,1468 +25,5994 @@
 
 namespace depthwise
 {
-using Conv = DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float>;
-using ConvImpl = DepthwiseConvolutionImpl<4, 4, 3, 3, 1, 1, float, float>;
+
+using namespace neon_convolution_kernels;
+using Conv = DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>;
 
 #ifdef __aarch64__
-
 template <>
 template <>
-void ConvImpl::process_tile<true, 0, 0, 0, 0, 0, 0>(
-  const int n_channels,
-  const float* const weights,
-  const int weight_row_stride,
-  const int weight_col_stride,
-  const float* const inptr,
-  const int in_row_stride,
-  const int in_col_stride,
-  float* const outptr,
-  const int out_row_stride,
-  const int out_col_stride,
-  const int, const int, const int, const int, const int, const int, const int, const int
+void Conv::execute_tile<ActivationFunction::None>(
+  int n_channels,
+  const void *weight_bias_ptr,
+  const float *input,
+  const unsigned int input_row_stride,
+  const unsigned int input_col_stride,
+  float *output,
+  const unsigned int output_row_stride,
+  const unsigned int output_col_stride
 )
 {
-  constexpr auto inner_tile_rows = DWC::inner_tile_rows;
-  constexpr auto inner_tile_cols = DWC::inner_tile_cols;
-  constexpr auto kernel_rows = DWC::kernel_rows;
-  constexpr auto kernel_cols = DWC::kernel_cols;
-  constexpr auto output_tile_rows = DWC::output_tile_rows;
-  constexpr auto output_tile_cols = DWC::output_tile_cols;
-  constexpr auto stride_rows = DWC::stride_rows;
-  constexpr auto stride_cols = DWC::stride_cols;
-
-  // Extract parameters
-  const int in_pad_top = 0;
-  const int in_pad_left = 0;
-  const int in_pad_bottom = 0;
-  const int in_pad_right = 0;
-  const int out_pad_bottom = 0;
-  const int out_pad_right = 0;
-
-  // Compute valid ranges of the tile
-  const int in_cells_i = inner_tile_rows - in_pad_bottom;
-  const int in_cells_j = inner_tile_cols - in_pad_right;
-  const int out_cells_i = output_tile_rows - out_pad_bottom;
-  const int out_cells_j = output_tile_cols - out_pad_right;
-
-  // Copy pointers
-  const float *uptr0 = inptr;
-  const float *wptr0 = weights;
-  float *vptr0 = outptr;
-  const bool same_strides = (
-    weight_col_stride == in_col_stride &&
-    weight_col_stride == out_col_stride
+  __asm __volatile(
+    "add x8, %[inptr0], %[input_row_stride]\n"
+    "add x15, %[input_col_stride1], %[input_col_stride1]\n"
+    "add x23, %[outptr0], %[output_row_stride]\n"
+    "add x9, x8, %[input_row_stride]\n"
+    "add x16, x15, #64\n"
+    "add x17, x15, %[input_col_stride1]\n"
+    "add x10, x9, %[input_row_stride]\n"
+    "add x18, x17, #64\n"
+    "add x19, x17, %[input_col_stride1]\n"
+    "add x11, x10, %[input_row_stride]\n"
+    "add x20, x19, #64\n"
+    "add x21, x19, %[input_col_stride1]\n"
+    "add x12, x11, %[input_row_stride]\n"
+    "add x22, x21, #64\n"
+    "add x24, x23, %[output_row_stride]\n"
+    "add x25, x24, %[output_row_stride]\n"
+    "add x26, %[output_col_stride1], %[output_col_stride1]\n"
+    "and x13, %[n_channels], #3\n"
+    "add x27, x26, %[output_col_stride1]\n"
+    "lsr x14, %[n_channels], #2\n"
+    "cbz x14, 4f\n"
+    "1:\n"
+    "ldr q14, [%[wbptr]]\n"
+    "subs x14, x14, #1\n"
+    "mov v17.16b, v14.16b\n"
+    "ldr q12, [%[wbptr], #16]\n"
+    "mov v23.16b, v14.16b\n"
+    "ldr q11, [%[wbptr], #32]\n"
+    "mov v24.16b, v14.16b\n"
+    "ldr q10, [%[wbptr], #48]\n"
+    "mov v20.16b, v14.16b\n"
+    "ldr q9, [%[wbptr], #64]\n"
+    "mov v16.16b, v14.16b\n"
+    "ldr q8, [%[wbptr], #80]\n"
+    "mov v13.16b, v14.16b\n"
+    "ldr q7, [%[wbptr], #96]\n"
+    "mov v0.16b, v14.16b\n"
+    "ldr q6, [%[wbptr], #112]\n"
+    "mov v1.16b, v14.16b\n"
+    "ldr q5, [%[wbptr], #128]\n"
+    "mov v2.16b, v14.16b\n"
+    "ldr q4, [%[wbptr], #144]\n"
+    "mov v3.16b, v14.16b\n"
+    "ldr q29, [%[inptr0]]\n"
+    "fmla v17.4s, v29.4s, v12.4s\n"
+    "ldr q28, [x8]\n"
+    "ldr q30, [%[inptr0], %[input_col_stride1]]\n"
+    "ldr q25, [x9]\n"
+    "ldr q26, [x8, %[input_col_stride1]]\n"
+    "ldr q27, [%[inptr0], x15]\n"
+    "ldr q15, [x10]\n"
+    "ldr q18, [x9, %[input_col_stride1]]\n"
+    "prfm pldl1keep, [%[inptr0], #64]\n"
+    "prfm pldl1keep, [x8, #64]\n"
+    "prfm pldl1keep, [%[inptr0], x28]\n"
+    "prfm pldl1keep, [x9, #64]\n"
+    "prfm pldl1keep, [x8, x28]\n"
+    "prfm pldl1keep, [%[inptr0], x16]\n"
+    "prfm pldl1keep, [x10, #64]\n"
+    "prfm pldl1keep, [x9, x28]\n"
+    "beq 3f\n"
+    "2:\n"
+    "fmla v17.4s, v28.4s, v9.4s\n"
+    "prfm pldl1keep, [x8, x16]\n"
+    "fmla v23.4s, v28.4s, v12.4s\n"
+    "ldr q22, [x8, x15]\n"
+    "fmla v24.4s, v30.4s, v12.4s\n"
+    "prfm pldl1keep, [%[inptr0], x18]\n"
+    "fmla v17.4s, v30.4s, v11.4s\n"
+    "ldr q29, [%[inptr0], x17]\n"
+    "fmla v23.4s, v25.4s, v9.4s\n"
+    "prfm pldl1keep, [x11, #64]\n"
+    "fmla v20.4s, v25.4s, v12.4s\n"
+    "prfm pldl1keep, [x10, x28]\n"
+    "fmla v17.4s, v25.4s, v6.4s\n"
+    "ldr q25, [x11]\n"
+    "fmla v23.4s, v26.4s, v11.4s\n"
+    "prfm pldl1keep, [x9, x16]\n"
+    "fmla v24.4s, v26.4s, v9.4s\n"
+    "prfm pldl1keep, [x8, x18]\n"
+    "fmla v17.4s, v26.4s, v8.4s\n"
+    "prfm pldl1keep, [%[inptr0], x20]\n"
+    "fmla v16.4s, v26.4s, v12.4s\n"
+    "ldr q28, [x10, %[input_col_stride1]]\n"
+    "fmla v24.4s, v27.4s, v11.4s\n"
+    "prfm pldl1keep, [x12, #64]\n"
+    "fmla v17.4s, v27.4s, v10.4s\n"
+    "prfm pldl1keep, [x11, x28]\n"
+    "fmla v13.4s, v27.4s, v12.4s\n"
+    "ldr q19, [x9, x15]\n"
+    "fmla v23.4s, v15.4s, v6.4s\n"
+    "prfm pldl1keep, [x10, x16]\n"
+    "fmla v20.4s, v15.4s, v9.4s\n"
+    "prfm pldl1keep, [x9, x18]\n"
+    "fmla v0.4s, v15.4s, v12.4s\n"
+    "ldr q21, [x8, x17]\n"
+    "fmla v17.4s, v18.4s, v5.4s\n"
+    "prfm pldl1keep, [x8, x20]\n"
+    "fmla v23.4s, v18.4s, v8.4s\n"
+    "prfm pldl1keep, [%[inptr0], x22]\n"
+    "fmla v24.4s, v18.4s, v6.4s\n"
+    "prfm pldl1keep, [x12, x28]\n"
+    "fmla v20.4s, v18.4s, v11.4s\n"
+    "prfm pldl1keep, [x11, x16]\n"
+    "fmla v16.4s, v18.4s, v9.4s\n"
+    "prfm pldl1keep, [x10, x18]\n"
+    "fmla v1.4s, v18.4s, v12.4s\n"
+    "ldr q27, [%[inptr0], x19]\n"
+    "fmla v17.4s, v22.4s, v7.4s\n"
+    "prfm pldl1keep, [x9, x20]\n"
+    "fmla v23.4s, v22.4s, v10.4s\n"
+    "prfm pldl1keep, [x8, x22]\n"
+    "fmla v24.4s, v22.4s, v8.4s\n"
+    "prfm pldl1keep, [x12, x16]\n"
+    "fmla v16.4s, v22.4s, v11.4s\n"
+    "prfm pldl1keep, [x11, x18]\n"
+    "fmla v13.4s, v22.4s, v9.4s\n"
+    "prfm pldl1keep, [x10, x20]\n"
+    "fmla v2.4s, v22.4s, v12.4s\n"
+    "ldr q18, [x12]\n"
+    "fmla v24.4s, v29.4s, v10.4s\n"
+    "prfm pldl1keep, [x9, x22]\n"
+    "fmla v13.4s, v29.4s, v11.4s\n"
+    "prfm pldl1keep, [x12, x18]\n"
+    "fmla v3.4s, v29.4s, v12.4s\n"
+    "ldr q22, [x11, %[input_col_stride1]]\n"
+    "fmla v20.4s, v25.4s, v6.4s\n"
+    "prfm pldl1keep, [x11, x20]\n"
+    "fmla v0.4s, v25.4s, v9.4s\n"
+    "ldr q25, [x10, x15]\n"
+    "fmla v23.4s, v28.4s, v5.4s\n"
+    "prfm pldl1keep, [x10, x22]\n"
+    "fmla v20.4s, v28.4s, v8.4s\n"
+    "prfm pldl1keep, [x12, x20]\n"
+    "fmla v16.4s, v28.4s, v6.4s\n"
+    "prfm pldl1keep, [x11, x22]\n"
+    "fmla v0.4s, v28.4s, v11.4s\n"
+    "prfm pldl1keep, [x12, x22]\n"
+    "fmla v1.4s, v28.4s, v9.4s\n"
+    "add %[wbptr], %[wbptr], #160\n"
+    "fmla v17.4s, v19.4s, v4.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v23.4s, v19.4s, v7.4s\n"
+    "subs x14, x14, #1\n"
+    "fmla v24.4s, v19.4s, v5.4s\n"
+    "fmla v20.4s, v19.4s, v10.4s\n"
+    "str q17, [%[outptr0]]\n"
+    "mov v15.16b, v14.16b\n"
+    "fmla v16.4s, v19.4s, v8.4s\n"
+    "fmla v13.4s, v19.4s, v6.4s\n"
+    "fmla v15.4s, v28.4s, v12.4s\n"
+    "ldr q29, [x9, x17]\n"
+    "fmla v1.4s, v19.4s, v11.4s\n"
+    "fmla v2.4s, v19.4s, v9.4s\n"
+    "fmla v24.4s, v21.4s, v7.4s\n"
+    "fmla v16.4s, v21.4s, v10.4s\n"
+    "fmla v13.4s, v21.4s, v8.4s\n"
+    "fmla v3.4s, v21.4s, v9.4s\n"
+    "fmla v2.4s, v21.4s, v11.4s\n"
+    "fmla v0.4s, v18.4s, v6.4s\n"
+    "mov v18.16b, v14.16b\n"
+    "fmla v20.4s, v22.4s, v5.4s\n"
+    "fmla v13.4s, v27.4s, v10.4s\n"
+    "fmla v3.4s, v27.4s, v11.4s\n"
+    "mov v17.16b, v14.16b\n"
+    "fmla v18.4s, v19.4s, v12.4s\n"
+    "mov v19.16b, v14.16b\n"
+    "fmla v0.4s, v22.4s, v8.4s\n"
+    "fmla v17.4s, v21.4s, v12.4s\n"
+    "ldr q26, [x8, x19]\n"
+    "fmla v1.4s, v22.4s, v6.4s\n"
+    "fmla v15.4s, v22.4s, v9.4s\n"
+    "mov v22.16b, v14.16b\n"
+    "mov v21.16b, v14.16b\n"
+    "fmla v23.4s, v25.4s, v4.4s\n"
+    "fmla v20.4s, v25.4s, v7.4s\n"
+    "fmla v16.4s, v25.4s, v5.4s\n"
+    "fmla v0.4s, v25.4s, v10.4s\n"
+    "fmla v1.4s, v25.4s, v8.4s\n"
+    "fmla v2.4s, v25.4s, v6.4s\n"
+    "str q23, [x23]\n"
+    "fmla v15.4s, v25.4s, v11.4s\n"
+    "fmla v18.4s, v25.4s, v9.4s\n"
+    "ldr q28, [%[inptr0], x21]\n"
+    "fmla v19.4s, v25.4s, v12.4s\n"
+    "ldr q30, [x12, %[input_col_stride1]]\n"
+    "fmla v24.4s, v29.4s, v4.4s\n"
+    "add %[inptr0], %[inptr0], #16\n"
+    "fmla v16.4s, v29.4s, v7.4s\n"
+    "prfm pldl1keep, [%[inptr0], #64]\n"
+    "fmla v13.4s, v29.4s, v5.4s\n"
+    "prfm pldl1keep, [%[inptr0], x28]\n"
+    "str q24, [%[outptr0], %[output_col_stride1]]\n"
+    "fmla v1.4s, v29.4s, v10.4s\n"
+    "fmla v2.4s, v29.4s, v8.4s\n"
+    "ldr q27, [x11, x15]\n"
+    "fmla v3.4s, v29.4s, v6.4s\n"
+    "prfm pldl1keep, [%[inptr0], x16]\n"
+    "fmla v18.4s, v29.4s, v11.4s\n"
+    "fmla v17.4s, v29.4s, v9.4s\n"
+    "fmla v22.4s, v29.4s, v12.4s\n"
+    "ldr q23, [x10, x17]\n"
+    "fmla v13.4s, v26.4s, v7.4s\n"
+    "fmla v2.4s, v26.4s, v10.4s\n"
+    "fmla v3.4s, v26.4s, v8.4s\n"
+    "fmla v17.4s, v26.4s, v11.4s\n"
+    "fmla v0.4s, v30.4s, v5.4s\n"
+    "ldr q24, [x9, x19]\n"
+    "fmla v15.4s, v30.4s, v6.4s\n"
+    "ldr q29, [x8, x21]\n"
+    "fmla v3.4s, v28.4s, v10.4s\n"
+    "ldr q14, [x12, x15]\n"
+    "fmla v20.4s, v27.4s, v4.4s\n"
+    "add x8, x8, #16\n"
+    "fmla v0.4s, v27.4s, v7.4s\n"
+    "prfm pldl1keep, [x8, #64]\n"
+    "fmla v1.4s, v27.4s, v5.4s\n"
+    "prfm pldl1keep, [x8, x28]\n"
+    "str q20, [x24]\n"
+    "fmla v15.4s, v27.4s, v8.4s\n"
+    "fmla v18.4s, v27.4s, v6.4s\n"
+    "ldr q25, [x11, x17]\n"
+    "fmla v19.4s, v27.4s, v9.4s\n"
+    "ldr q30, [x10, x19]\n"
+    "fmla v16.4s, v23.4s, v4.4s\n"
+    "fmla v1.4s, v23.4s, v7.4s\n"
+    "fmla v2.4s, v23.4s, v5.4s\n"
+    "fmla v15.4s, v23.4s, v10.4s\n"
+    "fmla v18.4s, v23.4s, v8.4s\n"
+    "fmla v17.4s, v23.4s, v6.4s\n"
+    "str q16, [x23, %[output_col_stride1]]\n"
+    "fmla v19.4s, v23.4s, v11.4s\n"
+    "fmla v22.4s, v23.4s, v9.4s\n"
+    "ldr q26, [x9, x21]\n"
+    "fmla v21.4s, v23.4s, v12.4s\n"
+    "ldr q27, [x12, x17]\n"
+    "fmla v13.4s, v24.4s, v4.4s\n"
+    "ldr q20, [x11, x19]\n"
+    "fmla v2.4s, v24.4s, v7.4s\n"
+    "add x9, x9, #16\n"
+    "fmla v3.4s, v24.4s, v5.4s\n"
+    "prfm pldl1keep, [x9, #64]\n"
+    "str q13, [%[outptr0], x26]\n"
+    "fmla v18.4s, v24.4s, v10.4s\n"
+    "fmla v17.4s, v24.4s, v8.4s\n"
+    "ldr q23, [x10, x21]\n"
+    "fmla v22.4s, v24.4s, v11.4s\n"
+    "ldr q24, [x12, x19]\n"
+    "fmla v3.4s, v29.4s, v7.4s\n"
+    "prfm pldl1keep, [x9, x28]\n"
+    "fmla v17.4s, v29.4s, v10.4s\n"
+    "ldr q16, [x11, x21]\n"
+    "fmla v0.4s, v14.4s, v4.4s\n"
+    "add x10, x10, #16\n"
+    "fmla v15.4s, v14.4s, v5.4s\n"
+    "prfm pldl1keep, [x10, #64]\n"
+    "fmla v19.4s, v14.4s, v6.4s\n"
+    "ldr q13, [x12, x21]\n"
+    "str q0, [x25]\n"
+    "fmla v1.4s, v25.4s, v4.4s\n"
+    "fmla v15.4s, v25.4s, v7.4s\n"
+    "ldr q14, [%[wbptr]]\n"
+    "fmla v18.4s, v25.4s, v5.4s\n"
+    "add x11, x11, #16\n"
+    "str q1, [x24, %[output_col_stride1]]\n"
+    "fmla v19.4s, v25.4s, v8.4s\n"
+    "fmla v22.4s, v25.4s, v6.4s\n"
+    "ldr q12, [%[wbptr], #16]\n"
+    "fmla v21.4s, v25.4s, v9.4s\n"
+    "ldr q29, [%[inptr0]]\n"
+    "fmla v2.4s, v30.4s, v4.4s\n"
+    "ldr q28, [x8]\n"
+    "fmla v18.4s, v30.4s, v7.4s\n"
+    "add x12, x12, #16\n"
+    "fmla v17.4s, v30.4s, v5.4s\n"
+    "fmla v19.4s, v30.4s, v10.4s\n"
+    "str q2, [x23, x26]\n"
+    "fmla v22.4s, v30.4s, v8.4s\n"
+    "fmla v21.4s, v30.4s, v11.4s\n"
+    "ldr q9, [%[wbptr], #64]\n"
+    "fmla v3.4s, v26.4s, v4.4s\n"
+    "ldr q30, [%[inptr0], %[input_col_stride1]]\n"
+    "fmla v17.4s, v26.4s, v7.4s\n"
+    "ldr q25, [x9]\n"
+    "fmla v22.4s, v26.4s, v10.4s\n"
+    "ldr q11, [%[wbptr], #32]\n"
+    "str q3, [%[outptr0], x27]\n"
+    "fmla v15.4s, v27.4s, v4.4s\n"
+    "fmla v19.4s, v27.4s, v5.4s\n"
+    "ldr q26, [x8, %[input_col_stride1]]\n"
+    "fmla v21.4s, v27.4s, v6.4s\n"
+    "ldr q27, [%[inptr0], x15]\n"
+    "str q15, [x25, %[output_col_stride1]]\n"
+    "fmla v18.4s, v20.4s, v4.4s\n"
+    "fmla v19.4s, v20.4s, v7.4s\n"
+    "ldr q15, [x10]\n"
+    "fmla v22.4s, v20.4s, v5.4s\n"
+    "ldr q6, [%[wbptr], #112]\n"
+    "str q18, [x24, x26]\n"
+    "fmla v21.4s, v20.4s, v8.4s\n"
+    "fmla v17.4s, v23.4s, v4.4s\n"
+    "ldr q18, [x9, %[input_col_stride1]]\n"
+    "fmla v22.4s, v23.4s, v7.4s\n"
+    "add %[outptr0], %[outptr0], #16\n"
+    "fmla v21.4s, v23.4s, v10.4s\n"
+    "ldr q8, [%[wbptr], #80]\n"
+    "str q17, [x23, x27]\n"
+    "fmla v19.4s, v24.4s, v4.4s\n"
+    "fmla v22.4s, v16.4s, v4.4s\n"
+    "add x23, x23, #16\n"
+    "fmla v21.4s, v24.4s, v5.4s\n"
+    "ldr q10, [%[wbptr], #48]\n"
+    "str q19, [x25, x26]\n"
+    "mov v17.16b, v14.16b\n"
+    "str q22, [x24, x27]\n"
+    "mov v23.16b, v14.16b\n"
+    "fmla v21.4s, v16.4s, v7.4s\n"
+    "ldr q5, [%[wbptr], #128]\n"
+    "mov v24.16b, v14.16b\n"
+    "add x24, x24, #16\n"
+    "mov v20.16b, v14.16b\n"
+    "mov v16.16b, v14.16b\n"
+    "fmla v21.4s, v13.4s, v4.4s\n"
+    "ldr q7, [%[wbptr], #96]\n"
+    "mov v13.16b, v14.16b\n"
+    "mov v0.16b, v14.16b\n"
+    "mov v1.16b, v14.16b\n"
+    "mov v2.16b, v14.16b\n"
+    "str q21, [x25, x27]\n"
+    "mov v3.16b, v14.16b\n"
+    "ldr q4, [%[wbptr], #144]\n"
+    "add x25, x25, #16\n"
+    "fmla v17.4s, v29.4s, v12.4s\n"
+    "bne 2b\n"
+    "3:\n"
+    "fmla v17.4s, v28.4s, v9.4s\n"
+    "prfm pldl1keep, [x8, x16]\n"
+    "fmla v23.4s, v28.4s, v12.4s\n"
+    "ldr q22, [x8, x15]\n"
+    "fmla v24.4s, v30.4s, v12.4s\n"
+    "prfm pldl1keep, [%[inptr0], x18]\n"
+    "fmla v17.4s, v30.4s, v11.4s\n"
+    "ldr q29, [%[inptr0], x17]\n"
+    "fmla v23.4s, v25.4s, v9.4s\n"
+    "prfm pldl1keep, [x11, #64]\n"
+    "fmla v20.4s, v25.4s, v12.4s\n"
+    "prfm pldl1keep, [x10, x28]\n"
+    "fmla v17.4s, v25.4s, v6.4s\n"
+    "ldr q25, [x11]\n"
+    "fmla v23.4s, v26.4s, v11.4s\n"
+    "prfm pldl1keep, [x9, x16]\n"
+    "fmla v24.4s, v26.4s, v9.4s\n"
+    "prfm pldl1keep, [x8, x18]\n"
+    "fmla v17.4s, v26.4s, v8.4s\n"
+    "prfm pldl1keep, [%[inptr0], x20]\n"
+    "fmla v16.4s, v26.4s, v12.4s\n"
+    "ldr q28, [x10, %[input_col_stride1]]\n"
+    "fmla v24.4s, v27.4s, v11.4s\n"
+    "prfm pldl1keep, [x12, #64]\n"
+    "fmla v17.4s, v27.4s, v10.4s\n"
+    "prfm pldl1keep, [x11, x28]\n"
+    "fmla v13.4s, v27.4s, v12.4s\n"
+    "ldr q19, [x9, x15]\n"
+    "fmla v23.4s, v15.4s, v6.4s\n"
+    "prfm pldl1keep, [x10, x16]\n"
+    "fmla v20.4s, v15.4s, v9.4s\n"
+    "prfm pldl1keep, [x9, x18]\n"
+    "fmla v0.4s, v15.4s, v12.4s\n"
+    "ldr q21, [x8, x17]\n"
+    "fmla v17.4s, v18.4s, v5.4s\n"
+    "prfm pldl1keep, [x8, x20]\n"
+    "fmla v23.4s, v18.4s, v8.4s\n"
+    "prfm pldl1keep, [%[inptr0], x22]\n"
+    "fmla v24.4s, v18.4s, v6.4s\n"
+    "prfm pldl1keep, [x12, x28]\n"
+    "fmla v20.4s, v18.4s, v11.4s\n"
+    "prfm pldl1keep, [x11, x16]\n"
+    "fmla v16.4s, v18.4s, v9.4s\n"
+    "prfm pldl1keep, [x10, x18]\n"
+    "fmla v1.4s, v18.4s, v12.4s\n"
+    "ldr q27, [%[inptr0], x19]\n"
+    "fmla v17.4s, v22.4s, v7.4s\n"
+    "prfm pldl1keep, [x9, x20]\n"
+    "fmla v23.4s, v22.4s, v10.4s\n"
+    "prfm pldl1keep, [x8, x22]\n"
+    "fmla v24.4s, v22.4s, v8.4s\n"
+    "prfm pldl1keep, [x12, x16]\n"
+    "fmla v16.4s, v22.4s, v11.4s\n"
+    "prfm pldl1keep, [x11, x18]\n"
+    "fmla v13.4s, v22.4s, v9.4s\n"
+    "prfm pldl1keep, [x10, x20]\n"
+    "fmla v2.4s, v22.4s, v12.4s\n"
+    "ldr q18, [x12]\n"
+    "fmla v24.4s, v29.4s, v10.4s\n"
+    "prfm pldl1keep, [x9, x22]\n"
+    "fmla v13.4s, v29.4s, v11.4s\n"
+    "prfm pldl1keep, [x12, x18]\n"
+    "fmla v3.4s, v29.4s, v12.4s\n"
+    "ldr q22, [x11, %[input_col_stride1]]\n"
+    "fmla v20.4s, v25.4s, v6.4s\n"
+    "prfm pldl1keep, [x11, x20]\n"
+    "fmla v0.4s, v25.4s, v9.4s\n"
+    "ldr q25, [x10, x15]\n"
+    "fmla v23.4s, v28.4s, v5.4s\n"
+    "prfm pldl1keep, [x10, x22]\n"
+    "fmla v20.4s, v28.4s, v8.4s\n"
+    "prfm pldl1keep, [x12, x20]\n"
+    "fmla v16.4s, v28.4s, v6.4s\n"
+    "prfm pldl1keep, [x11, x22]\n"
+    "fmla v0.4s, v28.4s, v11.4s\n"
+    "prfm pldl1keep, [x12, x22]\n"
+    "fmla v1.4s, v28.4s, v9.4s\n"
+    "add %[wbptr], %[wbptr], #160\n"
+    "fmla v17.4s, v19.4s, v4.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v23.4s, v19.4s, v7.4s\n"
+    "fmla v24.4s, v19.4s, v5.4s\n"
+    "fmla v20.4s, v19.4s, v10.4s\n"
+    "fmla v16.4s, v19.4s, v8.4s\n"
+    "str q17, [%[outptr0]]\n"
+    "mov v15.16b, v14.16b\n"
+    "fmla v13.4s, v19.4s, v6.4s\n"
+    "fmla v1.4s, v19.4s, v11.4s\n"
+    "fmla v15.4s, v28.4s, v12.4s\n"
+    "ldr q29, [x9, x17]\n"
+    "fmla v2.4s, v19.4s, v9.4s\n"
+    "fmla v24.4s, v21.4s, v7.4s\n"
+    "fmla v16.4s, v21.4s, v10.4s\n"
+    "fmla v13.4s, v21.4s, v8.4s\n"
+    "fmla v3.4s, v21.4s, v9.4s\n"
+    "fmla v0.4s, v18.4s, v6.4s\n"
+    "mov v18.16b, v14.16b\n"
+    "fmla v2.4s, v21.4s, v11.4s\n"
+    "fmla v13.4s, v27.4s, v10.4s\n"
+    "fmla v20.4s, v22.4s, v5.4s\n"
+    "fmla v18.4s, v19.4s, v12.4s\n"
+    "ldr q26, [x8, x19]\n"
+    "fmla v3.4s, v27.4s, v11.4s\n"
+    "ldr q28, [%[inptr0], x21]\n"
+    "fmla v0.4s, v22.4s, v8.4s\n"
+    "add %[inptr0], %[inptr0], #16\n"
+    "fmla v1.4s, v22.4s, v6.4s\n"
+    "fmla v15.4s, v22.4s, v9.4s\n"
+    "mov v17.16b, v14.16b\n"
+    "fmla v23.4s, v25.4s, v4.4s\n"
+    "fmla v20.4s, v25.4s, v7.4s\n"
+    "fmla v16.4s, v25.4s, v5.4s\n"
+    "fmla v17.4s, v21.4s, v12.4s\n"
+    "ldr q30, [x12, %[input_col_stride1]]\n"
+    "str q23, [x23]\n"
+    "mov v19.16b, v14.16b\n"
+    "fmla v0.4s, v25.4s, v10.4s\n"
+    "fmla v1.4s, v25.4s, v8.4s\n"
+    "fmla v2.4s, v25.4s, v6.4s\n"
+    "fmla v15.4s, v25.4s, v11.4s\n"
+    "fmla v18.4s, v25.4s, v9.4s\n"
+    "fmla v19.4s, v25.4s, v12.4s\n"
+    "mov v22.16b, v14.16b\n"
+    "mov v21.16b, v14.16b\n"
+    "fmla v24.4s, v29.4s, v4.4s\n"
+    "fmla v16.4s, v29.4s, v7.4s\n"
+    "fmla v13.4s, v29.4s, v5.4s\n"
+    "fmla v1.4s, v29.4s, v10.4s\n"
+    "fmla v2.4s, v29.4s, v8.4s\n"
+    "fmla v3.4s, v29.4s, v6.4s\n"
+    "str q24, [%[outptr0], %[output_col_stride1]]\n"
+    "fmla v18.4s, v29.4s, v11.4s\n"
+    "fmla v17.4s, v29.4s, v9.4s\n"
+    "ldr q27, [x11, x15]\n"
+    "fmla v22.4s, v29.4s, v12.4s\n"
+    "ldr q23, [x10, x17]\n"
+    "fmla v13.4s, v26.4s, v7.4s\n"
+    "fmla v2.4s, v26.4s, v10.4s\n"
+    "fmla v3.4s, v26.4s, v8.4s\n"
+    "fmla v17.4s, v26.4s, v11.4s\n"
+    "fmla v0.4s, v30.4s, v5.4s\n"
+    "ldr q24, [x9, x19]\n"
+    "fmla v15.4s, v30.4s, v6.4s\n"
+    "ldr q29, [x8, x21]\n"
+    "fmla v3.4s, v28.4s, v10.4s\n"
+    "ldr q14, [x12, x15]\n"
+    "fmla v20.4s, v27.4s, v4.4s\n"
+    "add x8, x8, #16\n"
+    "fmla v0.4s, v27.4s, v7.4s\n"
+    "fmla v1.4s, v27.4s, v5.4s\n"
+    "fmla v15.4s, v27.4s, v8.4s\n"
+    "fmla v18.4s, v27.4s, v6.4s\n"
+    "str q20, [x24]\n"
+    "fmla v19.4s, v27.4s, v9.4s\n"
+    "fmla v16.4s, v23.4s, v4.4s\n"
+    "ldr q25, [x11, x17]\n"
+    "fmla v1.4s, v23.4s, v7.4s\n"
+    "ldr q30, [x10, x19]\n"
+    "fmla v2.4s, v23.4s, v5.4s\n"
+    "fmla v15.4s, v23.4s, v10.4s\n"
+    "str q16, [x23, %[output_col_stride1]]\n"
+    "fmla v18.4s, v23.4s, v8.4s\n"
+    "fmla v17.4s, v23.4s, v6.4s\n"
+    "ldr q26, [x9, x21]\n"
+    "fmla v19.4s, v23.4s, v11.4s\n"
+    "add x9, x9, #16\n"
+    "fmla v22.4s, v23.4s, v9.4s\n"
+    "fmla v21.4s, v23.4s, v12.4s\n"
+    "fmla v13.4s, v24.4s, v4.4s\n"
+    "ldr q27, [x12, x17]\n"
+    "fmla v2.4s, v24.4s, v7.4s\n"
+    "ldr q20, [x11, x19]\n"
+    "fmla v3.4s, v24.4s, v5.4s\n"
+    "fmla v18.4s, v24.4s, v10.4s\n"
+    "str q13, [%[outptr0], x26]\n"
+    "fmla v17.4s, v24.4s, v8.4s\n"
+    "fmla v22.4s, v24.4s, v11.4s\n"
+    "ldr q23, [x10, x21]\n"
+    "fmla v3.4s, v29.4s, v7.4s\n"
+    "ldr q24, [x12, x19]\n"
+    "fmla v17.4s, v29.4s, v10.4s\n"
+    "ldr q16, [x11, x21]\n"
+    "fmla v0.4s, v14.4s, v4.4s\n"
+    "add x10, x10, #16\n"
+    "fmla v15.4s, v14.4s, v5.4s\n"
+    "add x11, x11, #16\n"
+    "fmla v19.4s, v14.4s, v6.4s\n"
+    "ldr q13, [x12, x21]\n"
+    "str q0, [x25]\n"
+    "fmla v1.4s, v25.4s, v4.4s\n"
+    "fmla v15.4s, v25.4s, v7.4s\n"
+    "add x12, x12, #16\n"
+    "fmla v18.4s, v25.4s, v5.4s\n"
+    "fmla v19.4s, v25.4s, v8.4s\n"
+    "str q1, [x24, %[output_col_stride1]]\n"
+    "fmla v22.4s, v25.4s, v6.4s\n"
+    "fmla v21.4s, v25.4s, v9.4s\n"
+    "fmla v2.4s, v30.4s, v4.4s\n"
+    "fmla v18.4s, v30.4s, v7.4s\n"
+    "fmla v17.4s, v30.4s, v5.4s\n"
+    "fmla v19.4s, v30.4s, v10.4s\n"
+    "fmla v22.4s, v30.4s, v8.4s\n"
+    "str q2, [x23, x26]\n"
+    "fmla v21.4s, v30.4s, v11.4s\n"
+    "fmla v3.4s, v26.4s, v4.4s\n"
+    "fmla v17.4s, v26.4s, v7.4s\n"
+    "fmla v22.4s, v26.4s, v10.4s\n"
+    "fmla v15.4s, v27.4s, v4.4s\n"
+    "fmla v19.4s, v27.4s, v5.4s\n"
+    "fmla v21.4s, v27.4s, v6.4s\n"
+    "str q3, [%[outptr0], x27]\n"
+    "fmla v18.4s, v20.4s, v4.4s\n"
+    "str q15, [x25, %[output_col_stride1]]\n"
+    "fmla v22.4s, v20.4s, v5.4s\n"
+    "fmla v19.4s, v20.4s, v7.4s\n"
+    "add %[outptr0], %[outptr0], #16\n"
+    "str q18, [x24, x26]\n"
+    "fmla v21.4s, v20.4s, v8.4s\n"
+    "fmla v17.4s, v23.4s, v4.4s\n"
+    "fmla v22.4s, v23.4s, v7.4s\n"
+    "fmla v19.4s, v24.4s, v4.4s\n"
+    "fmla v21.4s, v23.4s, v10.4s\n"
+    "str q17, [x23, x27]\n"
+    "fmla v22.4s, v16.4s, v4.4s\n"
+    "str q19, [x25, x26]\n"
+    "add x23, x23, #16\n"
+    "fmla v21.4s, v24.4s, v5.4s\n"
+    "str q22, [x24, x27]\n"
+    "add x24, x24, #16\n"
+    "fmla v21.4s, v16.4s, v7.4s\n"
+    "fmla v21.4s, v13.4s, v4.4s\n"
+    "str q21, [x25, x27]\n"
+    "add x25, x25, #16\n"
+    "4:\n"
+    "cbz x13, 7f\n"
+    "ldr s14, [%[wbptr]]\n"
+    "mov v17.16b, v14.16b\n"
+    "ldr s12, [%[wbptr], #4]\n"
+    "mov v23.16b, v14.16b\n"
+    "ldr s11, [%[wbptr], #8]\n"
+    "mov v24.16b, v14.16b\n"
+    "ldr s10, [%[wbptr], #12]\n"
+    "mov v20.16b, v14.16b\n"
+    "ldr s9, [%[wbptr], #16]\n"
+    "mov v16.16b, v14.16b\n"
+    "ldr s8, [%[wbptr], #20]\n"
+    "mov v13.16b, v14.16b\n"
+    "ldr s7, [%[wbptr], #24]\n"
+    "mov v0.16b, v14.16b\n"
+    "ldr s6, [%[wbptr], #28]\n"
+    "mov v1.16b, v14.16b\n"
+    "ldr s5, [%[wbptr], #32]\n"
+    "mov v2.16b, v14.16b\n"
+    "ldr s4, [%[wbptr], #36]\n"
+    "mov v3.16b, v14.16b\n"
+    "ldr s29, [%[inptr0]]\n"
+    "fmla v17.4s, v29.4s, v12.4s\n"
+    "ldr s28, [x8]\n"
+    "ldr s30, [%[inptr0], %[input_col_stride1]]\n"
+    "subs x13, x13, #1\n"
+    "ldr s25, [x9]\n"
+    "ldr s26, [x8, %[input_col_stride1]]\n"
+    "ldr s27, [%[inptr0], x15]\n"
+    "ldr s15, [x10]\n"
+    "ldr s18, [x9, %[input_col_stride1]]\n"
+    "prfm pldl1keep, [%[inptr0], #64]\n"
+    "prfm pldl1keep, [x8, #64]\n"
+    "prfm pldl1keep, [%[inptr0], x28]\n"
+    "prfm pldl1keep, [x9, #64]\n"
+    "prfm pldl1keep, [x8, x28]\n"
+    "prfm pldl1keep, [%[inptr0], x16]\n"
+    "prfm pldl1keep, [x10, #64]\n"
+    "prfm pldl1keep, [x9, x28]\n"
+    "beq 6f\n"
+    "5:\n"
+    "fmla v17.4s, v28.4s, v9.4s\n"
+    "prfm pldl1keep, [x8, x16]\n"
+    "fmla v23.4s, v28.4s, v12.4s\n"
+    "ldr s22, [x8, x15]\n"
+    "fmla v24.4s, v30.4s, v12.4s\n"
+    "prfm pldl1keep, [%[inptr0], x18]\n"
+    "fmla v17.4s, v30.4s, v11.4s\n"
+    "ldr s29, [%[inptr0], x17]\n"
+    "fmla v23.4s, v25.4s, v9.4s\n"
+    "prfm pldl1keep, [x11, #64]\n"
+    "fmla v20.4s, v25.4s, v12.4s\n"
+    "prfm pldl1keep, [x10, x28]\n"
+    "fmla v17.4s, v25.4s, v6.4s\n"
+    "ldr s25, [x11]\n"
+    "fmla v23.4s, v26.4s, v11.4s\n"
+    "prfm pldl1keep, [x9, x16]\n"
+    "fmla v24.4s, v26.4s, v9.4s\n"
+    "prfm pldl1keep, [x8, x18]\n"
+    "fmla v17.4s, v26.4s, v8.4s\n"
+    "prfm pldl1keep, [%[inptr0], x20]\n"
+    "fmla v16.4s, v26.4s, v12.4s\n"
+    "ldr s28, [x10, %[input_col_stride1]]\n"
+    "fmla v24.4s, v27.4s, v11.4s\n"
+    "prfm pldl1keep, [x12, #64]\n"
+    "fmla v17.4s, v27.4s, v10.4s\n"
+    "prfm pldl1keep, [x11, x28]\n"
+    "fmla v13.4s, v27.4s, v12.4s\n"
+    "ldr s19, [x9, x15]\n"
+    "fmla v23.4s, v15.4s, v6.4s\n"
+    "prfm pldl1keep, [x10, x16]\n"
+    "fmla v20.4s, v15.4s, v9.4s\n"
+    "prfm pldl1keep, [x9, x18]\n"
+    "fmla v0.4s, v15.4s, v12.4s\n"
+    "ldr s21, [x8, x17]\n"
+    "fmla v17.4s, v18.4s, v5.4s\n"
+    "prfm pldl1keep, [x8, x20]\n"
+    "fmla v23.4s, v18.4s, v8.4s\n"
+    "prfm pldl1keep, [%[inptr0], x22]\n"
+    "fmla v24.4s, v18.4s, v6.4s\n"
+    "prfm pldl1keep, [x12, x28]\n"
+    "fmla v20.4s, v18.4s, v11.4s\n"
+    "prfm pldl1keep, [x11, x16]\n"
+    "fmla v16.4s, v18.4s, v9.4s\n"
+    "prfm pldl1keep, [x10, x18]\n"
+    "fmla v1.4s, v18.4s, v12.4s\n"
+    "ldr s27, [%[inptr0], x19]\n"
+    "fmla v17.4s, v22.4s, v7.4s\n"
+    "prfm pldl1keep, [x9, x20]\n"
+    "fmla v23.4s, v22.4s, v10.4s\n"
+    "prfm pldl1keep, [x8, x22]\n"
+    "fmla v24.4s, v22.4s, v8.4s\n"
+    "prfm pldl1keep, [x12, x16]\n"
+    "fmla v16.4s, v22.4s, v11.4s\n"
+    "prfm pldl1keep, [x11, x18]\n"
+    "fmla v13.4s, v22.4s, v9.4s\n"
+    "prfm pldl1keep, [x10, x20]\n"
+    "fmla v2.4s, v22.4s, v12.4s\n"
+    "ldr s18, [x12]\n"
+    "fmla v24.4s, v29.4s, v10.4s\n"
+    "prfm pldl1keep, [x9, x22]\n"
+    "fmla v13.4s, v29.4s, v11.4s\n"
+    "prfm pldl1keep, [x12, x18]\n"
+    "fmla v3.4s, v29.4s, v12.4s\n"
+    "ldr s22, [x11, %[input_col_stride1]]\n"
+    "fmla v20.4s, v25.4s, v6.4s\n"
+    "prfm pldl1keep, [x11, x20]\n"
+    "fmla v0.4s, v25.4s, v9.4s\n"
+    "ldr s25, [x10, x15]\n"
+    "fmla v23.4s, v28.4s, v5.4s\n"
+    "prfm pldl1keep, [x10, x22]\n"
+    "fmla v20.4s, v28.4s, v8.4s\n"
+    "prfm pldl1keep, [x12, x20]\n"
+    "fmla v16.4s, v28.4s, v6.4s\n"
+    "prfm pldl1keep, [x11, x22]\n"
+    "fmla v0.4s, v28.4s, v11.4s\n"
+    "prfm pldl1keep, [x12, x22]\n"
+    "fmla v1.4s, v28.4s, v9.4s\n"
+    "add %[wbptr], %[wbptr], #40\n"
+    "fmla v17.4s, v19.4s, v4.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v23.4s, v19.4s, v7.4s\n"
+    "subs x13, x13, #1\n"
+    "fmla v24.4s, v19.4s, v5.4s\n"
+    "fmla v20.4s, v19.4s, v10.4s\n"
+    "str s17, [%[outptr0]]\n"
+    "mov v15.16b, v14.16b\n"
+    "fmla v16.4s, v19.4s, v8.4s\n"
+    "fmla v13.4s, v19.4s, v6.4s\n"
+    "fmla v15.4s, v28.4s, v12.4s\n"
+    "ldr s29, [x9, x17]\n"
+    "fmla v1.4s, v19.4s, v11.4s\n"
+    "fmla v2.4s, v19.4s, v9.4s\n"
+    "fmla v24.4s, v21.4s, v7.4s\n"
+    "fmla v16.4s, v21.4s, v10.4s\n"
+    "fmla v13.4s, v21.4s, v8.4s\n"
+    "fmla v3.4s, v21.4s, v9.4s\n"
+    "fmla v2.4s, v21.4s, v11.4s\n"
+    "fmla v0.4s, v18.4s, v6.4s\n"
+    "mov v18.16b, v14.16b\n"
+    "fmla v20.4s, v22.4s, v5.4s\n"
+    "fmla v13.4s, v27.4s, v10.4s\n"
+    "fmla v3.4s, v27.4s, v11.4s\n"
+    "mov v17.16b, v14.16b\n"
+    "fmla v18.4s, v19.4s, v12.4s\n"
+    "mov v19.16b, v14.16b\n"
+    "fmla v0.4s, v22.4s, v8.4s\n"
+    "fmla v17.4s, v21.4s, v12.4s\n"
+    "ldr s26, [x8, x19]\n"
+    "fmla v1.4s, v22.4s, v6.4s\n"
+    "fmla v15.4s, v22.4s, v9.4s\n"
+    "mov v22.16b, v14.16b\n"
+    "mov v21.16b, v14.16b\n"
+    "fmla v23.4s, v25.4s, v4.4s\n"
+    "fmla v20.4s, v25.4s, v7.4s\n"
+    "fmla v16.4s, v25.4s, v5.4s\n"
+    "fmla v0.4s, v25.4s, v10.4s\n"
+    "fmla v1.4s, v25.4s, v8.4s\n"
+    "fmla v2.4s, v25.4s, v6.4s\n"
+    "str s23, [x23]\n"
+    "fmla v15.4s, v25.4s, v11.4s\n"
+    "fmla v18.4s, v25.4s, v9.4s\n"
+    "ldr s28, [%[inptr0], x21]\n"
+    "fmla v19.4s, v25.4s, v12.4s\n"
+    "ldr s30, [x12, %[input_col_stride1]]\n"
+    "fmla v24.4s, v29.4s, v4.4s\n"
+    "add %[inptr0], %[inptr0], #4\n"
+    "fmla v16.4s, v29.4s, v7.4s\n"
+    "prfm pldl1keep, [%[inptr0], #64]\n"
+    "fmla v13.4s, v29.4s, v5.4s\n"
+    "prfm pldl1keep, [%[inptr0], x28]\n"
+    "str s24, [%[outptr0], %[output_col_stride1]]\n"
+    "fmla v1.4s, v29.4s, v10.4s\n"
+    "fmla v2.4s, v29.4s, v8.4s\n"
+    "ldr s27, [x11, x15]\n"
+    "fmla v3.4s, v29.4s, v6.4s\n"
+    "prfm pldl1keep, [%[inptr0], x16]\n"
+    "fmla v18.4s, v29.4s, v11.4s\n"
+    "fmla v17.4s, v29.4s, v9.4s\n"
+    "fmla v22.4s, v29.4s, v12.4s\n"
+    "ldr s23, [x10, x17]\n"
+    "fmla v13.4s, v26.4s, v7.4s\n"
+    "fmla v2.4s, v26.4s, v10.4s\n"
+    "fmla v3.4s, v26.4s, v8.4s\n"
+    "fmla v17.4s, v26.4s, v11.4s\n"
+    "fmla v0.4s, v30.4s, v5.4s\n"
+    "ldr s24, [x9, x19]\n"
+    "fmla v15.4s, v30.4s, v6.4s\n"
+    "ldr s29, [x8, x21]\n"
+    "fmla v3.4s, v28.4s, v10.4s\n"
+    "ldr s14, [x12, x15]\n"
+    "fmla v20.4s, v27.4s, v4.4s\n"
+    "add x8, x8, #4\n"
+    "fmla v0.4s, v27.4s, v7.4s\n"
+    "prfm pldl1keep, [x8, #64]\n"
+    "fmla v1.4s, v27.4s, v5.4s\n"
+    "prfm pldl1keep, [x8, x28]\n"
+    "str s20, [x24]\n"
+    "fmla v15.4s, v27.4s, v8.4s\n"
+    "fmla v18.4s, v27.4s, v6.4s\n"
+    "ldr s25, [x11, x17]\n"
+    "fmla v19.4s, v27.4s, v9.4s\n"
+    "ldr s30, [x10, x19]\n"
+    "fmla v16.4s, v23.4s, v4.4s\n"
+    "fmla v1.4s, v23.4s, v7.4s\n"
+    "fmla v2.4s, v23.4s, v5.4s\n"
+    "fmla v15.4s, v23.4s, v10.4s\n"
+    "fmla v18.4s, v23.4s, v8.4s\n"
+    "fmla v17.4s, v23.4s, v6.4s\n"
+    "str s16, [x23, %[output_col_stride1]]\n"
+    "fmla v19.4s, v23.4s, v11.4s\n"
+    "fmla v22.4s, v23.4s, v9.4s\n"
+    "ldr s26, [x9, x21]\n"
+    "fmla v21.4s, v23.4s, v12.4s\n"
+    "ldr s27, [x12, x17]\n"
+    "fmla v13.4s, v24.4s, v4.4s\n"
+    "ldr s20, [x11, x19]\n"
+    "fmla v2.4s, v24.4s, v7.4s\n"
+    "add x9, x9, #4\n"
+    "fmla v3.4s, v24.4s, v5.4s\n"
+    "prfm pldl1keep, [x9, #64]\n"
+    "str s13, [%[outptr0], x26]\n"
+    "fmla v18.4s, v24.4s, v10.4s\n"
+    "fmla v17.4s, v24.4s, v8.4s\n"
+    "ldr s23, [x10, x21]\n"
+    "fmla v22.4s, v24.4s, v11.4s\n"
+    "ldr s24, [x12, x19]\n"
+    "fmla v3.4s, v29.4s, v7.4s\n"
+    "prfm pldl1keep, [x9, x28]\n"
+    "fmla v17.4s, v29.4s, v10.4s\n"
+    "ldr s16, [x11, x21]\n"
+    "fmla v0.4s, v14.4s, v4.4s\n"
+    "add x10, x10, #4\n"
+    "fmla v15.4s, v14.4s, v5.4s\n"
+    "prfm pldl1keep, [x10, #64]\n"
+    "fmla v19.4s, v14.4s, v6.4s\n"
+    "ldr s13, [x12, x21]\n"
+    "str s0, [x25]\n"
+    "fmla v1.4s, v25.4s, v4.4s\n"
+    "fmla v15.4s, v25.4s, v7.4s\n"
+    "ldr s14, [%[wbptr]]\n"
+    "fmla v18.4s, v25.4s, v5.4s\n"
+    "add x11, x11, #4\n"
+    "str s1, [x24, %[output_col_stride1]]\n"
+    "fmla v19.4s, v25.4s, v8.4s\n"
+    "fmla v22.4s, v25.4s, v6.4s\n"
+    "ldr s12, [%[wbptr], #4]\n"
+    "fmla v21.4s, v25.4s, v9.4s\n"
+    "ldr s29, [%[inptr0]]\n"
+    "fmla v2.4s, v30.4s, v4.4s\n"
+    "ldr s28, [x8]\n"
+    "fmla v18.4s, v30.4s, v7.4s\n"
+    "add x12, x12, #4\n"
+    "fmla v17.4s, v30.4s, v5.4s\n"
+    "fmla v19.4s, v30.4s, v10.4s\n"
+    "str s2, [x23, x26]\n"
+    "fmla v22.4s, v30.4s, v8.4s\n"
+    "fmla v21.4s, v30.4s, v11.4s\n"
+    "ldr s9, [%[wbptr], #16]\n"
+    "fmla v3.4s, v26.4s, v4.4s\n"
+    "ldr s30, [%[inptr0], %[input_col_stride1]]\n"
+    "fmla v17.4s, v26.4s, v7.4s\n"
+    "ldr s25, [x9]\n"
+    "fmla v22.4s, v26.4s, v10.4s\n"
+    "ldr s11, [%[wbptr], #8]\n"
+    "str s3, [%[outptr0], x27]\n"
+    "fmla v15.4s, v27.4s, v4.4s\n"
+    "fmla v19.4s, v27.4s, v5.4s\n"
+    "ldr s26, [x8, %[input_col_stride1]]\n"
+    "fmla v21.4s, v27.4s, v6.4s\n"
+    "ldr s27, [%[inptr0], x15]\n"
+    "str s15, [x25, %[output_col_stride1]]\n"
+    "fmla v18.4s, v20.4s, v4.4s\n"
+    "fmla v19.4s, v20.4s, v7.4s\n"
+    "ldr s15, [x10]\n"
+    "fmla v22.4s, v20.4s, v5.4s\n"
+    "ldr s6, [%[wbptr], #28]\n"
+    "str s18, [x24, x26]\n"
+    "fmla v21.4s, v20.4s, v8.4s\n"
+    "fmla v17.4s, v23.4s, v4.4s\n"
+    "ldr s18, [x9, %[input_col_stride1]]\n"
+    "fmla v22.4s, v23.4s, v7.4s\n"
+    "add %[outptr0], %[outptr0], #4\n"
+    "fmla v21.4s, v23.4s, v10.4s\n"
+    "ldr s8, [%[wbptr], #20]\n"
+    "str s17, [x23, x27]\n"
+    "fmla v19.4s, v24.4s, v4.4s\n"
+    "fmla v22.4s, v16.4s, v4.4s\n"
+    "add x23, x23, #4\n"
+    "fmla v21.4s, v24.4s, v5.4s\n"
+    "ldr s10, [%[wbptr], #12]\n"
+    "str s19, [x25, x26]\n"
+    "mov v17.16b, v14.16b\n"
+    "str s22, [x24, x27]\n"
+    "mov v23.16b, v14.16b\n"
+    "fmla v21.4s, v16.4s, v7.4s\n"
+    "ldr s5, [%[wbptr], #32]\n"
+    "mov v24.16b, v14.16b\n"
+    "add x24, x24, #4\n"
+    "mov v20.16b, v14.16b\n"
+    "mov v16.16b, v14.16b\n"
+    "fmla v21.4s, v13.4s, v4.4s\n"
+    "ldr s7, [%[wbptr], #24]\n"
+    "mov v13.16b, v14.16b\n"
+    "mov v0.16b, v14.16b\n"
+    "mov v1.16b, v14.16b\n"
+    "mov v2.16b, v14.16b\n"
+    "str s21, [x25, x27]\n"
+    "mov v3.16b, v14.16b\n"
+    "ldr s4, [%[wbptr], #36]\n"
+    "add x25, x25, #4\n"
+    "fmla v17.4s, v29.4s, v12.4s\n"
+    "bne 5b\n"
+    "6:\n"
+    "fmla v17.4s, v28.4s, v9.4s\n"
+    "prfm pldl1keep, [x8, x16]\n"
+    "fmla v23.4s, v28.4s, v12.4s\n"
+    "ldr s22, [x8, x15]\n"
+    "fmla v24.4s, v30.4s, v12.4s\n"
+    "prfm pldl1keep, [%[inptr0], x18]\n"
+    "fmla v17.4s, v30.4s, v11.4s\n"
+    "ldr s29, [%[inptr0], x17]\n"
+    "fmla v23.4s, v25.4s, v9.4s\n"
+    "prfm pldl1keep, [x11, #64]\n"
+    "fmla v20.4s, v25.4s, v12.4s\n"
+    "prfm pldl1keep, [x10, x28]\n"
+    "fmla v17.4s, v25.4s, v6.4s\n"
+    "ldr s25, [x11]\n"
+    "fmla v23.4s, v26.4s, v11.4s\n"
+    "prfm pldl1keep, [x9, x16]\n"
+    "fmla v24.4s, v26.4s, v9.4s\n"
+    "prfm pldl1keep, [x8, x18]\n"
+    "fmla v17.4s, v26.4s, v8.4s\n"
+    "prfm pldl1keep, [%[inptr0], x20]\n"
+    "fmla v16.4s, v26.4s, v12.4s\n"
+    "ldr s28, [x10, %[input_col_stride1]]\n"
+    "fmla v24.4s, v27.4s, v11.4s\n"
+    "prfm pldl1keep, [x12, #64]\n"
+    "fmla v17.4s, v27.4s, v10.4s\n"
+    "prfm pldl1keep, [x11, x28]\n"
+    "fmla v13.4s, v27.4s, v12.4s\n"
+    "ldr s19, [x9, x15]\n"
+    "fmla v23.4s, v15.4s, v6.4s\n"
+    "prfm pldl1keep, [x10, x16]\n"
+    "fmla v20.4s, v15.4s, v9.4s\n"
+    "prfm pldl1keep, [x9, x18]\n"
+    "fmla v0.4s, v15.4s, v12.4s\n"
+    "ldr s21, [x8, x17]\n"
+    "fmla v17.4s, v18.4s, v5.4s\n"
+    "prfm pldl1keep, [x8, x20]\n"
+    "fmla v23.4s, v18.4s, v8.4s\n"
+    "prfm pldl1keep, [%[inptr0], x22]\n"
+    "fmla v24.4s, v18.4s, v6.4s\n"
+    "prfm pldl1keep, [x12, x28]\n"
+    "fmla v20.4s, v18.4s, v11.4s\n"
+    "prfm pldl1keep, [x11, x16]\n"
+    "fmla v16.4s, v18.4s, v9.4s\n"
+    "prfm pldl1keep, [x10, x18]\n"
+    "fmla v1.4s, v18.4s, v12.4s\n"
+    "ldr s27, [%[inptr0], x19]\n"
+    "fmla v17.4s, v22.4s, v7.4s\n"
+    "prfm pldl1keep, [x9, x20]\n"
+    "fmla v23.4s, v22.4s, v10.4s\n"
+    "prfm pldl1keep, [x8, x22]\n"
+    "fmla v24.4s, v22.4s, v8.4s\n"
+    "prfm pldl1keep, [x12, x16]\n"
+    "fmla v16.4s, v22.4s, v11.4s\n"
+    "prfm pldl1keep, [x11, x18]\n"
+    "fmla v13.4s, v22.4s, v9.4s\n"
+    "prfm pldl1keep, [x10, x20]\n"
+    "fmla v2.4s, v22.4s, v12.4s\n"
+    "ldr s18, [x12]\n"
+    "fmla v24.4s, v29.4s, v10.4s\n"
+    "prfm pldl1keep, [x9, x22]\n"
+    "fmla v13.4s, v29.4s, v11.4s\n"
+    "prfm pldl1keep, [x12, x18]\n"
+    "fmla v3.4s, v29.4s, v12.4s\n"
+    "ldr s22, [x11, %[input_col_stride1]]\n"
+    "fmla v20.4s, v25.4s, v6.4s\n"
+    "prfm pldl1keep, [x11, x20]\n"
+    "fmla v0.4s, v25.4s, v9.4s\n"
+    "ldr s25, [x10, x15]\n"
+    "fmla v23.4s, v28.4s, v5.4s\n"
+    "prfm pldl1keep, [x10, x22]\n"
+    "fmla v20.4s, v28.4s, v8.4s\n"
+    "prfm pldl1keep, [x12, x20]\n"
+    "fmla v16.4s, v28.4s, v6.4s\n"
+    "prfm pldl1keep, [x11, x22]\n"
+    "fmla v0.4s, v28.4s, v11.4s\n"
+    "prfm pldl1keep, [x12, x22]\n"
+    "fmla v1.4s, v28.4s, v9.4s\n"
+    "add %[wbptr], %[wbptr], #40\n"
+    "fmla v17.4s, v19.4s, v4.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v23.4s, v19.4s, v7.4s\n"
+    "fmla v24.4s, v19.4s, v5.4s\n"
+    "fmla v20.4s, v19.4s, v10.4s\n"
+    "fmla v16.4s, v19.4s, v8.4s\n"
+    "str s17, [%[outptr0]]\n"
+    "mov v15.16b, v14.16b\n"
+    "fmla v13.4s, v19.4s, v6.4s\n"
+    "fmla v1.4s, v19.4s, v11.4s\n"
+    "fmla v15.4s, v28.4s, v12.4s\n"
+    "ldr s29, [x9, x17]\n"
+    "fmla v2.4s, v19.4s, v9.4s\n"
+    "fmla v24.4s, v21.4s, v7.4s\n"
+    "fmla v16.4s, v21.4s, v10.4s\n"
+    "fmla v13.4s, v21.4s, v8.4s\n"
+    "fmla v3.4s, v21.4s, v9.4s\n"
+    "fmla v0.4s, v18.4s, v6.4s\n"
+    "mov v18.16b, v14.16b\n"
+    "fmla v2.4s, v21.4s, v11.4s\n"
+    "fmla v13.4s, v27.4s, v10.4s\n"
+    "fmla v20.4s, v22.4s, v5.4s\n"
+    "fmla v18.4s, v19.4s, v12.4s\n"
+    "ldr s26, [x8, x19]\n"
+    "fmla v3.4s, v27.4s, v11.4s\n"
+    "ldr s28, [%[inptr0], x21]\n"
+    "fmla v0.4s, v22.4s, v8.4s\n"
+    "add %[inptr0], %[inptr0], #4\n"
+    "fmla v1.4s, v22.4s, v6.4s\n"
+    "fmla v15.4s, v22.4s, v9.4s\n"
+    "mov v17.16b, v14.16b\n"
+    "fmla v23.4s, v25.4s, v4.4s\n"
+    "fmla v20.4s, v25.4s, v7.4s\n"
+    "fmla v16.4s, v25.4s, v5.4s\n"
+    "fmla v17.4s, v21.4s, v12.4s\n"
+    "ldr s30, [x12, %[input_col_stride1]]\n"
+    "str s23, [x23]\n"
+    "mov v19.16b, v14.16b\n"
+    "fmla v0.4s, v25.4s, v10.4s\n"
+    "fmla v1.4s, v25.4s, v8.4s\n"
+    "fmla v2.4s, v25.4s, v6.4s\n"
+    "fmla v15.4s, v25.4s, v11.4s\n"
+    "fmla v18.4s, v25.4s, v9.4s\n"
+    "fmla v19.4s, v25.4s, v12.4s\n"
+    "mov v22.16b, v14.16b\n"
+    "mov v21.16b, v14.16b\n"
+    "fmla v24.4s, v29.4s, v4.4s\n"
+    "fmla v16.4s, v29.4s, v7.4s\n"
+    "fmla v13.4s, v29.4s, v5.4s\n"
+    "fmla v1.4s, v29.4s, v10.4s\n"
+    "fmla v2.4s, v29.4s, v8.4s\n"
+    "fmla v3.4s, v29.4s, v6.4s\n"
+    "str s24, [%[outptr0], %[output_col_stride1]]\n"
+    "fmla v18.4s, v29.4s, v11.4s\n"
+    "fmla v17.4s, v29.4s, v9.4s\n"
+    "ldr s27, [x11, x15]\n"
+    "fmla v22.4s, v29.4s, v12.4s\n"
+    "ldr s23, [x10, x17]\n"
+    "fmla v13.4s, v26.4s, v7.4s\n"
+    "fmla v2.4s, v26.4s, v10.4s\n"
+    "fmla v3.4s, v26.4s, v8.4s\n"
+    "fmla v17.4s, v26.4s, v11.4s\n"
+    "fmla v0.4s, v30.4s, v5.4s\n"
+    "ldr s24, [x9, x19]\n"
+    "fmla v15.4s, v30.4s, v6.4s\n"
+    "ldr s29, [x8, x21]\n"
+    "fmla v3.4s, v28.4s, v10.4s\n"
+    "ldr s14, [x12, x15]\n"
+    "fmla v20.4s, v27.4s, v4.4s\n"
+    "add x8, x8, #4\n"
+    "fmla v0.4s, v27.4s, v7.4s\n"
+    "fmla v1.4s, v27.4s, v5.4s\n"
+    "fmla v15.4s, v27.4s, v8.4s\n"
+    "fmla v18.4s, v27.4s, v6.4s\n"
+    "str s20, [x24]\n"
+    "fmla v19.4s, v27.4s, v9.4s\n"
+    "fmla v16.4s, v23.4s, v4.4s\n"
+    "ldr s25, [x11, x17]\n"
+    "fmla v1.4s, v23.4s, v7.4s\n"
+    "ldr s30, [x10, x19]\n"
+    "fmla v2.4s, v23.4s, v5.4s\n"
+    "fmla v15.4s, v23.4s, v10.4s\n"
+    "str s16, [x23, %[output_col_stride1]]\n"
+    "fmla v18.4s, v23.4s, v8.4s\n"
+    "fmla v17.4s, v23.4s, v6.4s\n"
+    "ldr s26, [x9, x21]\n"
+    "fmla v19.4s, v23.4s, v11.4s\n"
+    "add x9, x9, #4\n"
+    "fmla v22.4s, v23.4s, v9.4s\n"
+    "fmla v21.4s, v23.4s, v12.4s\n"
+    "fmla v13.4s, v24.4s, v4.4s\n"
+    "ldr s27, [x12, x17]\n"
+    "fmla v2.4s, v24.4s, v7.4s\n"
+    "ldr s20, [x11, x19]\n"
+    "fmla v3.4s, v24.4s, v5.4s\n"
+    "fmla v18.4s, v24.4s, v10.4s\n"
+    "str s13, [%[outptr0], x26]\n"
+    "fmla v17.4s, v24.4s, v8.4s\n"
+    "fmla v22.4s, v24.4s, v11.4s\n"
+    "ldr s23, [x10, x21]\n"
+    "fmla v3.4s, v29.4s, v7.4s\n"
+    "ldr s24, [x12, x19]\n"
+    "fmla v17.4s, v29.4s, v10.4s\n"
+    "ldr s16, [x11, x21]\n"
+    "fmla v0.4s, v14.4s, v4.4s\n"
+    "add x10, x10, #4\n"
+    "fmla v15.4s, v14.4s, v5.4s\n"
+    "add x11, x11, #4\n"
+    "fmla v19.4s, v14.4s, v6.4s\n"
+    "ldr s13, [x12, x21]\n"
+    "str s0, [x25]\n"
+    "fmla v1.4s, v25.4s, v4.4s\n"
+    "fmla v15.4s, v25.4s, v7.4s\n"
+    "add x12, x12, #4\n"
+    "fmla v18.4s, v25.4s, v5.4s\n"
+    "fmla v19.4s, v25.4s, v8.4s\n"
+    "str s1, [x24, %[output_col_stride1]]\n"
+    "fmla v22.4s, v25.4s, v6.4s\n"
+    "fmla v21.4s, v25.4s, v9.4s\n"
+    "fmla v2.4s, v30.4s, v4.4s\n"
+    "fmla v18.4s, v30.4s, v7.4s\n"
+    "fmla v17.4s, v30.4s, v5.4s\n"
+    "fmla v19.4s, v30.4s, v10.4s\n"
+    "fmla v22.4s, v30.4s, v8.4s\n"
+    "str s2, [x23, x26]\n"
+    "fmla v21.4s, v30.4s, v11.4s\n"
+    "fmla v3.4s, v26.4s, v4.4s\n"
+    "fmla v17.4s, v26.4s, v7.4s\n"
+    "fmla v22.4s, v26.4s, v10.4s\n"
+    "fmla v15.4s, v27.4s, v4.4s\n"
+    "fmla v19.4s, v27.4s, v5.4s\n"
+    "fmla v21.4s, v27.4s, v6.4s\n"
+    "str s3, [%[outptr0], x27]\n"
+    "fmla v18.4s, v20.4s, v4.4s\n"
+    "str s15, [x25, %[output_col_stride1]]\n"
+    "fmla v22.4s, v20.4s, v5.4s\n"
+    "fmla v19.4s, v20.4s, v7.4s\n"
+    "add %[outptr0], %[outptr0], #4\n"
+    "str s18, [x24, x26]\n"
+    "fmla v21.4s, v20.4s, v8.4s\n"
+    "fmla v17.4s, v23.4s, v4.4s\n"
+    "fmla v22.4s, v23.4s, v7.4s\n"
+    "fmla v19.4s, v24.4s, v4.4s\n"
+    "fmla v21.4s, v23.4s, v10.4s\n"
+    "str s17, [x23, x27]\n"
+    "fmla v22.4s, v16.4s, v4.4s\n"
+    "str s19, [x25, x26]\n"
+    "add x23, x23, #4\n"
+    "fmla v21.4s, v24.4s, v5.4s\n"
+    "str s22, [x24, x27]\n"
+    "add x24, x24, #4\n"
+    "fmla v21.4s, v16.4s, v7.4s\n"
+    "fmla v21.4s, v13.4s, v4.4s\n"
+    "str s21, [x25, x27]\n"
+    "add x25, x25, #4\n"
+    "7:\n"
+    : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input)
+    : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels)
+    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory"
   );
+}
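The tile these execute_tile specializations emit appears to be a 3x3, stride-1 depthwise pass with a per-channel bias: both variants read a 6x6 input patch and write a 4x4 output patch, and each channel group loads one bias value plus nine weights from wbptr. Below is a minimal scalar sketch of that arithmetic in the pointer-table form of the second variant, assuming a per-channel [bias, w00..w22] weight layout (this matches the scalar remainder paths; the vectorised paths interleave four channels per group) and channel-contiguous data at each element pointer. execute_tile_reference is a hypothetical helper for illustration, not part of the patch:

void execute_tile_reference(int n_channels,
                            const float *weight_bias_ptr, // assumed: 10 floats per channel
                            const float *inptrs[6][6],    // 6x6 input patch
                            float *outptrs[4][4])         // 4x4 output patch
{
  for (int c = 0; c < n_channels; c++)
  {
    const float *wb = weight_bias_ptr + 10 * c;
    for (int oi = 0; oi < 4; oi++)
    {
      for (int oj = 0; oj < 4; oj++)
      {
        float acc = wb[0]; // start from the bias
        for (int ki = 0; ki < 3; ki++)
        {
          for (int kj = 0; kj < 3; kj++)
          {
            // 3x3 window anchored at (oi, oj); stride 1 maps 6x6 in to 4x4 out
            acc += inptrs[oi + ki][oj + kj][c] * wb[1 + 3 * ki + kj];
          }
        }
        outptrs[oi][oj][c] = acc; // no activation in the ::None specialization
      }
    }
  }
}

The q-register loops in the generated code are this computation unrolled over all sixteen outputs at once, with each fmla handling four channels per fused multiply-accumulate.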
 
-  int channels_remaining = n_channels;
-  if (channels_remaining >= 4 && same_strides)
-  {
-    int c4_rem = channels_remaining / 4;
-    channels_remaining %= 4;
-    const int prefetch_depth = 8;
+template <>
+template <>
+void Conv::execute_tile<ActivationFunction::None>(
+  int n_channels,
+  const void *weight_bias_ptr,
+  const float *inptrs[6][6],
+  float *outptrs[4][4]
+)
+{
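+  // Same 4x4-output tile as the stride-based variant above, but every input
+  // and output address is fetched from the inptrs[6][6] / outptrs[4][4]
+  // pointer tables (x27/x28 hold the running channel byte offsets).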
+  __asm __volatile(
+    "mov x27, xzr\n"
+    "mov x28, xzr\n"
+    "and x15, %[n_channels], #3\n"
+    "lsr x16, %[n_channels], #2\n"
+    "cbz x16, 4f\n"
+    "1:\n"
+    "ldr q13, [%[wbptr]]\n"
+    "ldr x17, [%[inptrs], 0]\n"
+    "mov v18.16b, v13.16b\n"
+    "ldr q12, [%[wbptr], #16]\n"
+    "mov v22.16b, v13.16b\n"
+    "ldr q11, [%[wbptr], #32]\n"
+    "mov v23.16b, v13.16b\n"
+    "ldr q10, [%[wbptr], #48]\n"
+    "mov v19.16b, v13.16b\n"
+    "ldr q9, [%[wbptr], #64]\n"
+    "mov v17.16b, v13.16b\n"
+    "ldr q8, [%[wbptr], #80]\n"
+    "mov v14.16b, v13.16b\n"
+    "ldr q7, [%[wbptr], #96]\n"
+    "mov v0.16b, v13.16b\n"
+    "ldr q6, [%[wbptr], #112]\n"
+    "mov v1.16b, v13.16b\n"
+    "ldr q5, [%[wbptr], #128]\n"
+    "mov v2.16b, v13.16b\n"
+    "ldr q4, [%[wbptr], #144]\n"
+    "ldr q29, [x17, x27]\n"
+    "ldr x18, [%[inptrs], 48]\n"
+    "fmla v18.4s, v29.4s, v12.4s\n"
+    "ldr x17, [%[inptrs], 8]\n"
+    "ldr q27, [x18, x27]\n"
+    "ldr x19, [%[inptrs], 96]\n"
+    "ldr q28, [x17, x27]\n"
+    "ldr x18, [%[inptrs], 56]\n"
+    "ldr q25, [x19, x27]\n"
+    "ldr x17, [%[inptrs], 16]\n"
+    "ldr q16, [x18, x27]\n"
+    "ldr x20, [%[inptrs], 144]\n"
+    "ldr q15, [x17, x27]\n"
+    "ldr x19, [%[inptrs], 104]\n"
+    "ldr q21, [x20, x27]\n"
+    "subs x16, x16, #1\n"
+    "ldr q29, [x19, x27]\n"
+    "beq 3f\n"
+    "2:\n"
+    "mov v3.16b, v13.16b\n"
+    "ldr x18, [%[inptrs], 64]\n"
+    "fmla v18.4s, v27.4s, v9.4s\n"
+    "ldr x17, [%[inptrs], 24]\n"
+    "fmla v22.4s, v27.4s, v12.4s\n"
+    "ldr q30, [x18, x27]\n"
+    "fmla v23.4s, v28.4s, v12.4s\n"
+    "ldr x21, [%[inptrs], 192]\n"
+    "fmla v19.4s, v25.4s, v12.4s\n"
+    "ldr x20, [%[inptrs], 152]\n"
+    "fmla v18.4s, v28.4s, v11.4s\n"
+    "ldr q24, [x17, x27]\n"
+    "fmla v22.4s, v25.4s, v9.4s\n"
+    "ldr x19, [%[inptrs], 112]\n"
+    "fmla v23.4s, v16.4s, v9.4s\n"
+    "ldr x18, [%[inptrs], 72]\n"
+    "fmla v17.4s, v16.4s, v12.4s\n"
+    "ldr x17, [%[inptrs], 32]\n"
+    "fmla v18.4s, v25.4s, v6.4s\n"
+    "ldr q31, [x21, x27]\n"
+    "fmla v22.4s, v16.4s, v11.4s\n"
+    "ldr x22, [%[inptrs], 240]\n"
+    "fmla v23.4s, v15.4s, v11.4s\n"
+    "ldr x21, [%[inptrs], 200]\n"
+    "fmla v14.4s, v15.4s, v12.4s\n"
+    "ldr x23, [%[outptrs], 0]\n"
+    "fmla v18.4s, v16.4s, v8.4s\n"
+    "ldr q25, [x20, x27]\n"
+    "fmla v22.4s, v21.4s, v6.4s\n"
+    "ldr x20, [%[inptrs], 160]\n"
+    "fmla v19.4s, v21.4s, v9.4s\n"
+    "ldr x24, [%[outptrs], 32]\n"
+    "fmla v0.4s, v21.4s, v12.4s\n"
+    "ldr q21, [x19, x27]\n"
+    "fmla v18.4s, v15.4s, v10.4s\n"
+    "ldr q20, [x18, x27]\n"
+    "fmla v22.4s, v29.4s, v8.4s\n"
+    "ldr x19, [%[inptrs], 120]\n"
+    "fmla v23.4s, v29.4s, v6.4s\n"
+    "ldr x18, [%[inptrs], 80]\n"
+    "fmla v19.4s, v29.4s, v11.4s\n"
+    "ldr x25, [%[outptrs], 64]\n"
+    "fmla v18.4s, v29.4s, v5.4s\n"
+    "ldr x26, [%[outptrs], 96]\n"
+    "fmla v17.4s, v29.4s, v9.4s\n"
+    "add %[wbptr], %[wbptr], #160\n"
+    "fmla v1.4s, v29.4s, v12.4s\n"
+    "ldr q26, [x17, x27]\n"
+    "fmla v22.4s, v30.4s, v10.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v18.4s, v30.4s, v7.4s\n"
+    "ldr x17, [%[inptrs], 40]\n"
+    "fmla v23.4s, v30.4s, v8.4s\n"
+    "subs x16, x16, #1\n"
+    "fmla v17.4s, v30.4s, v11.4s\n"
+    "fmla v14.4s, v30.4s, v9.4s\n"
+    "fmla v2.4s, v30.4s, v12.4s\n"
+    "ldr q27, [x22, x27]\n"
+    "fmla v3.4s, v24.4s, v12.4s\n"
+    "ldr x22, [%[inptrs], 248]\n"
+    "fmla v23.4s, v24.4s, v10.4s\n"
+    "fmla v19.4s, v31.4s, v6.4s\n"
+    "fmla v14.4s, v24.4s, v11.4s\n"
+    "ldr q30, [x21, x27]\n"
+    "fmla v0.4s, v31.4s, v9.4s\n"
+    "ldr q24, [x20, x27]\n"
+    "fmla v22.4s, v25.4s, v5.4s\n"
+    "ldr x21, [%[inptrs], 208]\n"
+    "fmla v19.4s, v25.4s, v8.4s\n"
+    "ldr x20, [%[inptrs], 168]\n"
+    "fmla v17.4s, v25.4s, v6.4s\n"
+    "fmla v1.4s, v25.4s, v9.4s\n"
+    "fmla v0.4s, v25.4s, v11.4s\n"
+    "fmla v18.4s, v21.4s, v4.4s\n"
+    "fmla v22.4s, v21.4s, v7.4s\n"
+    "fmla v23.4s, v21.4s, v5.4s\n"
+    "fmla v19.4s, v21.4s, v10.4s\n"
+    "fmla v14.4s, v21.4s, v6.4s\n"
+    "fmla v17.4s, v21.4s, v8.4s\n"
+    "fmla v1.4s, v21.4s, v11.4s\n"
+    "str q18, [x23, x28]\n"
+    "mov v16.16b, v13.16b\n"
+    "fmla v2.4s, v21.4s, v9.4s\n"
+    "ldr x23, [%[outptrs], 8]\n"
+    "fmla v23.4s, v20.4s, v7.4s\n"
+    "fmla v14.4s, v20.4s, v8.4s\n"
+    "fmla v16.4s, v25.4s, v12.4s\n"
+    "ldr q25, [x19, x27]\n"
+    "fmla v17.4s, v20.4s, v10.4s\n"
+    "ldr x19, [%[inptrs], 128]\n"
+    "fmla v2.4s, v20.4s, v11.4s\n"
+    "fmla v3.4s, v20.4s, v9.4s\n"
+    "fmla v14.4s, v26.4s, v10.4s\n"
+    "fmla v0.4s, v27.4s, v6.4s\n"
+    "mov v15.16b, v13.16b\n"
+    "fmla v19.4s, v30.4s, v5.4s\n"
+    "fmla v1.4s, v30.4s, v6.4s\n"
+    "fmla v16.4s, v30.4s, v9.4s\n"
+    "fmla v3.4s, v26.4s, v11.4s\n"
+    "ldr q29, [x18, x27]\n"
+    "fmla v15.4s, v21.4s, v12.4s\n"
+    "ldr q27, [x17, x27]\n"
+    "fmla v0.4s, v30.4s, v8.4s\n"
+    "ldr q28, [x22, x27]\n"
+    "fmla v22.4s, v24.4s, v4.4s\n"
+    "ldr x18, [%[inptrs], 88]\n"
+    "fmla v19.4s, v24.4s, v7.4s\n"
+    "ldr x22, [%[inptrs], 256]\n"
+    "fmla v17.4s, v24.4s, v5.4s\n"
+    "ldr x17, [%[inptrs], 0]\n"
+    "fmla v0.4s, v24.4s, v10.4s\n"
+    "fmla v1.4s, v24.4s, v8.4s\n"
+    "str q22, [x24, x28]\n"
+    "mov v18.16b, v13.16b\n"
+    "fmla v2.4s, v24.4s, v6.4s\n"
+    "ldr x24, [%[outptrs], 40]\n"
+    "fmla v16.4s, v24.4s, v11.4s\n"
+    "fmla v15.4s, v24.4s, v9.4s\n"
+    "fmla v18.4s, v20.4s, v12.4s\n"
+    "ldr q22, [x21, x27]\n"
+    "fmla v23.4s, v25.4s, v4.4s\n"
+    "ldr x21, [%[inptrs], 216]\n"
+    "fmla v17.4s, v25.4s, v7.4s\n"
+    "fmla v14.4s, v25.4s, v5.4s\n"
+    "fmla v1.4s, v25.4s, v10.4s\n"
+    "fmla v2.4s, v25.4s, v8.4s\n"
+    "fmla v3.4s, v25.4s, v6.4s\n"
+    "fmla v15.4s, v25.4s, v11.4s\n"
+    "str q23, [x23, x28]\n"
+    "mov v21.16b, v13.16b\n"
+    "fmla v18.4s, v25.4s, v9.4s\n"
+    "ldr x23, [%[outptrs], 16]\n"
+    "fmla v14.4s, v29.4s, v7.4s\n"
+    "fmla v2.4s, v29.4s, v10.4s\n"
+    "fmla v21.4s, v24.4s, v12.4s\n"
+    "ldr q30, [x20, x27]\n"
+    "fmla v3.4s, v29.4s, v8.4s\n"
+    "ldr x20, [%[inptrs], 176]\n"
+    "fmla v18.4s, v29.4s, v11.4s\n"
+    "ldr q31, [x19, x27]\n"
+    "fmla v0.4s, v28.4s, v5.4s\n"
+    "ldr x19, [%[inptrs], 136]\n"
+    "fmla v16.4s, v28.4s, v6.4s\n"
+    "ldr q26, [x18, x27]\n"
+    "fmla v3.4s, v27.4s, v10.4s\n"
+    "ldr q23, [x22, x27]\n"
+    "fmla v19.4s, v22.4s, v4.4s\n"
+    "ldr x22, [%[inptrs], 264]\n"
+    "fmla v0.4s, v22.4s, v7.4s\n"
+    "ldr x18, [%[inptrs], 48]\n"
+    "fmla v1.4s, v22.4s, v5.4s\n"
+    "fmla v16.4s, v22.4s, v8.4s\n"
+    "fmla v15.4s, v22.4s, v6.4s\n"
+    "fmla v21.4s, v22.4s, v9.4s\n"
+    "str q19, [x25, x28]\n"
+    "mov v24.16b, v13.16b\n"
+    "mov v20.16b, v13.16b\n"
+    "ldr q27, [x21, x27]\n"
+    "fmla v17.4s, v30.4s, v4.4s\n"
+    "ldr x21, [%[inptrs], 224]\n"
+    "fmla v24.4s, v25.4s, v12.4s\n"
+    "ldr q28, [x20, x27]\n"
+    "fmla v1.4s, v30.4s, v7.4s\n"
+    "ldr x20, [%[inptrs], 184]\n"
+    "fmla v2.4s, v30.4s, v5.4s\n"
+    "ldr x25, [%[outptrs], 72]\n"
+    "str q17, [x24, x28]\n"
+    "fmla v16.4s, v30.4s, v10.4s\n"
+    "fmla v15.4s, v30.4s, v8.4s\n"
+    "ldr q22, [x19, x27]\n"
+    "fmla v18.4s, v30.4s, v6.4s\n"
+    "ldr x24, [%[outptrs], 48]\n"
+    "fmla v21.4s, v30.4s, v11.4s\n"
+    "ldr x19, [%[inptrs], 96]\n"
+    "fmla v24.4s, v30.4s, v9.4s\n"
+    "fmla v20.4s, v30.4s, v12.4s\n"
+    "fmla v14.4s, v31.4s, v4.4s\n"
+    "ldr q30, [x22, x27]\n"
+    "fmla v2.4s, v31.4s, v7.4s\n"
+    "ldr q19, [x21, x27]\n"
+    "fmla v3.4s, v31.4s, v5.4s\n"
+    "ldr x22, [%[inptrs], 272]\n"
+    "fmla v15.4s, v31.4s, v10.4s\n"
+    "ldr x21, [%[inptrs], 232]\n"
+    "str q14, [x23, x28]\n"
+    "fmla v18.4s, v31.4s, v8.4s\n"
+    "fmla v24.4s, v31.4s, v11.4s\n"
+    "ldr q31, [x20, x27]\n"
+    "fmla v3.4s, v26.4s, v7.4s\n"
+    "ldr q17, [x22, x27]\n"
+    "fmla v0.4s, v23.4s, v4.4s\n"
+    "ldr x22, [%[inptrs], 280]\n"
+    "fmla v18.4s, v26.4s, v10.4s\n"
+    "ldr q14, [x21, x27]\n"
+    "fmla v16.4s, v23.4s, v5.4s\n"
+    "ldr x23, [%[outptrs], 24]\n"
+    "fmla v21.4s, v23.4s, v6.4s\n"
+    "ldr q26, [x22, x27]\n"
+    "str q0, [x26, x28]\n"
+    "fmla v1.4s, v27.4s, v4.4s\n"
+    "fmla v15.4s, v27.4s, v5.4s\n"
+    "ldr q13, [%[wbptr]]\n"
+    "fmla v16.4s, v27.4s, v7.4s\n"
+    "ldr x26, [%[outptrs], 104]\n"
+    "fmla v21.4s, v27.4s, v8.4s\n"
+    "add x27, x27, #16\n"
+    "str q1, [x25, x28]\n"
+    "fmla v24.4s, v27.4s, v6.4s\n"
+    "fmla v20.4s, v27.4s, v9.4s\n"
+    "ldr q12, [%[wbptr], #16]\n"
+    "fmla v2.4s, v28.4s, v4.4s\n"
+    "ldr q29, [x17, x27]\n"
+    "fmla v15.4s, v28.4s, v7.4s\n"
+    "ldr q27, [x18, x27]\n"
+    "fmla v18.4s, v28.4s, v5.4s\n"
+    "ldr x25, [%[outptrs], 80]\n"
+    "fmla v21.4s, v28.4s, v10.4s\n"
+    "ldr x17, [%[inptrs], 8]\n"
+    "str q2, [x24, x28]\n"
+    "fmla v24.4s, v28.4s, v8.4s\n"
+    "fmla v20.4s, v28.4s, v11.4s\n"
+    "ldr q9, [%[wbptr], #64]\n"
+    "fmla v3.4s, v22.4s, v4.4s\n"
+    "ldr q28, [x17, x27]\n"
+    "fmla v18.4s, v22.4s, v7.4s\n"
+    "ldr q25, [x19, x27]\n"
+    "fmla v24.4s, v22.4s, v10.4s\n"
+    "ldr x24, [%[outptrs], 56]\n"
+    "fmla v16.4s, v30.4s, v4.4s\n"
+    "ldr q11, [%[wbptr], #32]\n"
+    "str q3, [x23, x28]\n"
+    "fmla v21.4s, v30.4s, v5.4s\n"
+    "fmla v20.4s, v30.4s, v6.4s\n"
+    "ldr x18, [%[inptrs], 56]\n"
+    "fmla v15.4s, v19.4s, v4.4s\n"
+    "ldr x17, [%[inptrs], 16]\n"
+    "str q16, [x26, x28]\n"
+    "fmla v24.4s, v19.4s, v5.4s\n"
+    "fmla v21.4s, v19.4s, v7.4s\n"
+    "ldr q16, [x18, x27]\n"
+    "fmla v20.4s, v19.4s, v8.4s\n"
+    "ldr q6, [%[wbptr], #112]\n"
+    "str q15, [x25, x28]\n"
+    "fmla v18.4s, v31.4s, v4.4s\n"
+    "fmla v24.4s, v31.4s, v7.4s\n"
+    "ldr q15, [x17, x27]\n"
+    "fmla v21.4s, v17.4s, v4.4s\n"
+    "ldr x25, [%[outptrs], 88]\n"
+    "fmla v20.4s, v31.4s, v10.4s\n"
+    "ldr q8, [%[wbptr], #80]\n"
+    "str q18, [x24, x28]\n"
+    "mov v18.16b, v13.16b\n"
+    "fmla v24.4s, v14.4s, v4.4s\n"
+    "ldr x26, [%[outptrs], 112]\n"
+    "mov v22.16b, v13.16b\n"
+    "ldr x20, [%[inptrs], 144]\n"
+    "str q21, [x26, x28]\n"
+    "fmla v20.4s, v17.4s, v5.4s\n"
+    "mov v23.16b, v13.16b\n"
+    "ldr q10, [%[wbptr], #48]\n"
+    "str q24, [x25, x28]\n"
+    "mov v19.16b, v13.16b\n"
+    "mov v17.16b, v13.16b\n"
+    "ldr q21, [x20, x27]\n"
+    "fmla v20.4s, v14.4s, v7.4s\n"
+    "ldr q5, [%[wbptr], #128]\n"
+    "mov v14.16b, v13.16b\n"
+    "ldr x26, [%[outptrs], 120]\n"
+    "mov v0.16b, v13.16b\n"
+    "ldr x19, [%[inptrs], 104]\n"
+    "mov v1.16b, v13.16b\n"
+    "mov v2.16b, v13.16b\n"
+    "fmla v20.4s, v26.4s, v4.4s\n"
+    "ldr q7, [%[wbptr], #96]\n"
+    "fmla v18.4s, v29.4s, v12.4s\n"
+    "ldr q29, [x19, x27]\n"
+    "str q20, [x26, x28]\n"
+    "ldr q4, [%[wbptr], #144]\n"
+    "add x28, x28, #16\n"
+    "bne 2b\n"
+    "3:\n"
+    "mov v3.16b, v13.16b\n"
+    "ldr x18, [%[inptrs], 64]\n"
+    "fmla v18.4s, v27.4s, v9.4s\n"
+    "ldr x17, [%[inptrs], 24]\n"
+    "fmla v22.4s, v27.4s, v12.4s\n"
+    "ldr q30, [x18, x27]\n"
+    "fmla v23.4s, v28.4s, v12.4s\n"
+    "ldr x21, [%[inptrs], 192]\n"
+    "fmla v19.4s, v25.4s, v12.4s\n"
+    "ldr x20, [%[inptrs], 152]\n"
+    "fmla v18.4s, v28.4s, v11.4s\n"
+    "ldr q24, [x17, x27]\n"
+    "fmla v22.4s, v25.4s, v9.4s\n"
+    "ldr x19, [%[inptrs], 112]\n"
+    "fmla v23.4s, v16.4s, v9.4s\n"
+    "ldr x18, [%[inptrs], 72]\n"
+    "fmla v17.4s, v16.4s, v12.4s\n"
+    "ldr x17, [%[inptrs], 32]\n"
+    "fmla v18.4s, v25.4s, v6.4s\n"
+    "ldr q31, [x21, x27]\n"
+    "fmla v22.4s, v16.4s, v11.4s\n"
+    "ldr x22, [%[inptrs], 240]\n"
+    "fmla v23.4s, v15.4s, v11.4s\n"
+    "ldr x21, [%[inptrs], 200]\n"
+    "fmla v14.4s, v15.4s, v12.4s\n"
+    "ldr x23, [%[outptrs], 0]\n"
+    "fmla v18.4s, v16.4s, v8.4s\n"
+    "ldr q25, [x20, x27]\n"
+    "fmla v22.4s, v21.4s, v6.4s\n"
+    "ldr x20, [%[inptrs], 160]\n"
+    "fmla v19.4s, v21.4s, v9.4s\n"
+    "ldr x24, [%[outptrs], 32]\n"
+    "fmla v0.4s, v21.4s, v12.4s\n"
+    "ldr q21, [x19, x27]\n"
+    "fmla v18.4s, v15.4s, v10.4s\n"
+    "ldr q20, [x18, x27]\n"
+    "fmla v22.4s, v29.4s, v8.4s\n"
+    "ldr x19, [%[inptrs], 120]\n"
+    "fmla v23.4s, v29.4s, v6.4s\n"
+    "ldr x18, [%[inptrs], 80]\n"
+    "fmla v19.4s, v29.4s, v11.4s\n"
+    "ldr x25, [%[outptrs], 64]\n"
+    "fmla v18.4s, v29.4s, v5.4s\n"
+    "ldr x26, [%[outptrs], 96]\n"
+    "fmla v17.4s, v29.4s, v9.4s\n"
+    "add %[wbptr], %[wbptr], #160\n"
+    "fmla v1.4s, v29.4s, v12.4s\n"
+    "ldr q26, [x17, x27]\n"
+    "fmla v22.4s, v30.4s, v10.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v18.4s, v30.4s, v7.4s\n"
+    "ldr x17, [%[inptrs], 40]\n"
+    "fmla v23.4s, v30.4s, v8.4s\n"
+    "fmla v17.4s, v30.4s, v11.4s\n"
+    "fmla v14.4s, v30.4s, v9.4s\n"
+    "fmla v2.4s, v30.4s, v12.4s\n"
+    "mov v16.16b, v13.16b\n"
+    "fmla v3.4s, v24.4s, v12.4s\n"
+    "fmla v19.4s, v31.4s, v6.4s\n"
+    "fmla v0.4s, v31.4s, v9.4s\n"
+    "mov v15.16b, v13.16b\n"
+    "fmla v23.4s, v24.4s, v10.4s\n"
+    "fmla v14.4s, v24.4s, v11.4s\n"
+    "ldr q27, [x22, x27]\n"
+    "fmla v22.4s, v25.4s, v5.4s\n"
+    "ldr x22, [%[inptrs], 248]\n"
+    "fmla v19.4s, v25.4s, v8.4s\n"
+    "fmla v17.4s, v25.4s, v6.4s\n"
+    "fmla v0.4s, v25.4s, v11.4s\n"
+    "fmla v1.4s, v25.4s, v9.4s\n"
+    "fmla v16.4s, v25.4s, v12.4s\n"
+    "ldr q30, [x21, x27]\n"
+    "fmla v18.4s, v21.4s, v4.4s\n"
+    "ldr x21, [%[inptrs], 208]\n"
+    "fmla v22.4s, v21.4s, v7.4s\n"
+    "fmla v23.4s, v21.4s, v5.4s\n"
+    "fmla v19.4s, v21.4s, v10.4s\n"
+    "fmla v17.4s, v21.4s, v8.4s\n"
+    "fmla v14.4s, v21.4s, v6.4s\n"
+    "fmla v1.4s, v21.4s, v11.4s\n"
+    "str q18, [x23, x28]\n"
+    "mov v18.16b, v13.16b\n"
+    "fmla v2.4s, v21.4s, v9.4s\n"
+    "ldr x23, [%[outptrs], 8]\n"
+    "fmla v15.4s, v21.4s, v12.4s\n"
+    "ldr q24, [x20, x27]\n"
+    "fmla v23.4s, v20.4s, v7.4s\n"
+    "ldr x20, [%[inptrs], 168]\n"
+    "fmla v17.4s, v20.4s, v10.4s\n"
+    "fmla v14.4s, v20.4s, v8.4s\n"
+    "fmla v2.4s, v20.4s, v11.4s\n"
+    "fmla v3.4s, v20.4s, v9.4s\n"
+    "fmla v18.4s, v20.4s, v12.4s\n"
+    "ldr q25, [x19, x27]\n"
+    "fmla v0.4s, v27.4s, v6.4s\n"
+    "ldr q29, [x18, x27]\n"
+    "fmla v14.4s, v26.4s, v10.4s\n"
+    "ldr x19, [%[inptrs], 128]\n"
+    "fmla v3.4s, v26.4s, v11.4s\n"
+    "ldr q27, [x17, x27]\n"
+    "fmla v19.4s, v30.4s, v5.4s\n"
+    "ldr x18, [%[inptrs], 88]\n"
+    "fmla v0.4s, v30.4s, v8.4s\n"
+    "fmla v1.4s, v30.4s, v6.4s\n"
+    "fmla v16.4s, v30.4s, v9.4s\n"
+    "ldr q28, [x22, x27]\n"
+    "fmla v22.4s, v24.4s, v4.4s\n"
+    "ldr x22, [%[inptrs], 256]\n"
+    "fmla v19.4s, v24.4s, v7.4s\n"
+    "fmla v17.4s, v24.4s, v5.4s\n"
+    "fmla v0.4s, v24.4s, v10.4s\n"
+    "fmla v1.4s, v24.4s, v8.4s\n"
+    "fmla v2.4s, v24.4s, v6.4s\n"
+    "fmla v16.4s, v24.4s, v11.4s\n"
+    "str q22, [x24, x28]\n"
+    "mov v21.16b, v13.16b\n"
+    "fmla v15.4s, v24.4s, v9.4s\n"
+    "ldr x24, [%[outptrs], 40]\n"
+    "fmla v23.4s, v25.4s, v4.4s\n"
+    "fmla v17.4s, v25.4s, v7.4s\n"
+    "fmla v21.4s, v24.4s, v12.4s\n"
+    "ldr q22, [x21, x27]\n"
+    "fmla v14.4s, v25.4s, v5.4s\n"
+    "ldr x21, [%[inptrs], 216]\n"
+    "fmla v1.4s, v25.4s, v10.4s\n"
+    "fmla v2.4s, v25.4s, v8.4s\n"
+    "str q23, [x23, x28]\n"
+    "mov v24.16b, v13.16b\n"
+    "mov v20.16b, v13.16b\n"
+    "ldr x23, [%[outptrs], 16]\n"
+    "fmla v3.4s, v25.4s, v6.4s\n"
+    "fmla v15.4s, v25.4s, v11.4s\n"
+    "fmla v18.4s, v25.4s, v9.4s\n"
+    "fmla v24.4s, v25.4s, v12.4s\n"
+    "fmla v14.4s, v29.4s, v7.4s\n"
+    "ldr q30, [x20, x27]\n"
+    "fmla v2.4s, v29.4s, v10.4s\n"
+    "ldr x20, [%[inptrs], 176]\n"
+    "fmla v3.4s, v29.4s, v8.4s\n"
+    "fmla v0.4s, v28.4s, v5.4s\n"
+    "fmla v18.4s, v29.4s, v11.4s\n"
+    "ldr q31, [x19, x27]\n"
+    "fmla v16.4s, v28.4s, v6.4s\n"
+    "ldr q26, [x18, x27]\n"
+    "fmla v19.4s, v22.4s, v4.4s\n"
+    "ldr x19, [%[inptrs], 136]\n"
+    "fmla v3.4s, v27.4s, v10.4s\n"
+    "ldr q23, [x22, x27]\n"
+    "fmla v0.4s, v22.4s, v7.4s\n"
+    "ldr x22, [%[inptrs], 264]\n"
+    "fmla v1.4s, v22.4s, v5.4s\n"
+    "fmla v16.4s, v22.4s, v8.4s\n"
+    "str q19, [x25, x28]\n"
+    "fmla v15.4s, v22.4s, v6.4s\n"
+    "fmla v21.4s, v22.4s, v9.4s\n"
+    "ldr q27, [x21, x27]\n"
+    "fmla v17.4s, v30.4s, v4.4s\n"
+    "ldr q28, [x20, x27]\n"
+    "fmla v1.4s, v30.4s, v7.4s\n"
+    "ldr x21, [%[inptrs], 224]\n"
+    "fmla v2.4s, v30.4s, v5.4s\n"
+    "ldr x20, [%[inptrs], 184]\n"
+    "fmla v16.4s, v30.4s, v10.4s\n"
+    "ldr x25, [%[outptrs], 72]\n"
+    "str q17, [x24, x28]\n"
+    "fmla v15.4s, v30.4s, v8.4s\n"
+    "fmla v18.4s, v30.4s, v6.4s\n"
+    "ldr q22, [x19, x27]\n"
+    "fmla v21.4s, v30.4s, v11.4s\n"
+    "ldr x24, [%[outptrs], 48]\n"
+    "fmla v24.4s, v30.4s, v9.4s\n"
+    "fmla v20.4s, v30.4s, v12.4s\n"
+    "fmla v14.4s, v31.4s, v4.4s\n"
+    "ldr q30, [x22, x27]\n"
+    "fmla v2.4s, v31.4s, v7.4s\n"
+    "ldr q19, [x21, x27]\n"
+    "fmla v3.4s, v31.4s, v5.4s\n"
+    "ldr x22, [%[inptrs], 272]\n"
+    "fmla v15.4s, v31.4s, v10.4s\n"
+    "ldr x21, [%[inptrs], 232]\n"
+    "str q14, [x23, x28]\n"
+    "fmla v18.4s, v31.4s, v8.4s\n"
+    "fmla v24.4s, v31.4s, v11.4s\n"
+    "ldr q31, [x20, x27]\n"
+    "fmla v3.4s, v26.4s, v7.4s\n"
+    "ldr q17, [x22, x27]\n"
+    "fmla v0.4s, v23.4s, v4.4s\n"
+    "ldr x22, [%[inptrs], 280]\n"
+    "fmla v18.4s, v26.4s, v10.4s\n"
+    "ldr q14, [x21, x27]\n"
+    "fmla v16.4s, v23.4s, v5.4s\n"
+    "ldr x23, [%[outptrs], 24]\n"
+    "fmla v21.4s, v23.4s, v6.4s\n"
+    "ldr q26, [x22, x27]\n"
+    "str q0, [x26, x28]\n"
+    "fmla v1.4s, v27.4s, v4.4s\n"
+    "fmla v15.4s, v27.4s, v5.4s\n"
+    "ldr x26, [%[outptrs], 104]\n"
+    "fmla v16.4s, v27.4s, v7.4s\n"
+    "add x27, x27, #16\n"
+    "fmla v21.4s, v27.4s, v8.4s\n"
+    "fmla v24.4s, v27.4s, v6.4s\n"
+    "str q1, [x25, x28]\n"
+    "fmla v20.4s, v27.4s, v9.4s\n"
+    "fmla v2.4s, v28.4s, v4.4s\n"
+    "ldr x25, [%[outptrs], 80]\n"
+    "fmla v15.4s, v28.4s, v7.4s\n"
+    "fmla v18.4s, v28.4s, v5.4s\n"
+    "fmla v21.4s, v28.4s, v10.4s\n"
+    "fmla v24.4s, v28.4s, v8.4s\n"
+    "fmla v20.4s, v28.4s, v11.4s\n"
+    "fmla v3.4s, v22.4s, v4.4s\n"
+    "str q2, [x24, x28]\n"
+    "fmla v16.4s, v30.4s, v4.4s\n"
+    "fmla v18.4s, v22.4s, v7.4s\n"
+    "ldr x24, [%[outptrs], 56]\n"
+    "fmla v24.4s, v22.4s, v10.4s\n"
+    "fmla v21.4s, v30.4s, v5.4s\n"
+    "str q3, [x23, x28]\n"
+    "fmla v20.4s, v30.4s, v6.4s\n"
+    "str q16, [x26, x28]\n"
+    "fmla v15.4s, v19.4s, v4.4s\n"
+    "fmla v18.4s, v31.4s, v4.4s\n"
+    "ldr x26, [%[outptrs], 112]\n"
+    "fmla v21.4s, v19.4s, v7.4s\n"
+    "fmla v24.4s, v19.4s, v5.4s\n"
+    "fmla v20.4s, v19.4s, v8.4s\n"
+    "str q15, [x25, x28]\n"
+    "str q18, [x24, x28]\n"
+    "ldr x25, [%[outptrs], 88]\n"
+    "fmla v24.4s, v31.4s, v7.4s\n"
+    "fmla v21.4s, v17.4s, v4.4s\n"
+    "fmla v20.4s, v31.4s, v10.4s\n"
+    "str q21, [x26, x28]\n"
+    "fmla v20.4s, v17.4s, v5.4s\n"
+    "ldr x26, [%[outptrs], 120]\n"
+    "fmla v24.4s, v14.4s, v4.4s\n"
+    "fmla v20.4s, v14.4s, v7.4s\n"
+    "str q24, [x25, x28]\n"
+    "fmla v20.4s, v26.4s, v4.4s\n"
+    "str q20, [x26, x28]\n"
+    "add x28, x28, #16\n"
+    "4:\n"
+    "cbz x15, 7f\n"
+    "ldr s13, [%[wbptr]]\n"
+    "mov v18.16b, v13.16b\n"
+    "ldr s12, [%[wbptr], #4]\n"
+    "mov v22.16b, v13.16b\n"
+    "ldr s11, [%[wbptr], #8]\n"
+    "mov v23.16b, v13.16b\n"
+    "ldr s10, [%[wbptr], #12]\n"
+    "mov v19.16b, v13.16b\n"
+    "ldr s9, [%[wbptr], #16]\n"
+    "mov v17.16b, v13.16b\n"
+    "ldr s8, [%[wbptr], #20]\n"
+    "mov v14.16b, v13.16b\n"
+    "ldr s7, [%[wbptr], #24]\n"
+    "mov v0.16b, v13.16b\n"
+    "ldr s6, [%[wbptr], #28]\n"
+    "mov v1.16b, v13.16b\n"
+    "ldr s5, [%[wbptr], #32]\n"
+    "mov v2.16b, v13.16b\n"
+    "ldr s4, [%[wbptr], #36]\n"
+    "ldr x17, [%[inptrs], 0]\n"
+    "ldr x18, [%[inptrs], 48]\n"
+    "ldr x19, [%[inptrs], 96]\n"
+    "ldr x20, [%[inptrs], 144]\n"
+    "subs x15, x15, #1\n"
+    "ldr s29, [x17, x27]\n"
+    "fmla v18.4s, v29.4s, v12.4s\n"
+    "ldr s27, [x18, x27]\n"
+    "ldr s25, [x19, x27]\n"
+    "ldr x17, [%[inptrs], 8]\n"
+    "ldr s21, [x20, x27]\n"
+    "ldr x18, [%[inptrs], 56]\n"
+    "ldr s28, [x17, x27]\n"
+    "ldr x19, [%[inptrs], 104]\n"
+    "ldr s16, [x18, x27]\n"
+    "ldr x17, [%[inptrs], 16]\n"
+    "ldr s29, [x19, x27]\n"
+    "ldr s15, [x17, x27]\n"
+    "beq 6f\n"
+    "5:\n"
+    "mov v3.16b, v13.16b\n"
+    "ldr x18, [%[inptrs], 64]\n"
+    "fmla v18.4s, v27.4s, v9.4s\n"
+    "ldr x17, [%[inptrs], 24]\n"
+    "fmla v22.4s, v27.4s, v12.4s\n"
+    "ldr s30, [x18, x27]\n"
+    "fmla v23.4s, v28.4s, v12.4s\n"
+    "ldr x21, [%[inptrs], 192]\n"
+    "fmla v19.4s, v25.4s, v12.4s\n"
+    "ldr x20, [%[inptrs], 152]\n"
+    "fmla v18.4s, v28.4s, v11.4s\n"
+    "ldr s24, [x17, x27]\n"
+    "fmla v22.4s, v25.4s, v9.4s\n"
+    "ldr x19, [%[inptrs], 112]\n"
+    "fmla v23.4s, v16.4s, v9.4s\n"
+    "ldr x18, [%[inptrs], 72]\n"
+    "fmla v17.4s, v16.4s, v12.4s\n"
+    "ldr x17, [%[inptrs], 32]\n"
+    "fmla v18.4s, v25.4s, v6.4s\n"
+    "ldr s31, [x21, x27]\n"
+    "fmla v22.4s, v16.4s, v11.4s\n"
+    "ldr x22, [%[inptrs], 240]\n"
+    "fmla v23.4s, v15.4s, v11.4s\n"
+    "ldr x21, [%[inptrs], 200]\n"
+    "fmla v14.4s, v15.4s, v12.4s\n"
+    "ldr x23, [%[outptrs], 0]\n"
+    "fmla v18.4s, v16.4s, v8.4s\n"
+    "ldr s25, [x20, x27]\n"
+    "fmla v22.4s, v21.4s, v6.4s\n"
+    "ldr x20, [%[inptrs], 160]\n"
+    "fmla v19.4s, v21.4s, v9.4s\n"
+    "ldr x24, [%[outptrs], 32]\n"
+    "fmla v0.4s, v21.4s, v12.4s\n"
+    "ldr s21, [x19, x27]\n"
+    "fmla v18.4s, v15.4s, v10.4s\n"
+    "ldr s20, [x18, x27]\n"
+    "fmla v22.4s, v29.4s, v8.4s\n"
+    "ldr x19, [%[inptrs], 120]\n"
+    "fmla v23.4s, v29.4s, v6.4s\n"
+    "ldr x18, [%[inptrs], 80]\n"
+    "fmla v19.4s, v29.4s, v11.4s\n"
+    "ldr x25, [%[outptrs], 64]\n"
+    "fmla v18.4s, v29.4s, v5.4s\n"
+    "ldr x26, [%[outptrs], 96]\n"
+    "fmla v17.4s, v29.4s, v9.4s\n"
+    "add %[wbptr], %[wbptr], #40\n"
+    "fmla v1.4s, v29.4s, v12.4s\n"
+    "ldr s26, [x17, x27]\n"
+    "fmla v22.4s, v30.4s, v10.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v18.4s, v30.4s, v7.4s\n"
+    "ldr x17, [%[inptrs], 40]\n"
+    "fmla v23.4s, v30.4s, v8.4s\n"
+    "subs x15, x15, #1\n"
+    "fmla v17.4s, v30.4s, v11.4s\n"
+    "fmla v14.4s, v30.4s, v9.4s\n"
+    "fmla v2.4s, v30.4s, v12.4s\n"
+    "ldr s27, [x22, x27]\n"
+    "fmla v3.4s, v24.4s, v12.4s\n"
+    "ldr x22, [%[inptrs], 248]\n"
+    "fmla v23.4s, v24.4s, v10.4s\n"
+    "fmla v19.4s, v31.4s, v6.4s\n"
+    "fmla v14.4s, v24.4s, v11.4s\n"
+    "ldr s30, [x21, x27]\n"
+    "fmla v0.4s, v31.4s, v9.4s\n"
+    "ldr s24, [x20, x27]\n"
+    "fmla v22.4s, v25.4s, v5.4s\n"
+    "ldr x21, [%[inptrs], 208]\n"
+    "fmla v19.4s, v25.4s, v8.4s\n"
+    "ldr x20, [%[inptrs], 168]\n"
+    "fmla v17.4s, v25.4s, v6.4s\n"
+    "fmla v1.4s, v25.4s, v9.4s\n"
+    "fmla v0.4s, v25.4s, v11.4s\n"
+    "fmla v18.4s, v21.4s, v4.4s\n"
+    "fmla v22.4s, v21.4s, v7.4s\n"
+    "fmla v23.4s, v21.4s, v5.4s\n"
+    "fmla v19.4s, v21.4s, v10.4s\n"
+    "fmla v14.4s, v21.4s, v6.4s\n"
+    "fmla v17.4s, v21.4s, v8.4s\n"
+    "fmla v1.4s, v21.4s, v11.4s\n"
+    "str s18, [x23, x28]\n"
+    "mov v16.16b, v13.16b\n"
+    "fmla v2.4s, v21.4s, v9.4s\n"
+    "ldr x23, [%[outptrs], 8]\n"
+    "fmla v23.4s, v20.4s, v7.4s\n"
+    "fmla v14.4s, v20.4s, v8.4s\n"
+    "fmla v16.4s, v25.4s, v12.4s\n"
+    "ldr s25, [x19, x27]\n"
+    "fmla v17.4s, v20.4s, v10.4s\n"
+    "ldr x19, [%[inptrs], 128]\n"
+    "fmla v2.4s, v20.4s, v11.4s\n"
+    "fmla v3.4s, v20.4s, v9.4s\n"
+    "fmla v14.4s, v26.4s, v10.4s\n"
+    "fmla v0.4s, v27.4s, v6.4s\n"
+    "mov v15.16b, v13.16b\n"
+    "fmla v19.4s, v30.4s, v5.4s\n"
+    "fmla v1.4s, v30.4s, v6.4s\n"
+    "fmla v16.4s, v30.4s, v9.4s\n"
+    "fmla v3.4s, v26.4s, v11.4s\n"
+    "ldr s29, [x18, x27]\n"
+    "fmla v15.4s, v21.4s, v12.4s\n"
+    "ldr s27, [x17, x27]\n"
+    "fmla v0.4s, v30.4s, v8.4s\n"
+    "ldr s28, [x22, x27]\n"
+    "fmla v22.4s, v24.4s, v4.4s\n"
+    "ldr x18, [%[inptrs], 88]\n"
+    "fmla v19.4s, v24.4s, v7.4s\n"
+    "ldr x22, [%[inptrs], 256]\n"
+    "fmla v17.4s, v24.4s, v5.4s\n"
+    "ldr x17, [%[inptrs], 0]\n"
+    "fmla v0.4s, v24.4s, v10.4s\n"
+    "fmla v1.4s, v24.4s, v8.4s\n"
+    "str s22, [x24, x28]\n"
+    "mov v18.16b, v13.16b\n"
+    "fmla v2.4s, v24.4s, v6.4s\n"
+    "ldr x24, [%[outptrs], 40]\n"
+    "fmla v16.4s, v24.4s, v11.4s\n"
+    "fmla v15.4s, v24.4s, v9.4s\n"
+    "fmla v18.4s, v20.4s, v12.4s\n"
+    "ldr s22, [x21, x27]\n"
+    "fmla v23.4s, v25.4s, v4.4s\n"
+    "ldr x21, [%[inptrs], 216]\n"
+    "fmla v17.4s, v25.4s, v7.4s\n"
+    "fmla v14.4s, v25.4s, v5.4s\n"
+    "fmla v1.4s, v25.4s, v10.4s\n"
+    "fmla v2.4s, v25.4s, v8.4s\n"
+    "fmla v3.4s, v25.4s, v6.4s\n"
+    "fmla v15.4s, v25.4s, v11.4s\n"
+    "str s23, [x23, x28]\n"
+    "mov v21.16b, v13.16b\n"
+    "fmla v18.4s, v25.4s, v9.4s\n"
+    "ldr x23, [%[outptrs], 16]\n"
+    "fmla v14.4s, v29.4s, v7.4s\n"
+    "fmla v2.4s, v29.4s, v10.4s\n"
+    "fmla v21.4s, v24.4s, v12.4s\n"
+    "ldr s30, [x20, x27]\n"
+    "fmla v3.4s, v29.4s, v8.4s\n"
+    "ldr x20, [%[inptrs], 176]\n"
+    "fmla v18.4s, v29.4s, v11.4s\n"
+    "ldr s31, [x19, x27]\n"
+    "fmla v0.4s, v28.4s, v5.4s\n"
+    "ldr x19, [%[inptrs], 136]\n"
+    "fmla v16.4s, v28.4s, v6.4s\n"
+    "ldr s26, [x18, x27]\n"
+    "fmla v3.4s, v27.4s, v10.4s\n"
+    "ldr s23, [x22, x27]\n"
+    "fmla v19.4s, v22.4s, v4.4s\n"
+    "ldr x22, [%[inptrs], 264]\n"
+    "fmla v0.4s, v22.4s, v7.4s\n"
+    "ldr x18, [%[inptrs], 48]\n"
+    "fmla v1.4s, v22.4s, v5.4s\n"
+    "fmla v16.4s, v22.4s, v8.4s\n"
+    "fmla v15.4s, v22.4s, v6.4s\n"
+    "fmla v21.4s, v22.4s, v9.4s\n"
+    "str s19, [x25, x28]\n"
+    "mov v24.16b, v13.16b\n"
+    "mov v20.16b, v13.16b\n"
+    "ldr s27, [x21, x27]\n"
+    "fmla v17.4s, v30.4s, v4.4s\n"
+    "ldr x21, [%[inptrs], 224]\n"
+    "fmla v24.4s, v25.4s, v12.4s\n"
+    "ldr s28, [x20, x27]\n"
+    "fmla v1.4s, v30.4s, v7.4s\n"
+    "ldr x20, [%[inptrs], 184]\n"
+    "fmla v2.4s, v30.4s, v5.4s\n"
+    "ldr x25, [%[outptrs], 72]\n"
+    "str s17, [x24, x28]\n"
+    "fmla v16.4s, v30.4s, v10.4s\n"
+    "fmla v15.4s, v30.4s, v8.4s\n"
+    "ldr s22, [x19, x27]\n"
+    "fmla v18.4s, v30.4s, v6.4s\n"
+    "ldr x24, [%[outptrs], 48]\n"
+    "fmla v21.4s, v30.4s, v11.4s\n"
+    "ldr x19, [%[inptrs], 96]\n"
+    "fmla v24.4s, v30.4s, v9.4s\n"
+    "fmla v20.4s, v30.4s, v12.4s\n"
+    "fmla v14.4s, v31.4s, v4.4s\n"
+    "ldr s30, [x22, x27]\n"
+    "fmla v2.4s, v31.4s, v7.4s\n"
+    "ldr s19, [x21, x27]\n"
+    "fmla v3.4s, v31.4s, v5.4s\n"
+    "ldr x22, [%[inptrs], 272]\n"
+    "fmla v15.4s, v31.4s, v10.4s\n"
+    "ldr x21, [%[inptrs], 232]\n"
+    "str s14, [x23, x28]\n"
+    "fmla v18.4s, v31.4s, v8.4s\n"
+    "fmla v24.4s, v31.4s, v11.4s\n"
+    "ldr s31, [x20, x27]\n"
+    "fmla v3.4s, v26.4s, v7.4s\n"
+    "ldr s17, [x22, x27]\n"
+    "fmla v0.4s, v23.4s, v4.4s\n"
+    "ldr x22, [%[inptrs], 280]\n"
+    "fmla v18.4s, v26.4s, v10.4s\n"
+    "ldr s14, [x21, x27]\n"
+    "fmla v16.4s, v23.4s, v5.4s\n"
+    "ldr x23, [%[outptrs], 24]\n"
+    "fmla v21.4s, v23.4s, v6.4s\n"
+    "ldr s26, [x22, x27]\n"
+    "str s0, [x26, x28]\n"
+    "fmla v1.4s, v27.4s, v4.4s\n"
+    "fmla v15.4s, v27.4s, v5.4s\n"
+    "ldr s13, [%[wbptr]]\n"
+    "fmla v16.4s, v27.4s, v7.4s\n"
+    "ldr x26, [%[outptrs], 104]\n"
+    "fmla v21.4s, v27.4s, v8.4s\n"
+    "add x27, x27, #4\n"
+    "str s1, [x25, x28]\n"
+    "fmla v24.4s, v27.4s, v6.4s\n"
+    "fmla v20.4s, v27.4s, v9.4s\n"
+    "ldr s12, [%[wbptr], #4]\n"
+    "fmla v2.4s, v28.4s, v4.4s\n"
+    "ldr s29, [x17, x27]\n"
+    "fmla v15.4s, v28.4s, v7.4s\n"
+    "ldr s27, [x18, x27]\n"
+    "fmla v18.4s, v28.4s, v5.4s\n"
+    "ldr x25, [%[outptrs], 80]\n"
+    "fmla v21.4s, v28.4s, v10.4s\n"
+    "ldr x17, [%[inptrs], 8]\n"
+    "str s2, [x24, x28]\n"
+    "fmla v24.4s, v28.4s, v8.4s\n"
+    "fmla v20.4s, v28.4s, v11.4s\n"
+    "ldr s9, [%[wbptr], #16]\n"
+    "fmla v3.4s, v22.4s, v4.4s\n"
+    "ldr s28, [x17, x27]\n"
+    "fmla v18.4s, v22.4s, v7.4s\n"
+    "ldr s25, [x19, x27]\n"
+    "fmla v24.4s, v22.4s, v10.4s\n"
+    "ldr x24, [%[outptrs], 56]\n"
+    "fmla v16.4s, v30.4s, v4.4s\n"
+    "ldr s11, [%[wbptr], #8]\n"
+    "str s3, [x23, x28]\n"
+    "fmla v21.4s, v30.4s, v5.4s\n"
+    "fmla v20.4s, v30.4s, v6.4s\n"
+    "ldr x18, [%[inptrs], 56]\n"
+    "fmla v15.4s, v19.4s, v4.4s\n"
+    "ldr x17, [%[inptrs], 16]\n"
+    "str s16, [x26, x28]\n"
+    "fmla v24.4s, v19.4s, v5.4s\n"
+    "fmla v21.4s, v19.4s, v7.4s\n"
+    "ldr s16, [x18, x27]\n"
+    "fmla v20.4s, v19.4s, v8.4s\n"
+    "ldr s6, [%[wbptr], #28]\n"
+    "str s15, [x25, x28]\n"
+    "fmla v18.4s, v31.4s, v4.4s\n"
+    "fmla v24.4s, v31.4s, v7.4s\n"
+    "ldr s15, [x17, x27]\n"
+    "fmla v21.4s, v17.4s, v4.4s\n"
+    "ldr x25, [%[outptrs], 88]\n"
+    "fmla v20.4s, v31.4s, v10.4s\n"
+    "ldr s8, [%[wbptr], #20]\n"
+    "str s18, [x24, x28]\n"
+    "mov v18.16b, v13.16b\n"
+    "fmla v24.4s, v14.4s, v4.4s\n"
+    "ldr x26, [%[outptrs], 112]\n"
+    "mov v22.16b, v13.16b\n"
+    "ldr x20, [%[inptrs], 144]\n"
+    "str s21, [x26, x28]\n"
+    "fmla v20.4s, v17.4s, v5.4s\n"
+    "mov v23.16b, v13.16b\n"
+    "ldr s10, [%[wbptr], #12]\n"
+    "str s24, [x25, x28]\n"
+    "mov v19.16b, v13.16b\n"
+    "mov v17.16b, v13.16b\n"
+    "ldr s21, [x20, x27]\n"
+    "fmla v20.4s, v14.4s, v7.4s\n"
+    "ldr s5, [%[wbptr], #32]\n"
+    "mov v14.16b, v13.16b\n"
+    "ldr x26, [%[outptrs], 120]\n"
+    "mov v0.16b, v13.16b\n"
+    "ldr x19, [%[inptrs], 104]\n"
+    "mov v1.16b, v13.16b\n"
+    "mov v2.16b, v13.16b\n"
+    "fmla v20.4s, v26.4s, v4.4s\n"
+    "ldr s7, [%[wbptr], #24]\n"
+    "fmla v18.4s, v29.4s, v12.4s\n"
+    "ldr s29, [x19, x27]\n"
+    "str s20, [x26, x28]\n"
+    "ldr s4, [%[wbptr], #36]\n"
+    "add x28, x28, #4\n"
+    "bne 5b\n"
+    "6:\n"
+    "mov v3.16b, v13.16b\n"
+    "ldr x18, [%[inptrs], 64]\n"
+    "fmla v18.4s, v27.4s, v9.4s\n"
+    "ldr x17, [%[inptrs], 24]\n"
+    "fmla v22.4s, v27.4s, v12.4s\n"
+    "ldr s30, [x18, x27]\n"
+    "fmla v23.4s, v28.4s, v12.4s\n"
+    "ldr x21, [%[inptrs], 192]\n"
+    "fmla v19.4s, v25.4s, v12.4s\n"
+    "ldr x20, [%[inptrs], 152]\n"
+    "fmla v18.4s, v28.4s, v11.4s\n"
+    "ldr s24, [x17, x27]\n"
+    "fmla v22.4s, v25.4s, v9.4s\n"
+    "ldr x19, [%[inptrs], 112]\n"
+    "fmla v23.4s, v16.4s, v9.4s\n"
+    "ldr x18, [%[inptrs], 72]\n"
+    "fmla v17.4s, v16.4s, v12.4s\n"
+    "ldr x17, [%[inptrs], 32]\n"
+    "fmla v18.4s, v25.4s, v6.4s\n"
+    "ldr s31, [x21, x27]\n"
+    "fmla v22.4s, v16.4s, v11.4s\n"
+    "ldr x22, [%[inptrs], 240]\n"
+    "fmla v23.4s, v15.4s, v11.4s\n"
+    "ldr x21, [%[inptrs], 200]\n"
+    "fmla v14.4s, v15.4s, v12.4s\n"
+    "ldr x23, [%[outptrs], 0]\n"
+    "fmla v18.4s, v16.4s, v8.4s\n"
+    "ldr s25, [x20, x27]\n"
+    "fmla v22.4s, v21.4s, v6.4s\n"
+    "ldr x20, [%[inptrs], 160]\n"
+    "fmla v19.4s, v21.4s, v9.4s\n"
+    "ldr x24, [%[outptrs], 32]\n"
+    "fmla v0.4s, v21.4s, v12.4s\n"
+    "ldr s21, [x19, x27]\n"
+    "fmla v18.4s, v15.4s, v10.4s\n"
+    "ldr s20, [x18, x27]\n"
+    "fmla v22.4s, v29.4s, v8.4s\n"
+    "ldr x19, [%[inptrs], 120]\n"
+    "fmla v23.4s, v29.4s, v6.4s\n"
+    "ldr x18, [%[inptrs], 80]\n"
+    "fmla v19.4s, v29.4s, v11.4s\n"
+    "ldr x25, [%[outptrs], 64]\n"
+    "fmla v18.4s, v29.4s, v5.4s\n"
+    "ldr x26, [%[outptrs], 96]\n"
+    "fmla v17.4s, v29.4s, v9.4s\n"
+    "add %[wbptr], %[wbptr], #40\n"
+    "fmla v1.4s, v29.4s, v12.4s\n"
+    "ldr s26, [x17, x27]\n"
+    "fmla v22.4s, v30.4s, v10.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v18.4s, v30.4s, v7.4s\n"
+    "ldr x17, [%[inptrs], 40]\n"
+    "fmla v23.4s, v30.4s, v8.4s\n"
+    "fmla v17.4s, v30.4s, v11.4s\n"
+    "fmla v14.4s, v30.4s, v9.4s\n"
+    "fmla v2.4s, v30.4s, v12.4s\n"
+    "mov v16.16b, v13.16b\n"
+    "fmla v3.4s, v24.4s, v12.4s\n"
+    "fmla v19.4s, v31.4s, v6.4s\n"
+    "fmla v0.4s, v31.4s, v9.4s\n"
+    "mov v15.16b, v13.16b\n"
+    "fmla v23.4s, v24.4s, v10.4s\n"
+    "fmla v14.4s, v24.4s, v11.4s\n"
+    "ldr s27, [x22, x27]\n"
+    "fmla v22.4s, v25.4s, v5.4s\n"
+    "ldr x22, [%[inptrs], 248]\n"
+    "fmla v19.4s, v25.4s, v8.4s\n"
+    "fmla v17.4s, v25.4s, v6.4s\n"
+    "fmla v0.4s, v25.4s, v11.4s\n"
+    "fmla v1.4s, v25.4s, v9.4s\n"
+    "fmla v16.4s, v25.4s, v12.4s\n"
+    "ldr s30, [x21, x27]\n"
+    "fmla v18.4s, v21.4s, v4.4s\n"
+    "ldr x21, [%[inptrs], 208]\n"
+    "fmla v22.4s, v21.4s, v7.4s\n"
+    "fmla v23.4s, v21.4s, v5.4s\n"
+    "fmla v19.4s, v21.4s, v10.4s\n"
+    "fmla v17.4s, v21.4s, v8.4s\n"
+    "fmla v14.4s, v21.4s, v6.4s\n"
+    "fmla v1.4s, v21.4s, v11.4s\n"
+    "str s18, [x23, x28]\n"
+    "mov v18.16b, v13.16b\n"
+    "fmla v2.4s, v21.4s, v9.4s\n"
+    "ldr x23, [%[outptrs], 8]\n"
+    "fmla v15.4s, v21.4s, v12.4s\n"
+    "ldr s24, [x20, x27]\n"
+    "fmla v23.4s, v20.4s, v7.4s\n"
+    "ldr x20, [%[inptrs], 168]\n"
+    "fmla v17.4s, v20.4s, v10.4s\n"
+    "fmla v14.4s, v20.4s, v8.4s\n"
+    "fmla v2.4s, v20.4s, v11.4s\n"
+    "fmla v3.4s, v20.4s, v9.4s\n"
+    "fmla v18.4s, v20.4s, v12.4s\n"
+    "ldr s25, [x19, x27]\n"
+    "fmla v0.4s, v27.4s, v6.4s\n"
+    "ldr s29, [x18, x27]\n"
+    "fmla v14.4s, v26.4s, v10.4s\n"
+    "ldr x19, [%[inptrs], 128]\n"
+    "fmla v3.4s, v26.4s, v11.4s\n"
+    "ldr s27, [x17, x27]\n"
+    "fmla v19.4s, v30.4s, v5.4s\n"
+    "ldr x18, [%[inptrs], 88]\n"
+    "fmla v0.4s, v30.4s, v8.4s\n"
+    "fmla v1.4s, v30.4s, v6.4s\n"
+    "fmla v16.4s, v30.4s, v9.4s\n"
+    "ldr s28, [x22, x27]\n"
+    "fmla v22.4s, v24.4s, v4.4s\n"
+    "ldr x22, [%[inptrs], 256]\n"
+    "fmla v19.4s, v24.4s, v7.4s\n"
+    "fmla v17.4s, v24.4s, v5.4s\n"
+    "fmla v0.4s, v24.4s, v10.4s\n"
+    "fmla v1.4s, v24.4s, v8.4s\n"
+    "fmla v2.4s, v24.4s, v6.4s\n"
+    "fmla v16.4s, v24.4s, v11.4s\n"
+    "str s22, [x24, x28]\n"
+    "mov v21.16b, v13.16b\n"
+    "fmla v15.4s, v24.4s, v9.4s\n"
+    "ldr x24, [%[outptrs], 40]\n"
+    "fmla v23.4s, v25.4s, v4.4s\n"
+    "fmla v17.4s, v25.4s, v7.4s\n"
+    "fmla v21.4s, v24.4s, v12.4s\n"
+    "ldr s22, [x21, x27]\n"
+    "fmla v14.4s, v25.4s, v5.4s\n"
+    "ldr x21, [%[inptrs], 216]\n"
+    "fmla v1.4s, v25.4s, v10.4s\n"
+    "fmla v2.4s, v25.4s, v8.4s\n"
+    "str s23, [x23, x28]\n"
+    "mov v24.16b, v13.16b\n"
+    "mov v20.16b, v13.16b\n"
+    "ldr x23, [%[outptrs], 16]\n"
+    "fmla v3.4s, v25.4s, v6.4s\n"
+    "fmla v15.4s, v25.4s, v11.4s\n"
+    "fmla v18.4s, v25.4s, v9.4s\n"
+    "fmla v24.4s, v25.4s, v12.4s\n"
+    "fmla v14.4s, v29.4s, v7.4s\n"
+    "ldr s30, [x20, x27]\n"
+    "fmla v2.4s, v29.4s, v10.4s\n"
+    "ldr x20, [%[inptrs], 176]\n"
+    "fmla v3.4s, v29.4s, v8.4s\n"
+    "fmla v0.4s, v28.4s, v5.4s\n"
+    "fmla v18.4s, v29.4s, v11.4s\n"
+    "ldr s31, [x19, x27]\n"
+    "fmla v16.4s, v28.4s, v6.4s\n"
+    "ldr s26, [x18, x27]\n"
+    "fmla v19.4s, v22.4s, v4.4s\n"
+    "ldr x19, [%[inptrs], 136]\n"
+    "fmla v3.4s, v27.4s, v10.4s\n"
+    "ldr s23, [x22, x27]\n"
+    "fmla v0.4s, v22.4s, v7.4s\n"
+    "ldr x22, [%[inptrs], 264]\n"
+    "fmla v1.4s, v22.4s, v5.4s\n"
+    "fmla v16.4s, v22.4s, v8.4s\n"
+    "str s19, [x25, x28]\n"
+    "fmla v15.4s, v22.4s, v6.4s\n"
+    "fmla v21.4s, v22.4s, v9.4s\n"
+    "ldr s27, [x21, x27]\n"
+    "fmla v17.4s, v30.4s, v4.4s\n"
+    "ldr s28, [x20, x27]\n"
+    "fmla v1.4s, v30.4s, v7.4s\n"
+    "ldr x21, [%[inptrs], 224]\n"
+    "fmla v2.4s, v30.4s, v5.4s\n"
+    "ldr x20, [%[inptrs], 184]\n"
+    "fmla v16.4s, v30.4s, v10.4s\n"
+    "ldr x25, [%[outptrs], 72]\n"
+    "str s17, [x24, x28]\n"
+    "fmla v15.4s, v30.4s, v8.4s\n"
+    "fmla v18.4s, v30.4s, v6.4s\n"
+    "ldr s22, [x19, x27]\n"
+    "fmla v21.4s, v30.4s, v11.4s\n"
+    "ldr x24, [%[outptrs], 48]\n"
+    "fmla v24.4s, v30.4s, v9.4s\n"
+    "fmla v20.4s, v30.4s, v12.4s\n"
+    "fmla v14.4s, v31.4s, v4.4s\n"
+    "ldr s30, [x22, x27]\n"
+    "fmla v2.4s, v31.4s, v7.4s\n"
+    "ldr s19, [x21, x27]\n"
+    "fmla v3.4s, v31.4s, v5.4s\n"
+    "ldr x22, [%[inptrs], 272]\n"
+    "fmla v15.4s, v31.4s, v10.4s\n"
+    "ldr x21, [%[inptrs], 232]\n"
+    "str s14, [x23, x28]\n"
+    "fmla v18.4s, v31.4s, v8.4s\n"
+    "fmla v24.4s, v31.4s, v11.4s\n"
+    "ldr s31, [x20, x27]\n"
+    "fmla v3.4s, v26.4s, v7.4s\n"
+    "ldr s17, [x22, x27]\n"
+    "fmla v0.4s, v23.4s, v4.4s\n"
+    "ldr x22, [%[inptrs], 280]\n"
+    "fmla v18.4s, v26.4s, v10.4s\n"
+    "ldr s14, [x21, x27]\n"
+    "fmla v16.4s, v23.4s, v5.4s\n"
+    "ldr x23, [%[outptrs], 24]\n"
+    "fmla v21.4s, v23.4s, v6.4s\n"
+    "ldr s26, [x22, x27]\n"
+    "str s0, [x26, x28]\n"
+    "fmla v1.4s, v27.4s, v4.4s\n"
+    "fmla v15.4s, v27.4s, v5.4s\n"
+    "ldr x26, [%[outptrs], 104]\n"
+    "fmla v16.4s, v27.4s, v7.4s\n"
+    "add x27, x27, #4\n"
+    "fmla v21.4s, v27.4s, v8.4s\n"
+    "fmla v24.4s, v27.4s, v6.4s\n"
+    "str s1, [x25, x28]\n"
+    "fmla v20.4s, v27.4s, v9.4s\n"
+    "fmla v2.4s, v28.4s, v4.4s\n"
+    "ldr x25, [%[outptrs], 80]\n"
+    "fmla v15.4s, v28.4s, v7.4s\n"
+    "fmla v18.4s, v28.4s, v5.4s\n"
+    "fmla v21.4s, v28.4s, v10.4s\n"
+    "fmla v24.4s, v28.4s, v8.4s\n"
+    "fmla v20.4s, v28.4s, v11.4s\n"
+    "fmla v3.4s, v22.4s, v4.4s\n"
+    "str s2, [x24, x28]\n"
+    "fmla v16.4s, v30.4s, v4.4s\n"
+    "fmla v18.4s, v22.4s, v7.4s\n"
+    "ldr x24, [%[outptrs], 56]\n"
+    "fmla v24.4s, v22.4s, v10.4s\n"
+    "fmla v21.4s, v30.4s, v5.4s\n"
+    "str s3, [x23, x28]\n"
+    "fmla v20.4s, v30.4s, v6.4s\n"
+    "str s16, [x26, x28]\n"
+    "fmla v15.4s, v19.4s, v4.4s\n"
+    "fmla v18.4s, v31.4s, v4.4s\n"
+    "ldr x26, [%[outptrs], 112]\n"
+    "fmla v21.4s, v19.4s, v7.4s\n"
+    "fmla v24.4s, v19.4s, v5.4s\n"
+    "fmla v20.4s, v19.4s, v8.4s\n"
+    "str s15, [x25, x28]\n"
+    "str s18, [x24, x28]\n"
+    "ldr x25, [%[outptrs], 88]\n"
+    "fmla v24.4s, v31.4s, v7.4s\n"
+    "fmla v21.4s, v17.4s, v4.4s\n"
+    "fmla v20.4s, v31.4s, v10.4s\n"
+    "str s21, [x26, x28]\n"
+    "fmla v20.4s, v17.4s, v5.4s\n"
+    "ldr x26, [%[outptrs], 120]\n"
+    "fmla v24.4s, v14.4s, v4.4s\n"
+    "fmla v20.4s, v14.4s, v7.4s\n"
+    "str s24, [x25, x28]\n"
+    "fmla v20.4s, v26.4s, v4.4s\n"
+    "str s20, [x26, x28]\n"
+    "add x28, x28, #4\n"
+    "7:\n"
+    : [wbptr] "+r" (weight_bias_ptr)
+    : [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs), [inptrs] "r" (inptrs)
+    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+  );
+}
 
-    asm volatile (
-      "qW22 .req q0\n" "vW22 .req v0\n"
-      "qU64 .req q1\n" "qU35 .req q1\n" "qV41 .req q1\n"
-      "vU64 .req v1\n" "vU35 .req v1\n" "vV41 .req v1\n"
-      "qU34 .req q2\n" "qU21 .req q2\n" "qV43 .req q2\n"
-      "vU34 .req v2\n" "vU21 .req v2\n" "vV43 .req v2\n"
-      "qW21 .req q3\n" "vW21 .req v3\n"
-      "qU24 .req q4\n" "qU54 .req q4\n" "qV31 .req q4\n"
-      "vU24 .req v4\n" "vU54 .req v4\n" "vV31 .req v4\n"
-      "qV12 .req q5\n" "qU61 .req q5\n" "vV12 .req v5\n" "vU61 .req v5\n"
-      "qU26 .req q6\n" "qV32 .req q6\n" "vU26 .req v6\n" "vV32 .req v6\n"
-      "qU36 .req q7\n" "qU51 .req q7\n" "qU66 .req q7\n" "qU12 .req q7\n"
-      "vU36 .req v7\n" "vU51 .req v7\n" "vU66 .req v7\n" "vU12 .req v7\n"
-      "qV14 .req q8\n" "qV11 .req q8\n" "qU65 .req q8\n"
-      "vV14 .req v8\n" "vV11 .req v8\n" "vU65 .req v8\n"
-      "qU15 .req q9\n" "qU22 .req q9\n" "qU45 .req q9\n"
-      "vU15 .req v9\n" "vU22 .req v9\n" "vU45 .req v9\n"
-      "qV22 .req q10\n" "qU14 .req q10\n" "vV22 .req v10\n" "vU14 .req v10\n"
-      "qU44 .req q11\n" "qU43 .req q11\n" "qU11 .req q11\n"
-      "vU44 .req v11\n" "vU43 .req v11\n" "vU11 .req v11\n"
-      "qV24 .req q12\n" "qV42 .req q12\n" "vV24 .req v12\n" "vV42 .req v12\n"
-      "qW31 .req q13\n" "vW31 .req v13\n" "qW13 .req q14\n" "vW13 .req v14\n"
-      "qU33 .req q15\n" "qU62 .req q15\n" "qU25 .req q15\n" "qU56 .req q15\n"
-      "vU33 .req v15\n" "vU62 .req v15\n" "vU25 .req v15\n" "vU56 .req v15\n"
-      "qW33 .req q16\n" "vW33 .req v16\n"
-      "qU42 .req q17\n" "qU16 .req q17\n" "qV44 .req q17\n"
-      "vU42 .req v17\n" "vU16 .req v17\n" "vV44 .req v17\n"
-      "qU63 .req q18\n" "qU31 .req q18\n" "qV34 .req q18\n"
-      "vU63 .req v18\n" "vU31 .req v18\n" "vV34 .req v18\n"
-      "qW11 .req q19\n" "vW11 .req v19\n" "qU41 .req q20\n" "qV13 .req q20\n"
-      "vU41 .req v20\n" "vV13 .req v20\n" "qV33 .req q21\n" "vV33 .req v21\n"
-      "qU46 .req q22\n" "qU32 .req q22\n" "qU13 .req q22\n"
-      "vU46 .req v22\n" "vU32 .req v22\n" "vU13 .req v22\n" "qW23 .req q23\n"
-      "vW23 .req v23\n" "qV23 .req q24\n" "vV23 .req v24\n"
-      "qV21 .req q25\n" "qU55 .req q25\n" "vV21 .req v25\n" "vU55 .req v25\n"
-      "qW12 .req q26\n" "vW12 .req v26\n" "qW32 .req q27\n" "vW32 .req v27\n"
-      "qU23 .req q28\n" "qU52 .req q28\n"
-      "vU23 .req v28\n" "vU52 .req v28\n" "qU53 .req q29\n" "vU53 .req v29\n"
+template <>
+template <>
+void Conv::execute_tile<ActivationFunction::ReLU>(
+  int n_channels,
+  const void *weight_bias_ptr,
+  const float *input,
+  const unsigned int input_row_stride,
+  const unsigned int input_col_stride,
+  float *output,
+  const unsigned int output_row_stride,
+  const unsigned int output_col_stride
+)
+{
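+  // 3x3 depthwise convolution at unit stride: each channel consumes a 6x6
+  // input patch and produces a 4x4 output tile, with ReLU fused in-register.
+  // The stream at wbptr holds one bias value followed by the nine kernel
+  // weights per channel. Illustrative scalar reference for a single channel
+  // (not part of the generated kernel):
+  //
+  //   for (int i = 0; i < 4; i++)
+  //     for (int j = 0; j < 4; j++) {
+  //       float acc = bias;
+  //       for (int ki = 0; ki < 3; ki++)
+  //         for (int kj = 0; kj < 3; kj++)
+  //           acc += input[i + ki][j + kj] * weight[ki][kj];
+  //       output[i][j] = std::max(acc, 0.0f);   // ReLU
+  //     }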
+  __asm __volatile(
+    "add x9, %[inptr0], %[input_row_stride]\n"
+    "add x28, %[input_col_stride1], %[input_col_stride1]\n"
+    "add x16, %[outptr0], %[output_row_stride]\n"
+    "add x24, x9, %[input_row_stride]\n"
+    "add x25, x28, #64\n"
+    "add x23, x28, %[input_col_stride1]\n"
+    "add x26, x24, %[input_row_stride]\n"
+    "add x11, x23, #64\n"
+    "add x12, x23, %[input_col_stride1]\n"
+    "add x10, x26, %[input_row_stride]\n"
+    "add x13, x12, #64\n"
+    "add x14, x12, %[input_col_stride1]\n"
+    "add x27, x10, %[input_row_stride]\n"
+    "add x15, x14, #64\n"
+    "add x17, x16, %[output_row_stride]\n"
+    "add x18, x17, %[output_row_stride]\n"
+    "add x19, %[output_col_stride1], %[output_col_stride1]\n"
+    "and x21, %[n_channels], #3\n"
+    "add x20, x19, %[output_col_stride1]\n"
+    "lsr x22, %[n_channels], #2\n"
+    "cbz x22, 4f\n"
+    "1:\n"
+    "ldr q21, [%[wbptr]]\n"
+    "subs x22, x22, #1\n"
+    "mov v7.16b, v21.16b\n"
+    "ldr q20, [%[wbptr], #16]\n"
+    "mov v3.16b, v21.16b\n"
+    "ldr q14, [%[wbptr], #32]\n"
+    "mov v6.16b, v21.16b\n"
+    "ldr q13, [%[wbptr], #48]\n"
+    "mov v15.16b, v21.16b\n"
+    "ldr q17, [%[wbptr], #64]\n"
+    "mov v2.16b, v21.16b\n"
+    "ldr q12, [%[wbptr], #80]\n"
+    "mov v5.16b, v21.16b\n"
+    "ldr q11, [%[wbptr], #96]\n"
+    "mov v0.16b, v21.16b\n"
+    "ldr q10, [%[wbptr], #112]\n"
+    "mov v16.16b, v21.16b\n"
+    "ldr q9, [%[wbptr], #128]\n"
+    "mov v1.16b, v21.16b\n"
+    "ldr q8, [%[wbptr], #144]\n"
+    "mov v4.16b, v21.16b\n"
+    "ldr q22, [%[inptr0]]\n"
+    "fmla v7.4s, v22.4s, v20.4s\n"
+    "ldr q19, [x9]\n"
+    "fmla v3.4s, v19.4s, v20.4s\n"
+    "ldr q23, [%[inptr0], %[input_col_stride1]]\n"
+    "fmla v6.4s, v23.4s, v20.4s\n"
+    "ldr q18, [x24]\n"
+    "fmla v7.4s, v19.4s, v17.4s\n"
+    "ldr q27, [x9, %[input_col_stride1]]\n"
+    "fmla v3.4s, v18.4s, v17.4s\n"
+    "ldr q28, [%[inptr0], x28]\n"
+    "fmla v15.4s, v18.4s, v20.4s\n"
+    "ldr q25, [x26]\n"
+    "fmla v7.4s, v23.4s, v14.4s\n"
+    "ldr q22, [x24, %[input_col_stride1]]\n"
+    "fmla v3.4s, v27.4s, v14.4s\n"
+    "prfm pldl1keep, [%[inptr0], #64]\n"
+    "prfm pldl1keep, [x9, #64]\n"
+    "prfm pldl1keep, [%[inptr0], x8]\n"
+    "fmla v7.4s, v18.4s, v10.4s\n"
+    "prfm pldl1keep, [x24, #64]\n"
+    "prfm pldl1keep, [x9, x8]\n"
+    "prfm pldl1keep, [%[inptr0], x25]\n"
+    "prfm pldl1keep, [x26, #64]\n"
+    "prfm pldl1keep, [x24, x8]\n"
+    "fmla v7.4s, v27.4s, v12.4s\n"
+    "beq 3f\n"
+    "2:\n"
+    "mov v18.16b, v21.16b\n"
+    "ldr q23, [x9, x28]\n"
+    "mov v19.16b, v21.16b\n"
+    "prfm pldl1keep, [x9, x25]\n"
+    "fmla v6.4s, v27.4s, v17.4s\n"
+    "prfm pldl1keep, [%[inptr0], x11]\n"
+    "fmla v2.4s, v27.4s, v20.4s\n"
+    "ldr q24, [%[inptr0], x23]\n"
+    "fmla v7.4s, v28.4s, v13.4s\n"
+    "prfm pldl1keep, [x10, #64]\n"
+    "fmla v6.4s, v28.4s, v14.4s\n"
+    "prfm pldl1keep, [x26, x8]\n"
+    "fmla v5.4s, v28.4s, v20.4s\n"
+    "ldr q26, [x10]\n"
+    "fmla v3.4s, v25.4s, v10.4s\n"
+    "prfm pldl1keep, [x24, x25]\n"
+    "fmla v15.4s, v25.4s, v17.4s\n"
+    "prfm pldl1keep, [x9, x11]\n"
+    "fmla v0.4s, v25.4s, v20.4s\n"
+    "ldr q25, [x26, %[input_col_stride1]]\n"
+    "fmla v7.4s, v22.4s, v9.4s\n"
+    "prfm pldl1keep, [%[inptr0], x13]\n"
+    "fmla v3.4s, v22.4s, v12.4s\n"
+    "prfm pldl1keep, [x27, #64]\n"
+    "fmla v6.4s, v22.4s, v10.4s\n"
+    "prfm pldl1keep, [x10, x8]\n"
+    "fmla v15.4s, v22.4s, v14.4s\n"
+    "prfm pldl1keep, [x26, x25]\n"
+    "fmla v2.4s, v22.4s, v17.4s\n"
+    "prfm pldl1keep, [x24, x11]\n"
+    "fmla v16.4s, v22.4s, v20.4s\n"
+    "ldr q22, [x24, x28]\n"
+    "fmla v7.4s, v23.4s, v11.4s\n"
+    "prfm pldl1keep, [x9, x13]\n"
+    "fmla v3.4s, v23.4s, v13.4s\n"
+    "prfm pldl1keep, [%[inptr0], x15]\n"
+    "fmla v6.4s, v23.4s, v12.4s\n"
+    "prfm pldl1keep, [x27, x8]\n"
+    "fmla v2.4s, v23.4s, v14.4s\n"
+    "prfm pldl1keep, [x10, x25]\n"
+    "fmla v5.4s, v23.4s, v17.4s\n"
+    "prfm pldl1keep, [x26, x11]\n"
+    "fmla v1.4s, v23.4s, v20.4s\n"
+    "ldr q23, [x9, x23]\n"
+    "fmla v6.4s, v24.4s, v13.4s\n"
+    "prfm pldl1keep, [x24, x13]\n"
+    "fmla v5.4s, v24.4s, v14.4s\n"
+    "prfm pldl1keep, [x9, x15]\n"
+    "fmla v4.4s, v24.4s, v20.4s\n"
+    "ldr q24, [%[inptr0], x12]\n"
+    "fmla v15.4s, v26.4s, v10.4s\n"
+    "prfm pldl1keep, [x27, x25]\n"
+    "fmla v0.4s, v26.4s, v17.4s\n"
+    "ldr q29, [x27]\n"
+    "fmla v3.4s, v25.4s, v9.4s\n"
+    "prfm pldl1keep, [x10, x11]\n"
+    "fmla v15.4s, v25.4s, v12.4s\n"
+    "prfm pldl1keep, [x26, x13]\n"
+    "fmla v2.4s, v25.4s, v10.4s\n"
+    "prfm pldl1keep, [x24, x15]\n"
+    "fmla v0.4s, v25.4s, v14.4s\n"
+    "prfm pldl1keep, [x27, x11]\n"
+    "fmla v16.4s, v25.4s, v17.4s\n"
+    "prfm pldl1keep, [x10, x13]\n"
+    "fmla v18.4s, v25.4s, v20.4s\n"
+    "ldr q26, [x10, %[input_col_stride1]]\n"
+    "fmla v7.4s, v22.4s, v8.4s\n"
+    "prfm pldl1keep, [x26, x15]\n"
+    "fmla v3.4s, v22.4s, v11.4s\n"
+    "prfm pldl1keep, [x27, x13]\n"
+    "fmla v6.4s, v22.4s, v9.4s\n"
+    "prfm pldl1keep, [x10, x15]\n"
+    "fmla v15.4s, v22.4s, v13.4s\n"
+    "prfm pldl1keep, [x27, x15]\n"
+    "fmla v2.4s, v22.4s, v12.4s\n"
+    "add %[wbptr], %[wbptr], #160\n"
+    "fmla v5.4s, v22.4s, v10.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v16.4s, v22.4s, v14.4s\n"
+    "subs x22, x22, #1\n"
+    "fmla v1.4s, v22.4s, v17.4s\n"
+    "fmla v19.4s, v22.4s, v20.4s\n"
+    "mov v22.16b, v21.16b\n"
+    "fmla v6.4s, v23.4s, v11.4s\n"
+    "fmla v2.4s, v23.4s, v13.4s\n"
+    "fmla v5.4s, v23.4s, v12.4s\n"
+    "fmla v1.4s, v23.4s, v14.4s\n"
+    "fmla v4.4s, v23.4s, v17.4s\n"
+    "fmla v22.4s, v23.4s, v20.4s\n"
+    "ldr q27, [x26, x28]\n"
+    "fmla v5.4s, v24.4s, v13.4s\n"
+    "fmla v0.4s, v29.4s, v10.4s\n"
+    "mov v23.16b, v21.16b\n"
+    "fmla v4.4s, v24.4s, v14.4s\n"
+    "mov v25.16b, v21.16b\n"
+    "mov v24.16b, v21.16b\n"
+    "fmla v15.4s, v26.4s, v9.4s\n"
+    "fmla v0.4s, v26.4s, v12.4s\n"
+    "fmla v16.4s, v26.4s, v10.4s\n"
+    "fmla v18.4s, v26.4s, v17.4s\n"
+    "fmla v3.4s, v27.4s, v8.4s\n"
+    "ldr q29, [x24, x23]\n"
+    "fmla v15.4s, v27.4s, v11.4s\n"
+    "fmla v2.4s, v27.4s, v9.4s\n"
+    "fmla v0.4s, v27.4s, v13.4s\n"
+    "fmla v16.4s, v27.4s, v12.4s\n"
+    "fmla v1.4s, v27.4s, v10.4s\n"
+    "fmla v18.4s, v27.4s, v14.4s\n"
+    "fmla v19.4s, v27.4s, v17.4s\n"
+    "fmla v23.4s, v27.4s, v20.4s\n"
+    "fmla v6.4s, v29.4s, v8.4s\n"
+    "ldr q28, [x9, x12]\n"
+    "fmla v2.4s, v29.4s, v11.4s\n"
+    "fmla v5.4s, v29.4s, v9.4s\n"
+    "fmla v16.4s, v29.4s, v13.4s\n"
+    "fmla v1.4s, v29.4s, v12.4s\n"
+    "fmla v4.4s, v29.4s, v10.4s\n"
+    "fmla v19.4s, v29.4s, v14.4s\n"
+    "fmla v22.4s, v29.4s, v17.4s\n"
+    "fmla v25.4s, v29.4s, v20.4s\n"
+    "fmla v5.4s, v28.4s, v11.4s\n"
+    "ldr q21, [%[inptr0], x14]\n"
+    "fmla v1.4s, v28.4s, v13.4s\n"
+    "add %[inptr0], %[inptr0], #16\n"
+    "fmla v4.4s, v28.4s, v12.4s\n"
+    "prfm pldl1keep, [%[inptr0], #64]\n"
+    "fmla v22.4s, v28.4s, v14.4s\n"
+    "ldr q26, [x27, %[input_col_stride1]]\n"
+    "fmla v0.4s, v26.4s, v9.4s\n"
+    "prfm pldl1keep, [%[inptr0], x8]\n"
+    "fmla v4.4s, v21.4s, v13.4s\n"
+    "ldr q21, [x10, x28]\n"
+    "fmla v18.4s, v26.4s, v10.4s\n"
+    "ldr q29, [x26, x23]\n"
+    "fmla v15.4s, v21.4s, v8.4s\n"
+    "prfm pldl1keep, [%[inptr0], x25]\n"
+    "fmla v0.4s, v21.4s, v11.4s\n"
+    "fmla v16.4s, v21.4s, v9.4s\n"
+    "fmla v18.4s, v21.4s, v12.4s\n"
+    "fmla v19.4s, v21.4s, v10.4s\n"
+    "fmla v23.4s, v21.4s, v17.4s\n"
+    "ldr q21, [x24, x12]\n"
+    "fmla v2.4s, v29.4s, v8.4s\n"
+    "fmla v16.4s, v29.4s, v11.4s\n"
+    "fmla v1.4s, v29.4s, v9.4s\n"
+    "fmla v18.4s, v29.4s, v13.4s\n"
+    "fmla v19.4s, v29.4s, v12.4s\n"
+    "fmla v22.4s, v29.4s, v10.4s\n"
+    "fmla v23.4s, v29.4s, v14.4s\n"
+    "fmla v25.4s, v29.4s, v17.4s\n"
+    "fmla v24.4s, v29.4s, v20.4s\n"
+    "ldr q28, [x9, x14]\n"
+    "fmla v5.4s, v21.4s, v8.4s\n"
+    "ldr q27, [x27, x28]\n"
+    "fmla v1.4s, v21.4s, v11.4s\n"
+    "add x9, x9, #16\n"
+    "fmla v4.4s, v21.4s, v9.4s\n"
+    "prfm pldl1keep, [x9, #64]\n"
+    "fmla v19.4s, v21.4s, v13.4s\n"
+    "prfm pldl1keep, [x9, x8]\n"
+    "fmla v22.4s, v21.4s, v12.4s\n"
+    "fmla v25.4s, v21.4s, v14.4s\n"
+    "fmla v4.4s, v28.4s, v11.4s\n"
+    "ldr q20, [x10, x23]\n"
+    "fmla v0.4s, v27.4s, v8.4s\n"
+    "fmla v18.4s, v27.4s, v9.4s\n"
+    "fmla v22.4s, v28.4s, v13.4s\n"
+    "ldr q26, [x26, x12]\n"
+    "fmla v23.4s, v27.4s, v10.4s\n"
+    "ldr q21, [x24, x14]\n"
+    "fmla v16.4s, v20.4s, v8.4s\n"
+    "add x24, x24, #16\n"
+    "fmla v18.4s, v20.4s, v11.4s\n"
+    "prfm pldl1keep, [x24, #64]\n"
+    "fmla v19.4s, v20.4s, v9.4s\n"
+    "prfm pldl1keep, [x24, x8]\n"
+    "fmla v23.4s, v20.4s, v12.4s\n"
+    "fmla v25.4s, v20.4s, v10.4s\n"
+    "fmla v24.4s, v20.4s, v17.4s\n"
+    "ldr q28, [x27, x23]\n"
+    "fmla v1.4s, v26.4s, v8.4s\n"
+    "ldr q20, [x10, x12]\n"
+    "fmla v19.4s, v26.4s, v11.4s\n"
+    "fmla v22.4s, v26.4s, v9.4s\n"
+    "fmla v23.4s, v26.4s, v13.4s\n"
+    "fmla v25.4s, v26.4s, v12.4s\n"
+    "fmla v24.4s, v26.4s, v14.4s\n"
+    "ldr q17, [x26, x14]\n"
+    "fmla v4.4s, v21.4s, v8.4s\n"
+    "ldr q26, [x27, x12]\n"
+    "fmla v22.4s, v21.4s, v11.4s\n"
+    "add x26, x26, #16\n"
+    "fmla v25.4s, v21.4s, v13.4s\n"
+    "ldr q27, [x10, x14]\n"
+    "fmla v18.4s, v28.4s, v8.4s\n"
+    "prfm pldl1keep, [x26, #64]\n"
+    "fmla v23.4s, v28.4s, v9.4s\n"
+    "add x10, x10, #16\n"
+    "fmla v24.4s, v28.4s, v10.4s\n"
+    "ldr q28, [x27, x14]\n"
+    "fmla v19.4s, v20.4s, v8.4s\n"
+    "ldr q21, [%[wbptr]]\n"
+    "fmla v23.4s, v20.4s, v11.4s\n"
+    "add x27, x27, #16\n"
+    "fmla v25.4s, v20.4s, v9.4s\n"
+    "fmla v24.4s, v20.4s, v12.4s\n"
+    "fmla v22.4s, v17.4s, v8.4s\n"
+    "ldr q20, [%[wbptr], #16]\n"
+    "fmla v23.4s, v26.4s, v8.4s\n"
+    "ldr q14, [%[wbptr], #32]\n"
+    "fmla v24.4s, v17.4s, v13.4s\n"
+    "movi v29.16b, #0\n"
+    "fmla v25.4s, v17.4s, v11.4s\n"
+    "ldr q17, [%[wbptr], #64]\n"
+    "fmax v7.4s, v7.4s, v29.4s\n"
+    "fmax v6.4s, v6.4s, v29.4s\n"
+    "fmla v24.4s, v26.4s, v9.4s\n"
+    "ldr q13, [%[wbptr], #48]\n"
+    "str q7, [%[outptr0]]\n"
+    "fmla v25.4s, v27.4s, v8.4s\n"
+    "str q6, [%[outptr0], %[output_col_stride1]]\n"
+    "fmax v5.4s, v5.4s, v29.4s\n"
+    "fmla v24.4s, v27.4s, v11.4s\n"
+    "ldr q12, [%[wbptr], #80]\n"
+    "str q5, [%[outptr0], x19]\n"
+    "fmax v4.4s, v4.4s, v29.4s\n"
+    "fmax v3.4s, v3.4s, v29.4s\n"
+    "ldr q10, [%[wbptr], #112]\n"
+    "str q4, [%[outptr0], x20]\n"
+    "fmla v24.4s, v28.4s, v8.4s\n"
+    "str q3, [x16]\n"
+    "fmax v2.4s, v2.4s, v29.4s\n"
+    "fmax v1.4s, v1.4s, v29.4s\n"
+    "ldr q11, [%[wbptr], #96]\n"
+    "str q2, [x16, %[output_col_stride1]]\n"
+    "fmax v22.4s, v22.4s, v29.4s\n"
+    "str q1, [x16, x19]\n"
+    "fmax v15.4s, v15.4s, v29.4s\n"
+    "str q22, [x16, x20]\n"
+    "fmax v16.4s, v16.4s, v29.4s\n"
+    "str q15, [x17]\n"
+    "fmax v19.4s, v19.4s, v29.4s\n"
+    "str q16, [x17, %[output_col_stride1]]\n"
+    "fmax v25.4s, v25.4s, v29.4s\n"
+    "str q19, [x17, x19]\n"
+    "fmax v0.4s, v0.4s, v29.4s\n"
+    "str q25, [x17, x20]\n"
+    "fmax v18.4s, v18.4s, v29.4s\n"
+    "str q0, [x18]\n"
+    "fmax v23.4s, v23.4s, v29.4s\n"
+    "str q18, [x18, %[output_col_stride1]]\n"
+    "fmax v24.4s, v24.4s, v29.4s\n"
+    "str q23, [x18, x19]\n"
+    "mov v7.16b, v21.16b\n"
+    "str q24, [x18, x20]\n"
+    "mov v3.16b, v21.16b\n"
+    "mov v6.16b, v21.16b\n"
+    "ldr q9, [%[wbptr], #128]\n"
+    "mov v15.16b, v21.16b\n"
+    "ldr q8, [%[wbptr], #144]\n"
+    "mov v2.16b, v21.16b\n"
+    "ldr q22, [%[inptr0]]\n"
+    "mov v5.16b, v21.16b\n"
+    "ldr q19, [x9]\n"
+    "mov v0.16b, v21.16b\n"
+    "ldr q23, [%[inptr0], %[input_col_stride1]]\n"
+    "mov v16.16b, v21.16b\n"
+    "ldr q18, [x24]\n"
+    "mov v1.16b, v21.16b\n"
+    "ldr q27, [x9, %[input_col_stride1]]\n"
+    "mov v4.16b, v21.16b\n"
+    "ldr q28, [%[inptr0], x28]\n"
+    "fmla v7.4s, v22.4s, v20.4s\n"
+    "ldr q25, [x26]\n"
+    "fmla v3.4s, v19.4s, v20.4s\n"
+    "ldr q22, [x24, %[input_col_stride1]]\n"
+    "fmla v6.4s, v23.4s, v20.4s\n"
+    "add %[outptr0], %[outptr0], #16\n"
+    "fmla v7.4s, v19.4s, v17.4s\n"
+    "add x16, x16, #16\n"
+    "fmla v3.4s, v18.4s, v17.4s\n"
+    "add x17, x17, #16\n"
+    "fmla v15.4s, v18.4s, v20.4s\n"
+    "add x18, x18, #16\n"
+    "fmla v7.4s, v23.4s, v14.4s\n"
+    "fmla v3.4s, v27.4s, v14.4s\n"
+    "fmla v7.4s, v18.4s, v10.4s\n"
+    "fmla v7.4s, v27.4s, v12.4s\n"
+    "bne 2b\n"
+    "3:\n"
+    "mov v18.16b, v21.16b\n"
+    "ldr q23, [x9, x28]\n"
+    "mov v19.16b, v21.16b\n"
+    "prfm pldl1keep, [x9, x25]\n"
+    "fmla v6.4s, v27.4s, v17.4s\n"
+    "prfm pldl1keep, [%[inptr0], x11]\n"
+    "fmla v2.4s, v27.4s, v20.4s\n"
+    "ldr q24, [%[inptr0], x23]\n"
+    "fmla v7.4s, v28.4s, v13.4s\n"
+    "prfm pldl1keep, [x10, #64]\n"
+    "fmla v6.4s, v28.4s, v14.4s\n"
+    "prfm pldl1keep, [x26, x8]\n"
+    "fmla v5.4s, v28.4s, v20.4s\n"
+    "ldr q26, [x10]\n"
+    "fmla v3.4s, v25.4s, v10.4s\n"
+    "prfm pldl1keep, [x24, x25]\n"
+    "fmla v15.4s, v25.4s, v17.4s\n"
+    "prfm pldl1keep, [x9, x11]\n"
+    "fmla v0.4s, v25.4s, v20.4s\n"
+    "ldr q25, [x26, %[input_col_stride1]]\n"
+    "fmla v7.4s, v22.4s, v9.4s\n"
+    "prfm pldl1keep, [%[inptr0], x13]\n"
+    "fmla v3.4s, v22.4s, v12.4s\n"
+    "prfm pldl1keep, [x27, #64]\n"
+    "fmla v6.4s, v22.4s, v10.4s\n"
+    "prfm pldl1keep, [x10, x8]\n"
+    "fmla v15.4s, v22.4s, v14.4s\n"
+    "prfm pldl1keep, [x26, x25]\n"
+    "fmla v2.4s, v22.4s, v17.4s\n"
+    "prfm pldl1keep, [x24, x11]\n"
+    "fmla v16.4s, v22.4s, v20.4s\n"
+    "ldr q22, [x24, x28]\n"
+    "fmla v7.4s, v23.4s, v11.4s\n"
+    "prfm pldl1keep, [x9, x13]\n"
+    "fmla v3.4s, v23.4s, v13.4s\n"
+    "prfm pldl1keep, [%[inptr0], x15]\n"
+    "fmla v6.4s, v23.4s, v12.4s\n"
+    "prfm pldl1keep, [x27, x8]\n"
+    "fmla v2.4s, v23.4s, v14.4s\n"
+    "prfm pldl1keep, [x10, x25]\n"
+    "fmla v5.4s, v23.4s, v17.4s\n"
+    "prfm pldl1keep, [x26, x11]\n"
+    "fmla v1.4s, v23.4s, v20.4s\n"
+    "ldr q23, [x9, x23]\n"
+    "fmla v6.4s, v24.4s, v13.4s\n"
+    "prfm pldl1keep, [x24, x13]\n"
+    "fmla v5.4s, v24.4s, v14.4s\n"
+    "prfm pldl1keep, [x9, x15]\n"
+    "fmla v4.4s, v24.4s, v20.4s\n"
+    "ldr q24, [%[inptr0], x12]\n"
+    "fmla v15.4s, v26.4s, v10.4s\n"
+    "prfm pldl1keep, [x27, x25]\n"
+    "fmla v0.4s, v26.4s, v17.4s\n"
+    "ldr q29, [x27]\n"
+    "fmla v3.4s, v25.4s, v9.4s\n"
+    "prfm pldl1keep, [x10, x11]\n"
+    "fmla v15.4s, v25.4s, v12.4s\n"
+    "prfm pldl1keep, [x26, x13]\n"
+    "fmla v2.4s, v25.4s, v10.4s\n"
+    "prfm pldl1keep, [x24, x15]\n"
+    "fmla v0.4s, v25.4s, v14.4s\n"
+    "prfm pldl1keep, [x27, x11]\n"
+    "fmla v16.4s, v25.4s, v17.4s\n"
+    "prfm pldl1keep, [x10, x13]\n"
+    "fmla v18.4s, v25.4s, v20.4s\n"
+    "ldr q26, [x10, %[input_col_stride1]]\n"
+    "fmla v7.4s, v22.4s, v8.4s\n"
+    "prfm pldl1keep, [x26, x15]\n"
+    "fmla v3.4s, v22.4s, v11.4s\n"
+    "prfm pldl1keep, [x27, x13]\n"
+    "fmla v6.4s, v22.4s, v9.4s\n"
+    "prfm pldl1keep, [x10, x15]\n"
+    "fmla v15.4s, v22.4s, v13.4s\n"
+    "prfm pldl1keep, [x27, x15]\n"
+    "fmla v2.4s, v22.4s, v12.4s\n"
+    "add %[wbptr], %[wbptr], #160\n"
+    "fmla v5.4s, v22.4s, v10.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v16.4s, v22.4s, v14.4s\n"
+    "fmla v1.4s, v22.4s, v17.4s\n"
+    "fmla v19.4s, v22.4s, v20.4s\n"
+    "ldr q27, [x26, x28]\n"
+    "fmla v6.4s, v23.4s, v11.4s\n"
+    "fmla v2.4s, v23.4s, v13.4s\n"
+    "fmla v5.4s, v23.4s, v12.4s\n"
+    "fmla v1.4s, v23.4s, v14.4s\n"
+    "fmla v4.4s, v23.4s, v17.4s\n"
+    "fmla v0.4s, v29.4s, v10.4s\n"
+    "mov v22.16b, v21.16b\n"
+    "fmla v15.4s, v26.4s, v9.4s\n"
+    "fmla v5.4s, v24.4s, v13.4s\n"
+    "fmla v16.4s, v26.4s, v10.4s\n"
+    "fmla v22.4s, v23.4s, v20.4s\n"
+    "ldr q29, [x24, x23]\n"
+    "fmla v4.4s, v24.4s, v14.4s\n"
+    "ldr q28, [x9, x12]\n"
+    "fmla v0.4s, v26.4s, v12.4s\n"
+    "fmla v18.4s, v26.4s, v17.4s\n"
+    "mov v23.16b, v21.16b\n"
+    "fmla v3.4s, v27.4s, v8.4s\n"
+    "fmla v15.4s, v27.4s, v11.4s\n"
+    "fmla v2.4s, v27.4s, v9.4s\n"
+    "fmla v0.4s, v27.4s, v13.4s\n"
+    "fmla v16.4s, v27.4s, v12.4s\n"
+    "fmla v1.4s, v27.4s, v10.4s\n"
+    "fmla v18.4s, v27.4s, v14.4s\n"
+    "fmla v19.4s, v27.4s, v17.4s\n"
+    "fmla v23.4s, v27.4s, v20.4s\n"
+    "mov v25.16b, v21.16b\n"
+    "mov v24.16b, v21.16b\n"
+    "fmla v6.4s, v29.4s, v8.4s\n"
+    "fmla v2.4s, v29.4s, v11.4s\n"
+    "fmla v5.4s, v29.4s, v9.4s\n"
+    "fmla v16.4s, v29.4s, v13.4s\n"
+    "fmla v1.4s, v29.4s, v12.4s\n"
+    "fmla v4.4s, v29.4s, v10.4s\n"
+    "fmla v19.4s, v29.4s, v14.4s\n"
+    "fmla v22.4s, v29.4s, v17.4s\n"
+    "fmla v25.4s, v29.4s, v20.4s\n"
+    "ldr q21, [%[inptr0], x14]\n"
+    "fmla v5.4s, v28.4s, v11.4s\n"
+    "add %[inptr0], %[inptr0], #16\n"
+    "fmla v1.4s, v28.4s, v13.4s\n"
+    "fmla v4.4s, v28.4s, v12.4s\n"
+    "fmla v22.4s, v28.4s, v14.4s\n"
+    "ldr q26, [x27, %[input_col_stride1]]\n"
+    "fmla v0.4s, v26.4s, v9.4s\n"
+    "fmla v18.4s, v26.4s, v10.4s\n"
+    "fmla v4.4s, v21.4s, v13.4s\n"
+    "ldr q21, [x10, x28]\n"
+    "fmla v15.4s, v21.4s, v8.4s\n"
+    "ldr q29, [x26, x23]\n"
+    "fmla v0.4s, v21.4s, v11.4s\n"
+    "fmla v16.4s, v21.4s, v9.4s\n"
+    "fmla v18.4s, v21.4s, v12.4s\n"
+    "fmla v19.4s, v21.4s, v10.4s\n"
+    "fmla v23.4s, v21.4s, v17.4s\n"
+    "ldr q21, [x24, x12]\n"
+    "fmla v2.4s, v29.4s, v8.4s\n"
+    "fmla v16.4s, v29.4s, v11.4s\n"
+    "fmla v1.4s, v29.4s, v9.4s\n"
+    "fmla v18.4s, v29.4s, v13.4s\n"
+    "fmla v19.4s, v29.4s, v12.4s\n"
+    "fmla v22.4s, v29.4s, v10.4s\n"
+    "fmla v23.4s, v29.4s, v14.4s\n"
+    "fmla v25.4s, v29.4s, v17.4s\n"
+    "fmla v24.4s, v29.4s, v20.4s\n"
+    "ldr q28, [x9, x14]\n"
+    "fmla v5.4s, v21.4s, v8.4s\n"
+    "ldr q27, [x27, x28]\n"
+    "fmla v1.4s, v21.4s, v11.4s\n"
+    "add x9, x9, #16\n"
+    "fmla v4.4s, v21.4s, v9.4s\n"
+    "fmla v19.4s, v21.4s, v13.4s\n"
+    "fmla v22.4s, v21.4s, v12.4s\n"
+    "fmla v25.4s, v21.4s, v14.4s\n"
+    "fmla v0.4s, v27.4s, v8.4s\n"
+    "ldr q20, [x10, x23]\n"
+    "fmla v4.4s, v28.4s, v11.4s\n"
+    "fmla v18.4s, v27.4s, v9.4s\n"
+    "fmla v22.4s, v28.4s, v13.4s\n"
+    "ldr q26, [x26, x12]\n"
+    "fmla v23.4s, v27.4s, v10.4s\n"
+    "ldr q21, [x24, x14]\n"
+    "fmla v16.4s, v20.4s, v8.4s\n"
+    "add x24, x24, #16\n"
+    "fmla v18.4s, v20.4s, v11.4s\n"
+    "fmla v19.4s, v20.4s, v9.4s\n"
+    "fmla v23.4s, v20.4s, v12.4s\n"
+    "fmla v25.4s, v20.4s, v10.4s\n"
+    "fmla v24.4s, v20.4s, v17.4s\n"
+    "ldr q28, [x27, x23]\n"
+    "fmla v1.4s, v26.4s, v8.4s\n"
+    "ldr q20, [x10, x12]\n"
+    "fmla v19.4s, v26.4s, v11.4s\n"
+    "fmla v22.4s, v26.4s, v9.4s\n"
+    "fmla v23.4s, v26.4s, v13.4s\n"
+    "fmla v25.4s, v26.4s, v12.4s\n"
+    "fmla v24.4s, v26.4s, v14.4s\n"
+    "ldr q17, [x26, x14]\n"
+    "fmla v4.4s, v21.4s, v8.4s\n"
+    "ldr q26, [x27, x12]\n"
+    "fmla v22.4s, v21.4s, v11.4s\n"
+    "add x26, x26, #16\n"
+    "fmla v25.4s, v21.4s, v13.4s\n"
+    "ldr q27, [x10, x14]\n"
+    "fmla v18.4s, v28.4s, v8.4s\n"
+    "add x10, x10, #16\n"
+    "fmla v23.4s, v28.4s, v9.4s\n"
+    "fmla v24.4s, v28.4s, v10.4s\n"
+    "fmla v19.4s, v20.4s, v8.4s\n"
+    "ldr q28, [x27, x14]\n"
+    "fmla v25.4s, v20.4s, v9.4s\n"
+    "add x27, x27, #16\n"
+    "fmla v23.4s, v20.4s, v11.4s\n"
+    "fmla v24.4s, v20.4s, v12.4s\n"
+    "fmla v22.4s, v17.4s, v8.4s\n"
+    "movi v29.16b, #0\n"
+    "fmla v25.4s, v17.4s, v11.4s\n"
+    "fmla v24.4s, v17.4s, v13.4s\n"
+    "fmla v23.4s, v26.4s, v8.4s\n"
+    "fmax v7.4s, v7.4s, v29.4s\n"
+    "fmla v25.4s, v27.4s, v8.4s\n"
+    "fmax v6.4s, v6.4s, v29.4s\n"
+    "str q7, [%[outptr0]]\n"
+    "fmla v24.4s, v26.4s, v9.4s\n"
+    "str q6, [%[outptr0], %[output_col_stride1]]\n"
+    "fmax v5.4s, v5.4s, v29.4s\n"
+    "fmax v4.4s, v4.4s, v29.4s\n"
+    "fmax v3.4s, v3.4s, v29.4s\n"
+    "str q5, [%[outptr0], x19]\n"
+    "fmla v24.4s, v27.4s, v11.4s\n"
+    "str q4, [%[outptr0], x20]\n"
+    "fmax v2.4s, v2.4s, v29.4s\n"
+    "str q3, [x16]\n"
+    "fmax v1.4s, v1.4s, v29.4s\n"
+    "str q2, [x16, %[output_col_stride1]]\n"
+    "fmla v24.4s, v28.4s, v8.4s\n"
+    "str q1, [x16, x19]\n"
+    "fmax v22.4s, v22.4s, v29.4s\n"
+    "fmax v15.4s, v15.4s, v29.4s\n"
+    "add %[outptr0], %[outptr0], #16\n"
+    "str q22, [x16, x20]\n"
+    "fmax v16.4s, v16.4s, v29.4s\n"
+    "str q15, [x17]\n"
+    "fmax v19.4s, v19.4s, v29.4s\n"
+    "str q16, [x17, %[output_col_stride1]]\n"
+    "fmax v25.4s, v25.4s, v29.4s\n"
+    "str q19, [x17, x19]\n"
+    "fmax v0.4s, v0.4s, v29.4s\n"
+    "str q25, [x17, x20]\n"
+    "fmax v18.4s, v18.4s, v29.4s\n"
+    "str q0, [x18]\n"
+    "fmax v23.4s, v23.4s, v29.4s\n"
+    "str q18, [x18, %[output_col_stride1]]\n"
+    "fmax v24.4s, v24.4s, v29.4s\n"
+    "str q23, [x18, x19]\n"
+    "add x16, x16, #16\n"
+    "str q24, [x18, x20]\n"
+    "add x17, x17, #16\n"
+    "add x18, x18, #16\n"
+    "4:\n"
+    "cbz x21, 7f\n"
+    "ldr s21, [%[wbptr]]\n"
+    "mov v7.16b, v21.16b\n"
+    "ldr s20, [%[wbptr], #4]\n"
+    "mov v3.16b, v21.16b\n"
+    "ldr s14, [%[wbptr], #8]\n"
+    "mov v6.16b, v21.16b\n"
+    "ldr s13, [%[wbptr], #12]\n"
+    "mov v15.16b, v21.16b\n"
+    "ldr s17, [%[wbptr], #16]\n"
+    "mov v2.16b, v21.16b\n"
+    "ldr s12, [%[wbptr], #20]\n"
+    "mov v5.16b, v21.16b\n"
+    "ldr s11, [%[wbptr], #24]\n"
+    "mov v0.16b, v21.16b\n"
+    "ldr s10, [%[wbptr], #28]\n"
+    "mov v16.16b, v21.16b\n"
+    "ldr s9, [%[wbptr], #32]\n"
+    "mov v1.16b, v21.16b\n"
+    "ldr s8, [%[wbptr], #36]\n"
+    "mov v4.16b, v21.16b\n"
+    "ldr s22, [%[inptr0]]\n"
+    "fmla v7.4s, v22.4s, v20.4s\n"
+    "ldr s19, [x9]\n"
+    "fmla v3.4s, v19.4s, v20.4s\n"
+    "ldr s23, [%[inptr0], %[input_col_stride1]]\n"
+    "fmla v6.4s, v23.4s, v20.4s\n"
+    "ldr s18, [x24]\n"
+    "fmla v7.4s, v19.4s, v17.4s\n"
+    "ldr s27, [x9, %[input_col_stride1]]\n"
+    "fmla v3.4s, v18.4s, v17.4s\n"
+    "ldr s28, [%[inptr0], x28]\n"
+    "fmla v15.4s, v18.4s, v20.4s\n"
+    "ldr s25, [x26]\n"
+    "fmla v7.4s, v23.4s, v14.4s\n"
+    "ldr s22, [x24, %[input_col_stride1]]\n"
+    "fmla v3.4s, v27.4s, v14.4s\n"
+    "prfm pldl1keep, [%[inptr0], #64]\n"
+    "prfm pldl1keep, [x9, #64]\n"
+    "subs x21, x21, #1\n"
+    "prfm pldl1keep, [%[inptr0], x8]\n"
+    "prfm pldl1keep, [x24, #64]\n"
+    "fmla v7.4s, v18.4s, v10.4s\n"
+    "prfm pldl1keep, [x9, x8]\n"
+    "prfm pldl1keep, [%[inptr0], x25]\n"
+    "prfm pldl1keep, [x26, #64]\n"
+    "prfm pldl1keep, [x24, x8]\n"
+    "fmla v7.4s, v27.4s, v12.4s\n"
+    "beq 6f\n"
+    "5:\n"
+    "mov v18.16b, v21.16b\n"
+    "ldr s23, [x9, x28]\n"
+    "mov v19.16b, v21.16b\n"
+    "prfm pldl1keep, [x9, x25]\n"
+    "fmla v6.4s, v27.4s, v17.4s\n"
+    "prfm pldl1keep, [%[inptr0], x11]\n"
+    "fmla v2.4s, v27.4s, v20.4s\n"
+    "ldr s24, [%[inptr0], x23]\n"
+    "fmla v7.4s, v28.4s, v13.4s\n"
+    "prfm pldl1keep, [x10, #64]\n"
+    "fmla v6.4s, v28.4s, v14.4s\n"
+    "prfm pldl1keep, [x26, x8]\n"
+    "fmla v5.4s, v28.4s, v20.4s\n"
+    "ldr s26, [x10]\n"
+    "fmla v3.4s, v25.4s, v10.4s\n"
+    "prfm pldl1keep, [x24, x25]\n"
+    "fmla v15.4s, v25.4s, v17.4s\n"
+    "prfm pldl1keep, [x9, x11]\n"
+    "fmla v0.4s, v25.4s, v20.4s\n"
+    "ldr s25, [x26, %[input_col_stride1]]\n"
+    "fmla v7.4s, v22.4s, v9.4s\n"
+    "prfm pldl1keep, [%[inptr0], x13]\n"
+    "fmla v3.4s, v22.4s, v12.4s\n"
+    "prfm pldl1keep, [x27, #64]\n"
+    "fmla v6.4s, v22.4s, v10.4s\n"
+    "prfm pldl1keep, [x10, x8]\n"
+    "fmla v15.4s, v22.4s, v14.4s\n"
+    "prfm pldl1keep, [x26, x25]\n"
+    "fmla v2.4s, v22.4s, v17.4s\n"
+    "prfm pldl1keep, [x24, x11]\n"
+    "fmla v16.4s, v22.4s, v20.4s\n"
+    "ldr s22, [x24, x28]\n"
+    "fmla v7.4s, v23.4s, v11.4s\n"
+    "prfm pldl1keep, [x9, x13]\n"
+    "fmla v3.4s, v23.4s, v13.4s\n"
+    "prfm pldl1keep, [%[inptr0], x15]\n"
+    "fmla v6.4s, v23.4s, v12.4s\n"
+    "prfm pldl1keep, [x27, x8]\n"
+    "fmla v2.4s, v23.4s, v14.4s\n"
+    "prfm pldl1keep, [x10, x25]\n"
+    "fmla v5.4s, v23.4s, v17.4s\n"
+    "prfm pldl1keep, [x26, x11]\n"
+    "fmla v1.4s, v23.4s, v20.4s\n"
+    "ldr s23, [x9, x23]\n"
+    "fmla v6.4s, v24.4s, v13.4s\n"
+    "prfm pldl1keep, [x24, x13]\n"
+    "fmla v5.4s, v24.4s, v14.4s\n"
+    "prfm pldl1keep, [x9, x15]\n"
+    "fmla v4.4s, v24.4s, v20.4s\n"
+    "ldr s24, [%[inptr0], x12]\n"
+    "fmla v15.4s, v26.4s, v10.4s\n"
+    "prfm pldl1keep, [x27, x25]\n"
+    "fmla v0.4s, v26.4s, v17.4s\n"
+    "ldr s29, [x27]\n"
+    "fmla v3.4s, v25.4s, v9.4s\n"
+    "prfm pldl1keep, [x10, x11]\n"
+    "fmla v15.4s, v25.4s, v12.4s\n"
+    "prfm pldl1keep, [x26, x13]\n"
+    "fmla v2.4s, v25.4s, v10.4s\n"
+    "prfm pldl1keep, [x24, x15]\n"
+    "fmla v0.4s, v25.4s, v14.4s\n"
+    "prfm pldl1keep, [x27, x11]\n"
+    "fmla v16.4s, v25.4s, v17.4s\n"
+    "prfm pldl1keep, [x10, x13]\n"
+    "fmla v18.4s, v25.4s, v20.4s\n"
+    "ldr s26, [x10, %[input_col_stride1]]\n"
+    "fmla v7.4s, v22.4s, v8.4s\n"
+    "prfm pldl1keep, [x26, x15]\n"
+    "fmla v3.4s, v22.4s, v11.4s\n"
+    "prfm pldl1keep, [x27, x13]\n"
+    "fmla v6.4s, v22.4s, v9.4s\n"
+    "prfm pldl1keep, [x10, x15]\n"
+    "fmla v15.4s, v22.4s, v13.4s\n"
+    "prfm pldl1keep, [x27, x15]\n"
+    "fmla v2.4s, v22.4s, v12.4s\n"
+    "add %[wbptr], %[wbptr], #40\n"
+    "fmla v5.4s, v22.4s, v10.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v16.4s, v22.4s, v14.4s\n"
+    "subs x21, x21, #1\n"
+    "fmla v1.4s, v22.4s, v17.4s\n"
+    "fmla v19.4s, v22.4s, v20.4s\n"
+    "mov v22.16b, v21.16b\n"
+    "fmla v6.4s, v23.4s, v11.4s\n"
+    "fmla v2.4s, v23.4s, v13.4s\n"
+    "fmla v5.4s, v23.4s, v12.4s\n"
+    "fmla v1.4s, v23.4s, v14.4s\n"
+    "fmla v4.4s, v23.4s, v17.4s\n"
+    "fmla v22.4s, v23.4s, v20.4s\n"
+    "ldr s27, [x26, x28]\n"
+    "fmla v5.4s, v24.4s, v13.4s\n"
+    "fmla v0.4s, v29.4s, v10.4s\n"
+    "mov v23.16b, v21.16b\n"
+    "fmla v4.4s, v24.4s, v14.4s\n"
+    "mov v25.16b, v21.16b\n"
+    "mov v24.16b, v21.16b\n"
+    "fmla v15.4s, v26.4s, v9.4s\n"
+    "fmla v0.4s, v26.4s, v12.4s\n"
+    "fmla v16.4s, v26.4s, v10.4s\n"
+    "fmla v18.4s, v26.4s, v17.4s\n"
+    "fmla v3.4s, v27.4s, v8.4s\n"
+    "ldr s29, [x24, x23]\n"
+    "fmla v15.4s, v27.4s, v11.4s\n"
+    "fmla v2.4s, v27.4s, v9.4s\n"
+    "fmla v0.4s, v27.4s, v13.4s\n"
+    "fmla v16.4s, v27.4s, v12.4s\n"
+    "fmla v1.4s, v27.4s, v10.4s\n"
+    "fmla v18.4s, v27.4s, v14.4s\n"
+    "fmla v19.4s, v27.4s, v17.4s\n"
+    "fmla v23.4s, v27.4s, v20.4s\n"
+    "fmla v6.4s, v29.4s, v8.4s\n"
+    "ldr s28, [x9, x12]\n"
+    "fmla v2.4s, v29.4s, v11.4s\n"
+    "fmla v5.4s, v29.4s, v9.4s\n"
+    "fmla v16.4s, v29.4s, v13.4s\n"
+    "fmla v1.4s, v29.4s, v12.4s\n"
+    "fmla v4.4s, v29.4s, v10.4s\n"
+    "fmla v19.4s, v29.4s, v14.4s\n"
+    "fmla v22.4s, v29.4s, v17.4s\n"
+    "fmla v25.4s, v29.4s, v20.4s\n"
+    "fmla v5.4s, v28.4s, v11.4s\n"
+    "ldr s21, [%[inptr0], x14]\n"
+    "fmla v1.4s, v28.4s, v13.4s\n"
+    "add %[inptr0], %[inptr0], #4\n"
+    "fmla v4.4s, v28.4s, v12.4s\n"
+    "prfm pldl1keep, [%[inptr0], #64]\n"
+    "fmla v22.4s, v28.4s, v14.4s\n"
+    "ldr s26, [x27, %[input_col_stride1]]\n"
+    "fmla v0.4s, v26.4s, v9.4s\n"
+    "prfm pldl1keep, [%[inptr0], x8]\n"
+    "fmla v4.4s, v21.4s, v13.4s\n"
+    "ldr s21, [x10, x28]\n"
+    "fmla v18.4s, v26.4s, v10.4s\n"
+    "ldr s29, [x26, x23]\n"
+    "fmla v15.4s, v21.4s, v8.4s\n"
+    "prfm pldl1keep, [%[inptr0], x25]\n"
+    "fmla v0.4s, v21.4s, v11.4s\n"
+    "fmla v16.4s, v21.4s, v9.4s\n"
+    "fmla v18.4s, v21.4s, v12.4s\n"
+    "fmla v19.4s, v21.4s, v10.4s\n"
+    "fmla v23.4s, v21.4s, v17.4s\n"
+    "ldr s21, [x24, x12]\n"
+    "fmla v2.4s, v29.4s, v8.4s\n"
+    "fmla v16.4s, v29.4s, v11.4s\n"
+    "fmla v1.4s, v29.4s, v9.4s\n"
+    "fmla v18.4s, v29.4s, v13.4s\n"
+    "fmla v19.4s, v29.4s, v12.4s\n"
+    "fmla v22.4s, v29.4s, v10.4s\n"
+    "fmla v23.4s, v29.4s, v14.4s\n"
+    "fmla v25.4s, v29.4s, v17.4s\n"
+    "fmla v24.4s, v29.4s, v20.4s\n"
+    "ldr s28, [x9, x14]\n"
+    "fmla v5.4s, v21.4s, v8.4s\n"
+    "ldr s27, [x27, x28]\n"
+    "fmla v1.4s, v21.4s, v11.4s\n"
+    "add x9, x9, #4\n"
+    "fmla v4.4s, v21.4s, v9.4s\n"
+    "prfm pldl1keep, [x9, #64]\n"
+    "fmla v19.4s, v21.4s, v13.4s\n"
+    "prfm pldl1keep, [x9, x8]\n"
+    "fmla v22.4s, v21.4s, v12.4s\n"
+    "fmla v25.4s, v21.4s, v14.4s\n"
+    "fmla v4.4s, v28.4s, v11.4s\n"
+    "ldr s20, [x10, x23]\n"
+    "fmla v0.4s, v27.4s, v8.4s\n"
+    "fmla v18.4s, v27.4s, v9.4s\n"
+    "fmla v22.4s, v28.4s, v13.4s\n"
+    "ldr s26, [x26, x12]\n"
+    "fmla v23.4s, v27.4s, v10.4s\n"
+    "ldr s21, [x24, x14]\n"
+    "fmla v16.4s, v20.4s, v8.4s\n"
+    "add x24, x24, #4\n"
+    "fmla v18.4s, v20.4s, v11.4s\n"
+    "prfm pldl1keep, [x24, #64]\n"
+    "fmla v19.4s, v20.4s, v9.4s\n"
+    "prfm pldl1keep, [x24, x8]\n"
+    "fmla v23.4s, v20.4s, v12.4s\n"
+    "fmla v25.4s, v20.4s, v10.4s\n"
+    "fmla v24.4s, v20.4s, v17.4s\n"
+    "ldr s28, [x27, x23]\n"
+    "fmla v1.4s, v26.4s, v8.4s\n"
+    "ldr s20, [x10, x12]\n"
+    "fmla v19.4s, v26.4s, v11.4s\n"
+    "fmla v22.4s, v26.4s, v9.4s\n"
+    "fmla v23.4s, v26.4s, v13.4s\n"
+    "fmla v25.4s, v26.4s, v12.4s\n"
+    "fmla v24.4s, v26.4s, v14.4s\n"
+    "ldr s17, [x26, x14]\n"
+    "fmla v4.4s, v21.4s, v8.4s\n"
+    "ldr s26, [x27, x12]\n"
+    "fmla v22.4s, v21.4s, v11.4s\n"
+    "add x26, x26, #4\n"
+    "fmla v25.4s, v21.4s, v13.4s\n"
+    "ldr s27, [x10, x14]\n"
+    "fmla v18.4s, v28.4s, v8.4s\n"
+    "prfm pldl1keep, [x26, #64]\n"
+    "fmla v23.4s, v28.4s, v9.4s\n"
+    "add x10, x10, #4\n"
+    "fmla v24.4s, v28.4s, v10.4s\n"
+    "ldr s28, [x27, x14]\n"
+    "fmla v19.4s, v20.4s, v8.4s\n"
+    "ldr s21, [%[wbptr]]\n"
+    "fmla v23.4s, v20.4s, v11.4s\n"
+    "add x27, x27, #4\n"
+    "fmla v25.4s, v20.4s, v9.4s\n"
+    "fmla v24.4s, v20.4s, v12.4s\n"
+    "fmla v22.4s, v17.4s, v8.4s\n"
+    "ldr s20, [%[wbptr], #4]\n"
+    "fmla v23.4s, v26.4s, v8.4s\n"
+    "ldr s14, [%[wbptr], #8]\n"
+    "fmla v24.4s, v17.4s, v13.4s\n"
+    "movi v29.16b, #0\n"
+    "fmla v25.4s, v17.4s, v11.4s\n"
+    "ldr s17, [%[wbptr], #16]\n"
+    "fmax v7.4s, v7.4s, v29.4s\n"
+    "fmax v6.4s, v6.4s, v29.4s\n"
+    "fmla v24.4s, v26.4s, v9.4s\n"
+    "ldr s13, [%[wbptr], #12]\n"
+    "str s7, [%[outptr0]]\n"
+    "fmla v25.4s, v27.4s, v8.4s\n"
+    "str s6, [%[outptr0], %[output_col_stride1]]\n"
+    "fmax v5.4s, v5.4s, v29.4s\n"
+    "fmla v24.4s, v27.4s, v11.4s\n"
+    "ldr s12, [%[wbptr], #20]\n"
+    "str s5, [%[outptr0], x19]\n"
+    "fmax v4.4s, v4.4s, v29.4s\n"
+    "fmax v3.4s, v3.4s, v29.4s\n"
+    "ldr s10, [%[wbptr], #28]\n"
+    "str s4, [%[outptr0], x20]\n"
+    "fmla v24.4s, v28.4s, v8.4s\n"
+    "str s3, [x16]\n"
+    "fmax v2.4s, v2.4s, v29.4s\n"
+    "fmax v1.4s, v1.4s, v29.4s\n"
+    "ldr s11, [%[wbptr], #24]\n"
+    "str s2, [x16, %[output_col_stride1]]\n"
+    "fmax v22.4s, v22.4s, v29.4s\n"
+    "str s1, [x16, x19]\n"
+    "fmax v15.4s, v15.4s, v29.4s\n"
+    "str s22, [x16, x20]\n"
+    "fmax v16.4s, v16.4s, v29.4s\n"
+    "str s15, [x17]\n"
+    "fmax v19.4s, v19.4s, v29.4s\n"
+    "str s16, [x17, %[output_col_stride1]]\n"
+    "fmax v25.4s, v25.4s, v29.4s\n"
+    "str s19, [x17, x19]\n"
+    "fmax v0.4s, v0.4s, v29.4s\n"
+    "str s25, [x17, x20]\n"
+    "fmax v18.4s, v18.4s, v29.4s\n"
+    "str s0, [x18]\n"
+    "fmax v23.4s, v23.4s, v29.4s\n"
+    "str s18, [x18, %[output_col_stride1]]\n"
+    "fmax v24.4s, v24.4s, v29.4s\n"
+    "str s23, [x18, x19]\n"
+    "mov v7.16b, v21.16b\n"
+    "str s24, [x18, x20]\n"
+    "mov v3.16b, v21.16b\n"
+    "mov v6.16b, v21.16b\n"
+    "ldr s9, [%[wbptr], #32]\n"
+    "mov v15.16b, v21.16b\n"
+    "ldr s8, [%[wbptr], #36]\n"
+    "mov v2.16b, v21.16b\n"
+    "ldr s22, [%[inptr0]]\n"
+    "mov v5.16b, v21.16b\n"
+    "ldr s19, [x9]\n"
+    "mov v0.16b, v21.16b\n"
+    "ldr s23, [%[inptr0], %[input_col_stride1]]\n"
+    "mov v16.16b, v21.16b\n"
+    "ldr s18, [x24]\n"
+    "mov v1.16b, v21.16b\n"
+    "ldr s27, [x9, %[input_col_stride1]]\n"
+    "mov v4.16b, v21.16b\n"
+    "ldr s28, [%[inptr0], x28]\n"
+    "fmla v7.4s, v22.4s, v20.4s\n"
+    "ldr s25, [x26]\n"
+    "fmla v3.4s, v19.4s, v20.4s\n"
+    "ldr s22, [x24, %[input_col_stride1]]\n"
+    "fmla v6.4s, v23.4s, v20.4s\n"
+    "add %[outptr0], %[outptr0], #4\n"
+    "fmla v7.4s, v19.4s, v17.4s\n"
+    "add x16, x16, #4\n"
+    "fmla v3.4s, v18.4s, v17.4s\n"
+    "add x17, x17, #4\n"
+    "fmla v15.4s, v18.4s, v20.4s\n"
+    "add x18, x18, #4\n"
+    "fmla v7.4s, v23.4s, v14.4s\n"
+    "fmla v3.4s, v27.4s, v14.4s\n"
+    "fmla v7.4s, v18.4s, v10.4s\n"
+    "fmla v7.4s, v27.4s, v12.4s\n"
+    "bne 5b\n"
+    "6:\n"
+    "mov v18.16b, v21.16b\n"
+    "ldr s23, [x9, x28]\n"
+    "mov v19.16b, v21.16b\n"
+    "prfm pldl1keep, [x9, x25]\n"
+    "fmla v6.4s, v27.4s, v17.4s\n"
+    "prfm pldl1keep, [%[inptr0], x11]\n"
+    "fmla v2.4s, v27.4s, v20.4s\n"
+    "ldr s24, [%[inptr0], x23]\n"
+    "fmla v7.4s, v28.4s, v13.4s\n"
+    "prfm pldl1keep, [x10, #64]\n"
+    "fmla v6.4s, v28.4s, v14.4s\n"
+    "prfm pldl1keep, [x26, x8]\n"
+    "fmla v5.4s, v28.4s, v20.4s\n"
+    "ldr s26, [x10]\n"
+    "fmla v3.4s, v25.4s, v10.4s\n"
+    "prfm pldl1keep, [x24, x25]\n"
+    "fmla v15.4s, v25.4s, v17.4s\n"
+    "prfm pldl1keep, [x9, x11]\n"
+    "fmla v0.4s, v25.4s, v20.4s\n"
+    "ldr s25, [x26, %[input_col_stride1]]\n"
+    "fmla v7.4s, v22.4s, v9.4s\n"
+    "prfm pldl1keep, [%[inptr0], x13]\n"
+    "fmla v3.4s, v22.4s, v12.4s\n"
+    "prfm pldl1keep, [x27, #64]\n"
+    "fmla v6.4s, v22.4s, v10.4s\n"
+    "prfm pldl1keep, [x10, x8]\n"
+    "fmla v15.4s, v22.4s, v14.4s\n"
+    "prfm pldl1keep, [x26, x25]\n"
+    "fmla v2.4s, v22.4s, v17.4s\n"
+    "prfm pldl1keep, [x24, x11]\n"
+    "fmla v16.4s, v22.4s, v20.4s\n"
+    "ldr s22, [x24, x28]\n"
+    "fmla v7.4s, v23.4s, v11.4s\n"
+    "prfm pldl1keep, [x9, x13]\n"
+    "fmla v3.4s, v23.4s, v13.4s\n"
+    "prfm pldl1keep, [%[inptr0], x15]\n"
+    "fmla v6.4s, v23.4s, v12.4s\n"
+    "prfm pldl1keep, [x27, x8]\n"
+    "fmla v2.4s, v23.4s, v14.4s\n"
+    "prfm pldl1keep, [x10, x25]\n"
+    "fmla v5.4s, v23.4s, v17.4s\n"
+    "prfm pldl1keep, [x26, x11]\n"
+    "fmla v1.4s, v23.4s, v20.4s\n"
+    "ldr s23, [x9, x23]\n"
+    "fmla v6.4s, v24.4s, v13.4s\n"
+    "prfm pldl1keep, [x24, x13]\n"
+    "fmla v5.4s, v24.4s, v14.4s\n"
+    "prfm pldl1keep, [x9, x15]\n"
+    "fmla v4.4s, v24.4s, v20.4s\n"
+    "ldr s24, [%[inptr0], x12]\n"
+    "fmla v15.4s, v26.4s, v10.4s\n"
+    "prfm pldl1keep, [x27, x25]\n"
+    "fmla v0.4s, v26.4s, v17.4s\n"
+    "ldr s29, [x27]\n"
+    "fmla v3.4s, v25.4s, v9.4s\n"
+    "prfm pldl1keep, [x10, x11]\n"
+    "fmla v15.4s, v25.4s, v12.4s\n"
+    "prfm pldl1keep, [x26, x13]\n"
+    "fmla v2.4s, v25.4s, v10.4s\n"
+    "prfm pldl1keep, [x24, x15]\n"
+    "fmla v0.4s, v25.4s, v14.4s\n"
+    "prfm pldl1keep, [x27, x11]\n"
+    "fmla v16.4s, v25.4s, v17.4s\n"
+    "prfm pldl1keep, [x10, x13]\n"
+    "fmla v18.4s, v25.4s, v20.4s\n"
+    "ldr s26, [x10, %[input_col_stride1]]\n"
+    "fmla v7.4s, v22.4s, v8.4s\n"
+    "prfm pldl1keep, [x26, x15]\n"
+    "fmla v3.4s, v22.4s, v11.4s\n"
+    "prfm pldl1keep, [x27, x13]\n"
+    "fmla v6.4s, v22.4s, v9.4s\n"
+    "prfm pldl1keep, [x10, x15]\n"
+    "fmla v15.4s, v22.4s, v13.4s\n"
+    "prfm pldl1keep, [x27, x15]\n"
+    "fmla v2.4s, v22.4s, v12.4s\n"
+    "add %[wbptr], %[wbptr], #40\n"
+    "fmla v5.4s, v22.4s, v10.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v16.4s, v22.4s, v14.4s\n"
+    "fmla v1.4s, v22.4s, v17.4s\n"
+    "fmla v19.4s, v22.4s, v20.4s\n"
+    "ldr s27, [x26, x28]\n"
+    "fmla v6.4s, v23.4s, v11.4s\n"
+    "fmla v2.4s, v23.4s, v13.4s\n"
+    "fmla v5.4s, v23.4s, v12.4s\n"
+    "fmla v1.4s, v23.4s, v14.4s\n"
+    "fmla v4.4s, v23.4s, v17.4s\n"
+    "fmla v0.4s, v29.4s, v10.4s\n"
+    "mov v22.16b, v21.16b\n"
+    "fmla v15.4s, v26.4s, v9.4s\n"
+    "fmla v5.4s, v24.4s, v13.4s\n"
+    "fmla v16.4s, v26.4s, v10.4s\n"
+    "fmla v22.4s, v23.4s, v20.4s\n"
+    "ldr s29, [x24, x23]\n"
+    "fmla v4.4s, v24.4s, v14.4s\n"
+    "ldr s28, [x9, x12]\n"
+    "fmla v0.4s, v26.4s, v12.4s\n"
+    "fmla v18.4s, v26.4s, v17.4s\n"
+    "mov v23.16b, v21.16b\n"
+    "fmla v3.4s, v27.4s, v8.4s\n"
+    "fmla v15.4s, v27.4s, v11.4s\n"
+    "fmla v2.4s, v27.4s, v9.4s\n"
+    "fmla v0.4s, v27.4s, v13.4s\n"
+    "fmla v16.4s, v27.4s, v12.4s\n"
+    "fmla v1.4s, v27.4s, v10.4s\n"
+    "fmla v18.4s, v27.4s, v14.4s\n"
+    "fmla v19.4s, v27.4s, v17.4s\n"
+    "fmla v23.4s, v27.4s, v20.4s\n"
+    "mov v25.16b, v21.16b\n"
+    "mov v24.16b, v21.16b\n"
+    "fmla v6.4s, v29.4s, v8.4s\n"
+    "fmla v2.4s, v29.4s, v11.4s\n"
+    "fmla v5.4s, v29.4s, v9.4s\n"
+    "fmla v16.4s, v29.4s, v13.4s\n"
+    "fmla v1.4s, v29.4s, v12.4s\n"
+    "fmla v4.4s, v29.4s, v10.4s\n"
+    "fmla v19.4s, v29.4s, v14.4s\n"
+    "fmla v22.4s, v29.4s, v17.4s\n"
+    "fmla v25.4s, v29.4s, v20.4s\n"
+    "ldr s21, [%[inptr0], x14]\n"
+    "fmla v5.4s, v28.4s, v11.4s\n"
+    "add %[inptr0], %[inptr0], #4\n"
+    "fmla v1.4s, v28.4s, v13.4s\n"
+    "fmla v4.4s, v28.4s, v12.4s\n"
+    "fmla v22.4s, v28.4s, v14.4s\n"
+    "ldr s26, [x27, %[input_col_stride1]]\n"
+    "fmla v0.4s, v26.4s, v9.4s\n"
+    "fmla v18.4s, v26.4s, v10.4s\n"
+    "fmla v4.4s, v21.4s, v13.4s\n"
+    "ldr s21, [x10, x28]\n"
+    "fmla v15.4s, v21.4s, v8.4s\n"
+    "ldr s29, [x26, x23]\n"
+    "fmla v0.4s, v21.4s, v11.4s\n"
+    "fmla v16.4s, v21.4s, v9.4s\n"
+    "fmla v18.4s, v21.4s, v12.4s\n"
+    "fmla v19.4s, v21.4s, v10.4s\n"
+    "fmla v23.4s, v21.4s, v17.4s\n"
+    "ldr s21, [x24, x12]\n"
+    "fmla v2.4s, v29.4s, v8.4s\n"
+    "fmla v16.4s, v29.4s, v11.4s\n"
+    "fmla v1.4s, v29.4s, v9.4s\n"
+    "fmla v18.4s, v29.4s, v13.4s\n"
+    "fmla v19.4s, v29.4s, v12.4s\n"
+    "fmla v22.4s, v29.4s, v10.4s\n"
+    "fmla v23.4s, v29.4s, v14.4s\n"
+    "fmla v25.4s, v29.4s, v17.4s\n"
+    "fmla v24.4s, v29.4s, v20.4s\n"
+    "ldr s28, [x9, x14]\n"
+    "fmla v5.4s, v21.4s, v8.4s\n"
+    "ldr s27, [x27, x28]\n"
+    "fmla v1.4s, v21.4s, v11.4s\n"
+    "add x9, x9, #4\n"
+    "fmla v4.4s, v21.4s, v9.4s\n"
+    "fmla v19.4s, v21.4s, v13.4s\n"
+    "fmla v22.4s, v21.4s, v12.4s\n"
+    "fmla v25.4s, v21.4s, v14.4s\n"
+    "fmla v0.4s, v27.4s, v8.4s\n"
+    "ldr s20, [x10, x23]\n"
+    "fmla v4.4s, v28.4s, v11.4s\n"
+    "fmla v18.4s, v27.4s, v9.4s\n"
+    "fmla v22.4s, v28.4s, v13.4s\n"
+    "ldr s26, [x26, x12]\n"
+    "fmla v23.4s, v27.4s, v10.4s\n"
+    "ldr s21, [x24, x14]\n"
+    "fmla v16.4s, v20.4s, v8.4s\n"
+    "add x24, x24, #4\n"
+    "fmla v18.4s, v20.4s, v11.4s\n"
+    "fmla v19.4s, v20.4s, v9.4s\n"
+    "fmla v23.4s, v20.4s, v12.4s\n"
+    "fmla v25.4s, v20.4s, v10.4s\n"
+    "fmla v24.4s, v20.4s, v17.4s\n"
+    "ldr s28, [x27, x23]\n"
+    "fmla v1.4s, v26.4s, v8.4s\n"
+    "ldr s20, [x10, x12]\n"
+    "fmla v19.4s, v26.4s, v11.4s\n"
+    "fmla v22.4s, v26.4s, v9.4s\n"
+    "fmla v23.4s, v26.4s, v13.4s\n"
+    "fmla v25.4s, v26.4s, v12.4s\n"
+    "fmla v24.4s, v26.4s, v14.4s\n"
+    "ldr s17, [x26, x14]\n"
+    "fmla v4.4s, v21.4s, v8.4s\n"
+    "ldr s26, [x27, x12]\n"
+    "fmla v22.4s, v21.4s, v11.4s\n"
+    "add x26, x26, #4\n"
+    "fmla v25.4s, v21.4s, v13.4s\n"
+    "ldr s27, [x10, x14]\n"
+    "fmla v18.4s, v28.4s, v8.4s\n"
+    "add x10, x10, #4\n"
+    "fmla v23.4s, v28.4s, v9.4s\n"
+    "fmla v24.4s, v28.4s, v10.4s\n"
+    "fmla v19.4s, v20.4s, v8.4s\n"
+    "ldr s28, [x27, x14]\n"
+    "fmla v25.4s, v20.4s, v9.4s\n"
+    "add x27, x27, #4\n"
+    "fmla v23.4s, v20.4s, v11.4s\n"
+    "fmla v24.4s, v20.4s, v12.4s\n"
+    "fmla v22.4s, v17.4s, v8.4s\n"
+    "movi v29.16b, #0\n"
+    "fmla v25.4s, v17.4s, v11.4s\n"
+    "fmla v24.4s, v17.4s, v13.4s\n"
+    "fmla v23.4s, v26.4s, v8.4s\n"
+    "fmax v7.4s, v7.4s, v29.4s\n"
+    "fmla v25.4s, v27.4s, v8.4s\n"
+    "fmax v6.4s, v6.4s, v29.4s\n"
+    "str s7, [%[outptr0]]\n"
+    "fmla v24.4s, v26.4s, v9.4s\n"
+    "str s6, [%[outptr0], %[output_col_stride1]]\n"
+    "fmax v5.4s, v5.4s, v29.4s\n"
+    "fmax v4.4s, v4.4s, v29.4s\n"
+    "fmax v3.4s, v3.4s, v29.4s\n"
+    "str s5, [%[outptr0], x19]\n"
+    "fmla v24.4s, v27.4s, v11.4s\n"
+    "str s4, [%[outptr0], x20]\n"
+    "fmax v2.4s, v2.4s, v29.4s\n"
+    "str s3, [x16]\n"
+    "fmax v1.4s, v1.4s, v29.4s\n"
+    "str s2, [x16, %[output_col_stride1]]\n"
+    "fmla v24.4s, v28.4s, v8.4s\n"
+    "str s1, [x16, x19]\n"
+    "fmax v22.4s, v22.4s, v29.4s\n"
+    "fmax v15.4s, v15.4s, v29.4s\n"
+    "add %[outptr0], %[outptr0], #4\n"
+    "str s22, [x16, x20]\n"
+    "fmax v16.4s, v16.4s, v29.4s\n"
+    "str s15, [x17]\n"
+    "fmax v19.4s, v19.4s, v29.4s\n"
+    "str s16, [x17, %[output_col_stride1]]\n"
+    "fmax v25.4s, v25.4s, v29.4s\n"
+    "str s19, [x17, x19]\n"
+    "fmax v0.4s, v0.4s, v29.4s\n"
+    "str s25, [x17, x20]\n"
+    "fmax v18.4s, v18.4s, v29.4s\n"
+    "str s0, [x18]\n"
+    "fmax v23.4s, v23.4s, v29.4s\n"
+    "str s18, [x18, %[output_col_stride1]]\n"
+    "fmax v24.4s, v24.4s, v29.4s\n"
+    "str s23, [x18, x19]\n"
+    "add x16, x16, #4\n"
+    "str s24, [x18, x20]\n"
+    "add x17, x17, #4\n"
+    "add x18, x18, #4\n"
+    "7:\n"
+    : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input)
+    : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_col_stride1] "r" (input_col_stride * sizeof(float))
+    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory"
+  );
+}
 
-      "uptr1 .req x0\n"
-      "uptr2 .req x1\n"
-      "uptr3 .req x2\n"
-      "uptr4 .req x3\n"
-      "uptr5 .req x4\n"
+template <>
+template <>
+void Conv::execute_tile<ActivationFunction::ReLU>(
+  int n_channels,
+  const void *weight_bias_ptr,
+  const float *inptrs[6][6],
+  float *outptrs[4][4]
+)
+{
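+  // Variant of the ReLU tile kernel taking pre-gathered pointer arrays:
+  // inptrs[6][6] addresses the 6x6 input patch and outptrs[4][4] the output
+  // tile, so no row/column stride arithmetic is needed; x27 and x28 hold the
+  // running per-channel byte offsets into the input and output buffers.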
+  __asm __volatile(
+    "mov x27, xzr\n"
+    "mov x28, xzr\n"
+    "and x19, %[n_channels], #3\n"
+    "lsr x26, %[n_channels], #2\n"
+    "cbz x26, 4f\n"
+    "1:\n"
+    "ldr q25, [%[wbptr]]\n"
+    "ldr x25, [%[inptrs], 0]\n"
+    "mov v2.16b, v25.16b\n"
+    "ldr q22, [%[wbptr], #16]\n"
+    "mov v16.16b, v25.16b\n"
+    "ldr q9, [%[wbptr], #32]\n"
+    "mov v18.16b, v25.16b\n"
+    "ldr q8, [%[wbptr], #48]\n"
+    "mov v13.16b, v25.16b\n"
+    "ldr q19, [%[wbptr], #64]\n"
+    "mov v0.16b, v25.16b\n"
+    "ldr q7, [%[wbptr], #80]\n"
+    "mov v17.16b, v25.16b\n"
+    "ldr q6, [%[wbptr], #96]\n"
+    "mov v14.16b, v25.16b\n"
+    "ldr q5, [%[wbptr], #112]\n"
+    "mov v12.16b, v25.16b\n"
+    "ldr q4, [%[wbptr], #128]\n"
+    "mov v15.16b, v25.16b\n"
+    "ldr q3, [%[wbptr], #144]\n"
+    "ldr q27, [x25, x27]\n"
+    "ldr x17, [%[inptrs], 48]\n"
+    "fmla v2.4s, v27.4s, v22.4s\n"
+    "ldr x25, [%[inptrs], 8]\n"
+    "ldr q26, [x17, x27]\n"
+    "ldr x24, [%[inptrs], 96]\n"
+    "fmla v16.4s, v26.4s, v22.4s\n"
+    "ldr q31, [x25, x27]\n"
+    "ldr q28, [x24, x27]\n"
+    "ldr x17, [%[inptrs], 56]\n"
+    "fmla v2.4s, v26.4s, v19.4s\n"
+    "ldr x25, [%[inptrs], 16]\n"
+    "ldr q29, [x17, x27]\n"
+    "ldr x18, [%[inptrs], 144]\n"
+    "ldr x24, [%[inptrs], 104]\n"
+    "subs x26, x26, #1\n"
+    "ldr q30, [x25, x27]\n"
+    "ldr q27, [x18, x27]\n"
+    "ldr q21, [x24, x27]\n"
+    "fmla v2.4s, v31.4s, v9.4s\n"
+    "beq 3f\n"
+    "2:\n"
+    "mov v1.16b, v25.16b\n"
+    "ldr x17, [%[inptrs], 64]\n"
+    "mov v10.16b, v25.16b\n"
+    "ldr x25, [%[inptrs], 24]\n"
+    "fmla v18.4s, v31.4s, v22.4s\n"
+    "ldr q23, [x17, x27]\n"
+    "fmla v2.4s, v28.4s, v5.4s\n"
+    "ldr x15, [%[inptrs], 192]\n"
+    "fmla v16.4s, v28.4s, v19.4s\n"
+    "ldr x18, [%[inptrs], 152]\n"
+    "fmla v13.4s, v28.4s, v22.4s\n"
+    "ldr q26, [x25, x27]\n"
+    "fmla v18.4s, v29.4s, v19.4s\n"
+    "ldr x24, [%[inptrs], 112]\n"
+    "fmla v2.4s, v29.4s, v7.4s\n"
+    "ldr x17, [%[inptrs], 72]\n"
+    "fmla v16.4s, v29.4s, v9.4s\n"
+    "ldr x25, [%[inptrs], 32]\n"
+    "fmla v0.4s, v29.4s, v22.4s\n"
+    "ldr q28, [x15, x27]\n"
+    "fmla v18.4s, v30.4s, v9.4s\n"
+    "ldr x16, [%[inptrs], 240]\n"
+    "fmla v2.4s, v30.4s, v8.4s\n"
+    "ldr x15, [%[inptrs], 200]\n"
+    "fmla v17.4s, v30.4s, v22.4s\n"
+    "ldr q29, [x18, x27]\n"
+    "fmla v16.4s, v27.4s, v5.4s\n"
+    "ldr x18, [%[inptrs], 160]\n"
+    "fmla v13.4s, v27.4s, v19.4s\n"
+    "ldr x20, [%[outptrs], 0]\n"
+    "fmla v14.4s, v27.4s, v22.4s\n"
+    "ldr q20, [x24, x27]\n"
+    "fmla v2.4s, v21.4s, v4.4s\n"
+    "ldr x24, [%[inptrs], 120]\n"
+    "fmla v16.4s, v21.4s, v7.4s\n"
+    "ldr x21, [%[outptrs], 32]\n"
+    "fmla v18.4s, v21.4s, v5.4s\n"
+    "ldr x22, [%[outptrs], 64]\n"
+    "fmla v13.4s, v21.4s, v9.4s\n"
+    "ldr x23, [%[outptrs], 96]\n"
+    "fmla v0.4s, v21.4s, v19.4s\n"
+    "add %[wbptr], %[wbptr], #160\n"
+    "fmla v12.4s, v21.4s, v22.4s\n"
+    "ldr q24, [x17, x27]\n"
+    "fmla v2.4s, v23.4s, v6.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v16.4s, v23.4s, v8.4s\n"
+    "ldr x17, [%[inptrs], 80]\n"
+    "fmla v18.4s, v23.4s, v7.4s\n"
+    "subs x26, x26, #1\n"
+    "fmla v0.4s, v23.4s, v9.4s\n"
+    "fmla v17.4s, v23.4s, v19.4s\n"
+    "fmla v15.4s, v23.4s, v22.4s\n"
+    "ldr q23, [x25, x27]\n"
+    "fmla v1.4s, v26.4s, v22.4s\n"
+    "ldr x25, [%[inptrs], 40]\n"
+    "fmla v18.4s, v26.4s, v8.4s\n"
+    "fmla v13.4s, v28.4s, v5.4s\n"
+    "fmla v17.4s, v26.4s, v9.4s\n"
+    "ldr q30, [x16, x27]\n"
+    "fmla v14.4s, v28.4s, v19.4s\n"
+    "ldr q26, [x15, x27]\n"
+    "fmla v16.4s, v29.4s, v4.4s\n"
+    "ldr x16, [%[inptrs], 248]\n"
+    "fmla v13.4s, v29.4s, v7.4s\n"
+    "ldr x15, [%[inptrs], 208]\n"
+    "fmla v0.4s, v29.4s, v5.4s\n"
+    "fmla v12.4s, v29.4s, v19.4s\n"
+    "fmla v14.4s, v29.4s, v9.4s\n"
+    "fmla v10.4s, v29.4s, v22.4s\n"
+    "mov v11.16b, v25.16b\n"
+    "fmla v2.4s, v20.4s, v3.4s\n"
+    "fmla v16.4s, v20.4s, v6.4s\n"
+    "fmla v18.4s, v20.4s, v4.4s\n"
+    "fmla v13.4s, v20.4s, v8.4s\n"
+    "fmla v0.4s, v20.4s, v7.4s\n"
+    "fmla v17.4s, v20.4s, v5.4s\n"
+    "fmla v12.4s, v20.4s, v9.4s\n"
+    "fmla v15.4s, v20.4s, v19.4s\n"
+    "fmla v11.4s, v20.4s, v22.4s\n"
+    "mov v21.16b, v25.16b\n"
+    "fmla v18.4s, v24.4s, v6.4s\n"
+    "fmla v0.4s, v24.4s, v8.4s\n"
+    "fmla v1.4s, v24.4s, v19.4s\n"
+    "fmla v17.4s, v24.4s, v7.4s\n"
+    "fmla v14.4s, v30.4s, v5.4s\n"
+    "mov v20.16b, v25.16b\n"
+    "fmla v15.4s, v24.4s, v9.4s\n"
+    "fmla v21.4s, v24.4s, v22.4s\n"
+    "ldr q27, [x18, x27]\n"
+    "fmla v1.4s, v23.4s, v9.4s\n"
+    "ldr x18, [%[inptrs], 168]\n"
+    "fmla v17.4s, v23.4s, v8.4s\n"
+    "ldr q30, [x24, x27]\n"
+    "fmla v13.4s, v26.4s, v4.4s\n"
+    "ldr x24, [%[inptrs], 128]\n"
+    "fmla v14.4s, v26.4s, v7.4s\n"
+    "fmla v12.4s, v26.4s, v5.4s\n"
+    "fmla v10.4s, v26.4s, v19.4s\n"
+    "ldr q31, [x17, x27]\n"
+    "fmla v16.4s, v27.4s, v3.4s\n"
+    "ldr x17, [%[inptrs], 88]\n"
+    "fmla v13.4s, v27.4s, v6.4s\n"
+    "fmla v0.4s, v27.4s, v4.4s\n"
+    "fmla v14.4s, v27.4s, v8.4s\n"
+    "fmla v12.4s, v27.4s, v7.4s\n"
+    "fmla v15.4s, v27.4s, v5.4s\n"
+    "fmla v10.4s, v27.4s, v9.4s\n"
+    "fmla v11.4s, v27.4s, v19.4s\n"
+    "fmla v20.4s, v27.4s, v22.4s\n"
+    "mov v24.16b, v25.16b\n"
+    "mov v23.16b, v25.16b\n"
+    "fmla v18.4s, v30.4s, v3.4s\n"
+    "fmla v0.4s, v30.4s, v6.4s\n"
+    "fmla v17.4s, v30.4s, v4.4s\n"
+    "fmla v12.4s, v30.4s, v8.4s\n"
+    "fmla v15.4s, v30.4s, v7.4s\n"
+    "fmla v1.4s, v30.4s, v5.4s\n"
+    "fmla v11.4s, v30.4s, v9.4s\n"
+    "fmla v21.4s, v30.4s, v19.4s\n"
+    "fmla v24.4s, v30.4s, v22.4s\n"
+    "ldr q25, [x25, x27]\n"
+    "fmla v17.4s, v31.4s, v6.4s\n"
+    "ldr x25, [%[inptrs], 0]\n"
+    "fmla v15.4s, v31.4s, v8.4s\n"
+    "fmla v1.4s, v31.4s, v7.4s\n"
+    "fmla v21.4s, v31.4s, v9.4s\n"
+    "ldr q26, [x16, x27]\n"
+    "fmla v14.4s, v26.4s, v4.4s\n"
+    "ldr x16, [%[inptrs], 256]\n"
+    "fmla v10.4s, v26.4s, v5.4s\n"
+    "ldr q31, [x15, x27]\n"
+    "fmla v1.4s, v25.4s, v8.4s\n"
+    "ldr q29, [x18, x27]\n"
+    "fmla v13.4s, v31.4s, v3.4s\n"
+    "ldr x15, [%[inptrs], 216]\n"
+    "fmla v14.4s, v31.4s, v6.4s\n"
+    "ldr x18, [%[inptrs], 176]\n"
+    "fmla v12.4s, v31.4s, v4.4s\n"
+    "fmla v10.4s, v31.4s, v7.4s\n"
+    "fmla v11.4s, v31.4s, v5.4s\n"
+    "fmla v20.4s, v31.4s, v19.4s\n"
+    "fmla v0.4s, v29.4s, v3.4s\n"
+    "ldr q28, [x24, x27]\n"
+    "fmla v15.4s, v29.4s, v4.4s\n"
+    "ldr x24, [%[inptrs], 136]\n"
+    "fmla v12.4s, v29.4s, v6.4s\n"
+    "fmla v10.4s, v29.4s, v8.4s\n"
+    "fmla v11.4s, v29.4s, v7.4s\n"
+    "fmla v21.4s, v29.4s, v5.4s\n"
+    "fmla v20.4s, v29.4s, v9.4s\n"
+    "fmla v24.4s, v29.4s, v19.4s\n"
+    "fmla v23.4s, v29.4s, v22.4s\n"
+    "ldr q25, [x17, x27]\n"
+    "fmla v17.4s, v28.4s, v3.4s\n"
+    "ldr q29, [x16, x27]\n"
+    "fmla v15.4s, v28.4s, v6.4s\n"
+    "ldr x16, [%[inptrs], 264]\n"
+    "fmla v1.4s, v28.4s, v4.4s\n"
+    "ldr x17, [%[inptrs], 48]\n"
+    "fmla v11.4s, v28.4s, v8.4s\n"
+    "fmla v21.4s, v28.4s, v7.4s\n"
+    "fmla v24.4s, v28.4s, v9.4s\n"
+    "ldr q22, [x15, x27]\n"
+    "fmla v14.4s, v29.4s, v3.4s\n"
+    "ldr x15, [%[inptrs], 224]\n"
+    "fmla v1.4s, v25.4s, v6.4s\n"
+    "fmla v10.4s, v29.4s, v4.4s\n"
+    "fmla v21.4s, v25.4s, v8.4s\n"
+    "ldr q27, [x18, x27]\n"
+    "fmla v20.4s, v29.4s, v5.4s\n"
+    "ldr q26, [x24, x27]\n"
+    "fmla v12.4s, v22.4s, v3.4s\n"
+    "ldr x18, [%[inptrs], 184]\n"
+    "fmla v10.4s, v22.4s, v6.4s\n"
+    "ldr x24, [%[inptrs], 96]\n"
+    "fmla v11.4s, v22.4s, v4.4s\n"
+    "fmla v24.4s, v22.4s, v5.4s\n"
+    "fmla v20.4s, v22.4s, v7.4s\n"
+    "fmla v23.4s, v22.4s, v19.4s\n"
+    "fmla v15.4s, v27.4s, v3.4s\n"
+    "ldr q25, [x16, x27]\n"
+    "fmla v21.4s, v27.4s, v4.4s\n"
+    "ldr q31, [x15, x27]\n"
+    "fmla v11.4s, v27.4s, v6.4s\n"
+    "ldr x16, [%[inptrs], 272]\n"
+    "fmla v20.4s, v27.4s, v8.4s\n"
+    "ldr x15, [%[inptrs], 232]\n"
+    "fmla v24.4s, v27.4s, v7.4s\n"
+    "fmla v23.4s, v27.4s, v9.4s\n"
+    "fmla v1.4s, v26.4s, v3.4s\n"
+    "ldr q22, [x18, x27]\n"
+    "fmla v21.4s, v26.4s, v6.4s\n"
+    "ldr q19, [x16, x27]\n"
+    "fmla v10.4s, v25.4s, v3.4s\n"
+    "ldr x16, [%[inptrs], 280]\n"
+    "fmla v24.4s, v26.4s, v8.4s\n"
+    "ldr q28, [x15, x27]\n"
+    "fmla v20.4s, v25.4s, v4.4s\n"
+    "ldr x18, [%[inptrs], 144]\n"
+    "fmla v23.4s, v25.4s, v5.4s\n"
+    "ldr q30, [x16, x27]\n"
+    "fmla v11.4s, v31.4s, v3.4s\n"
+    "add x27, x27, #16\n"
+    "fmla v24.4s, v31.4s, v4.4s\n"
+    "ldr q27, [x25, x27]\n"
+    "fmla v20.4s, v31.4s, v6.4s\n"
+    "ldr x25, [%[inptrs], 8]\n"
+    "fmla v23.4s, v31.4s, v7.4s\n"
+    "movi v29.16b, #0\n"
+    "fmla v21.4s, v22.4s, v3.4s\n"
+    "ldr q26, [x17, x27]\n"
+    "fmla v24.4s, v22.4s, v6.4s\n"
+    "ldr x17, [%[inptrs], 56]\n"
+    "fmla v20.4s, v19.4s, v3.4s\n"
+    "fmax v2.4s, v2.4s, v29.4s\n"
+    "fmla v23.4s, v22.4s, v8.4s\n"
+    "ldr q25, [%[wbptr]]\n"
+    "fmax v18.4s, v18.4s, v29.4s\n"
+    "ldr q22, [%[wbptr], #16]\n"
+    "str q2, [x20, x28]\n"
+    "fmla v24.4s, v28.4s, v3.4s\n"
+    "fmax v17.4s, v17.4s, v29.4s\n"
+    "ldr q9, [%[wbptr], #32]\n"
+    "fmla v23.4s, v19.4s, v4.4s\n"
+    "ldr q8, [%[wbptr], #48]\n"
+    "fmax v1.4s, v1.4s, v29.4s\n"
+    "ldr q19, [%[wbptr], #64]\n"
+    "fmax v16.4s, v16.4s, v29.4s\n"
+    "ldr x20, [%[outptrs], 8]\n"
+    "fmax v0.4s, v0.4s, v29.4s\n"
+    "fmax v15.4s, v15.4s, v29.4s\n"
+    "str q18, [x20, x28]\n"
+    "fmla v23.4s, v28.4s, v6.4s\n"
+    "str q16, [x21, x28]\n"
+    "fmax v21.4s, v21.4s, v29.4s\n"
+    "fmax v13.4s, v13.4s, v29.4s\n"
+    "ldr q7, [%[wbptr], #80]\n"
+    "fmax v12.4s, v12.4s, v29.4s\n"
+    "ldr q5, [%[wbptr], #112]\n"
+    "fmla v23.4s, v30.4s, v3.4s\n"
+    "ldr q6, [%[wbptr], #96]\n"
+    "str q13, [x22, x28]\n"
+    "fmax v11.4s, v11.4s, v29.4s\n"
+    "fmax v24.4s, v24.4s, v29.4s\n"
+    "ldr q4, [%[wbptr], #128]\n"
+    "fmax v14.4s, v14.4s, v29.4s\n"
+    "ldr q31, [x25, x27]\n"
+    "fmax v10.4s, v10.4s, v29.4s\n"
+    "ldr q3, [%[wbptr], #144]\n"
+    "fmax v20.4s, v20.4s, v29.4s\n"
+    "ldr q28, [x24, x27]\n"
+    "str q14, [x23, x28]\n"
+    "fmax v23.4s, v23.4s, v29.4s\n"
+    "mov v2.16b, v25.16b\n"
+    "ldr q29, [x17, x27]\n"
+    "ldr x20, [%[outptrs], 16]\n"
+    "ldr x21, [%[outptrs], 40]\n"
+    "ldr x22, [%[outptrs], 72]\n"
+    "ldr x23, [%[outptrs], 104]\n"
+    "ldr x25, [%[inptrs], 16]\n"
+    "ldr x24, [%[inptrs], 104]\n"
+    "str q17, [x20, x28]\n"
+    "mov v16.16b, v25.16b\n"
+    "str q0, [x21, x28]\n"
+    "mov v18.16b, v25.16b\n"
+    "str q12, [x22, x28]\n"
+    "mov v13.16b, v25.16b\n"
+    "str q10, [x23, x28]\n"
+    "mov v0.16b, v25.16b\n"
+    "fmla v2.4s, v27.4s, v22.4s\n"
+    "ldr q30, [x25, x27]\n"
+    "fmla v16.4s, v26.4s, v22.4s\n"
+    "ldr x20, [%[outptrs], 24]\n"
+    "mov v17.16b, v25.16b\n"
+    "ldr x21, [%[outptrs], 48]\n"
+    "str q1, [x20, x28]\n"
+    "mov v14.16b, v25.16b\n"
+    "str q15, [x21, x28]\n"
+    "mov v12.16b, v25.16b\n"
+    "mov v15.16b, v25.16b\n"
+    "ldr x21, [%[outptrs], 56]\n"
+    "fmla v2.4s, v26.4s, v19.4s\n"
+    "ldr q27, [x18, x27]\n"
+    "str q21, [x21, x28]\n"
+    "ldr x22, [%[outptrs], 80]\n"
+    "ldr q21, [x24, x27]\n"
+    "ldr x23, [%[outptrs], 112]\n"
+    "str q11, [x22, x28]\n"
+    "fmla v2.4s, v31.4s, v9.4s\n"
+    "str q20, [x23, x28]\n"
+    "ldr x22, [%[outptrs], 88]\n"
+    "ldr x23, [%[outptrs], 120]\n"
+    "str q24, [x22, x28]\n"
+    "str q23, [x23, x28]\n"
+    "add x28, x28, #16\n"
+    "bne 2b\n"
+    "3:\n"
+    "mov v1.16b, v25.16b\n"
+    "ldr x17, [%[inptrs], 64]\n"
+    "mov v10.16b, v25.16b\n"
+    "ldr x25, [%[inptrs], 24]\n"
+    "mov v11.16b, v25.16b\n"
+    "ldr x15, [%[inptrs], 192]\n"
+    "fmla v18.4s, v31.4s, v22.4s\n"
+    "ldr q23, [x17, x27]\n"
+    "fmla v2.4s, v28.4s, v5.4s\n"
+    "ldr x18, [%[inptrs], 152]\n"
+    "fmla v16.4s, v28.4s, v19.4s\n"
+    "ldr x24, [%[inptrs], 112]\n"
+    "fmla v13.4s, v28.4s, v22.4s\n"
+    "ldr q26, [x25, x27]\n"
+    "fmla v18.4s, v29.4s, v19.4s\n"
+    "ldr x17, [%[inptrs], 72]\n"
+    "fmla v2.4s, v29.4s, v7.4s\n"
+    "ldr x25, [%[inptrs], 32]\n"
+    "fmla v16.4s, v29.4s, v9.4s\n"
+    "ldr x16, [%[inptrs], 240]\n"
+    "fmla v0.4s, v29.4s, v22.4s\n"
+    "ldr q28, [x15, x27]\n"
+    "fmla v18.4s, v30.4s, v9.4s\n"
+    "ldr x15, [%[inptrs], 200]\n"
+    "fmla v2.4s, v30.4s, v8.4s\n"
+    "ldr x20, [%[outptrs], 0]\n"
+    "fmla v17.4s, v30.4s, v22.4s\n"
+    "ldr q29, [x18, x27]\n"
+    "fmla v16.4s, v27.4s, v5.4s\n"
+    "ldr x18, [%[inptrs], 160]\n"
+    "fmla v13.4s, v27.4s, v19.4s\n"
+    "ldr x21, [%[outptrs], 32]\n"
+    "fmla v14.4s, v27.4s, v22.4s\n"
+    "ldr q20, [x24, x27]\n"
+    "fmla v2.4s, v21.4s, v4.4s\n"
+    "ldr x24, [%[inptrs], 120]\n"
+    "fmla v16.4s, v21.4s, v7.4s\n"
+    "ldr x22, [%[outptrs], 64]\n"
+    "fmla v18.4s, v21.4s, v5.4s\n"
+    "ldr x23, [%[outptrs], 96]\n"
+    "fmla v13.4s, v21.4s, v9.4s\n"
+    "add %[wbptr], %[wbptr], #160\n"
+    "fmla v0.4s, v21.4s, v19.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v12.4s, v21.4s, v22.4s\n"
+    "ldr q24, [x17, x27]\n"
+    "fmla v2.4s, v23.4s, v6.4s\n"
+    "ldr x17, [%[inptrs], 80]\n"
+    "fmla v16.4s, v23.4s, v8.4s\n"
+    "fmla v18.4s, v23.4s, v7.4s\n"
+    "fmla v0.4s, v23.4s, v9.4s\n"
+    "fmla v17.4s, v23.4s, v19.4s\n"
+    "fmla v15.4s, v23.4s, v22.4s\n"
+    "ldr q23, [x25, x27]\n"
+    "fmla v1.4s, v26.4s, v22.4s\n"
+    "ldr x25, [%[inptrs], 40]\n"
+    "fmla v18.4s, v26.4s, v8.4s\n"
+    "fmla v13.4s, v28.4s, v5.4s\n"
+    "fmla v17.4s, v26.4s, v9.4s\n"
+    "ldr q30, [x16, x27]\n"
+    "fmla v14.4s, v28.4s, v19.4s\n"
+    "ldr q26, [x15, x27]\n"
+    "fmla v16.4s, v29.4s, v4.4s\n"
+    "ldr x16, [%[inptrs], 248]\n"
+    "fmla v13.4s, v29.4s, v7.4s\n"
+    "ldr x15, [%[inptrs], 208]\n"
+    "fmla v0.4s, v29.4s, v5.4s\n"
+    "fmla v12.4s, v29.4s, v19.4s\n"
+    "fmla v14.4s, v29.4s, v9.4s\n"
+    "fmla v10.4s, v29.4s, v22.4s\n"
+    "mov v21.16b, v25.16b\n"
+    "fmla v2.4s, v20.4s, v3.4s\n"
+    "fmla v16.4s, v20.4s, v6.4s\n"
+    "fmla v18.4s, v20.4s, v4.4s\n"
+    "fmla v13.4s, v20.4s, v8.4s\n"
+    "fmla v0.4s, v20.4s, v7.4s\n"
+    "fmla v17.4s, v20.4s, v5.4s\n"
+    "fmla v12.4s, v20.4s, v9.4s\n"
+    "fmla v15.4s, v20.4s, v19.4s\n"
+    "fmla v11.4s, v20.4s, v22.4s\n"
+    "mov v20.16b, v25.16b\n"
+    "fmla v18.4s, v24.4s, v6.4s\n"
+    "fmla v0.4s, v24.4s, v8.4s\n"
+    "fmla v1.4s, v24.4s, v19.4s\n"
+    "fmla v17.4s, v24.4s, v7.4s\n"
+    "fmla v21.4s, v24.4s, v22.4s\n"
+    "fmla v15.4s, v24.4s, v9.4s\n"
+    "ldr q27, [x18, x27]\n"
+    "fmla v14.4s, v30.4s, v5.4s\n"
+    "ldr q30, [x24, x27]\n"
+    "fmla v1.4s, v23.4s, v9.4s\n"
+    "ldr x18, [%[inptrs], 168]\n"
+    "fmla v17.4s, v23.4s, v8.4s\n"
+    "ldr q31, [x17, x27]\n"
+    "fmla v13.4s, v26.4s, v4.4s\n"
+    "ldr x24, [%[inptrs], 128]\n"
+    "fmla v14.4s, v26.4s, v7.4s\n"
+    "ldr x17, [%[inptrs], 88]\n"
+    "fmla v12.4s, v26.4s, v5.4s\n"
+    "fmla v10.4s, v26.4s, v19.4s\n"
+    "mov v24.16b, v25.16b\n"
+    "mov v23.16b, v25.16b\n"
+    "fmla v16.4s, v27.4s, v3.4s\n"
+    "fmla v13.4s, v27.4s, v6.4s\n"
+    "fmla v0.4s, v27.4s, v4.4s\n"
+    "fmla v14.4s, v27.4s, v8.4s\n"
+    "fmla v12.4s, v27.4s, v7.4s\n"
+    "fmla v15.4s, v27.4s, v5.4s\n"
+    "fmla v10.4s, v27.4s, v9.4s\n"
+    "fmla v11.4s, v27.4s, v19.4s\n"
+    "fmla v20.4s, v27.4s, v22.4s\n"
+    "ldr q25, [x25, x27]\n"
+    "fmla v18.4s, v30.4s, v3.4s\n"
+    "fmla v0.4s, v30.4s, v6.4s\n"
+    "fmla v17.4s, v30.4s, v4.4s\n"
+    "fmla v12.4s, v30.4s, v8.4s\n"
+    "fmla v15.4s, v30.4s, v7.4s\n"
+    "fmla v1.4s, v30.4s, v5.4s\n"
+    "fmla v11.4s, v30.4s, v9.4s\n"
+    "fmla v21.4s, v30.4s, v19.4s\n"
+    "fmla v24.4s, v30.4s, v22.4s\n"
+    "ldr q26, [x16, x27]\n"
+    "fmla v17.4s, v31.4s, v6.4s\n"
+    "ldr x16, [%[inptrs], 256]\n"
+    "fmla v15.4s, v31.4s, v8.4s\n"
+    "fmla v1.4s, v31.4s, v7.4s\n"
+    "fmla v21.4s, v31.4s, v9.4s\n"
+    "ldr q31, [x15, x27]\n"
+    "fmla v14.4s, v26.4s, v4.4s\n"
+    "ldr x15, [%[inptrs], 216]\n"
+    "fmla v10.4s, v26.4s, v5.4s\n"
+    "ldr q29, [x18, x27]\n"
+    "fmla v1.4s, v25.4s, v8.4s\n"
+    "ldr q28, [x24, x27]\n"
+    "fmla v13.4s, v31.4s, v3.4s\n"
+    "ldr x18, [%[inptrs], 176]\n"
+    "fmla v14.4s, v31.4s, v6.4s\n"
+    "ldr x24, [%[inptrs], 136]\n"
+    "fmla v12.4s, v31.4s, v4.4s\n"
+    "fmla v10.4s, v31.4s, v7.4s\n"
+    "fmla v11.4s, v31.4s, v5.4s\n"
+    "fmla v20.4s, v31.4s, v19.4s\n"
+    "fmla v0.4s, v29.4s, v3.4s\n"
+    "ldr q25, [x17, x27]\n"
+    "fmla v15.4s, v29.4s, v4.4s\n"
+    "fmla v21.4s, v29.4s, v5.4s\n"
+    "fmla v12.4s, v29.4s, v6.4s\n"
+    "fmla v10.4s, v29.4s, v8.4s\n"
+    "fmla v11.4s, v29.4s, v7.4s\n"
+    "fmla v20.4s, v29.4s, v9.4s\n"
+    "fmla v24.4s, v29.4s, v19.4s\n"
+    "fmla v23.4s, v29.4s, v22.4s\n"
+    "fmla v17.4s, v28.4s, v3.4s\n"
+    "ldr q29, [x16, x27]\n"
+    "fmla v15.4s, v28.4s, v6.4s\n"
+    "ldr q22, [x15, x27]\n"
+    "fmla v1.4s, v28.4s, v4.4s\n"
+    "ldr x16, [%[inptrs], 264]\n"
+    "fmla v11.4s, v28.4s, v8.4s\n"
+    "ldr x15, [%[inptrs], 224]\n"
+    "fmla v21.4s, v28.4s, v7.4s\n"
+    "fmla v24.4s, v28.4s, v9.4s\n"
+    "fmla v14.4s, v29.4s, v3.4s\n"
+    "ldr q27, [x18, x27]\n"
+    "fmla v1.4s, v25.4s, v6.4s\n"
+    "ldr x18, [%[inptrs], 184]\n"
+    "fmla v10.4s, v29.4s, v4.4s\n"
+    "fmla v20.4s, v29.4s, v5.4s\n"
+    "fmla v21.4s, v25.4s, v8.4s\n"
+    "ldr q26, [x24, x27]\n"
+    "fmla v12.4s, v22.4s, v3.4s\n"
+    "ldr q25, [x16, x27]\n"
+    "fmla v11.4s, v22.4s, v4.4s\n"
+    "ldr x16, [%[inptrs], 272]\n"
+    "fmla v10.4s, v22.4s, v6.4s\n"
+    "fmla v20.4s, v22.4s, v7.4s\n"
+    "fmla v24.4s, v22.4s, v5.4s\n"
+    "fmla v23.4s, v22.4s, v19.4s\n"
+    "fmla v15.4s, v27.4s, v3.4s\n"
+    "ldr q31, [x15, x27]\n"
+    "fmla v11.4s, v27.4s, v6.4s\n"
+    "ldr q22, [x18, x27]\n"
+    "fmla v21.4s, v27.4s, v4.4s\n"
+    "ldr x15, [%[inptrs], 232]\n"
+    "fmla v20.4s, v27.4s, v8.4s\n"
+    "fmla v24.4s, v27.4s, v7.4s\n"
+    "fmla v23.4s, v27.4s, v9.4s\n"
+    "ldr q19, [x16, x27]\n"
+    "fmla v1.4s, v26.4s, v3.4s\n"
+    "ldr q28, [x15, x27]\n"
+    "fmla v21.4s, v26.4s, v6.4s\n"
+    "ldr x16, [%[inptrs], 280]\n"
+    "fmla v24.4s, v26.4s, v8.4s\n"
+    "fmla v10.4s, v25.4s, v3.4s\n"
+    "fmla v20.4s, v25.4s, v4.4s\n"
+    "ldr q30, [x16, x27]\n"
+    "fmla v23.4s, v25.4s, v5.4s\n"
+    "add x27, x27, #16\n"
+    "fmla v11.4s, v31.4s, v3.4s\n"
+    "fmla v21.4s, v22.4s, v3.4s\n"
+    "fmla v24.4s, v31.4s, v4.4s\n"
+    "movi v29.16b, #0\n"
+    "fmla v20.4s, v31.4s, v6.4s\n"
+    "fmla v23.4s, v31.4s, v7.4s\n"
+    "fmax v2.4s, v2.4s, v29.4s\n"
+    "fmax v18.4s, v18.4s, v29.4s\n"
+    "fmla v24.4s, v22.4s, v6.4s\n"
+    "fmax v17.4s, v17.4s, v29.4s\n"
+    "fmla v20.4s, v19.4s, v3.4s\n"
+    "fmax v1.4s, v1.4s, v29.4s\n"
+    "str q2, [x20, x28]\n"
+    "fmla v23.4s, v22.4s, v8.4s\n"
+    "fmax v16.4s, v16.4s, v29.4s\n"
+    "ldr x20, [%[outptrs], 8]\n"
+    "fmla v24.4s, v28.4s, v3.4s\n"
+    "fmax v0.4s, v0.4s, v29.4s\n"
+    "str q18, [x20, x28]\n"
+    "fmax v15.4s, v15.4s, v29.4s\n"
+    "str q16, [x21, x28]\n"
+    "fmla v23.4s, v19.4s, v4.4s\n"
+    "fmax v21.4s, v21.4s, v29.4s\n"
+    "ldr x20, [%[outptrs], 16]\n"
+    "fmax v13.4s, v13.4s, v29.4s\n"
+    "ldr x21, [%[outptrs], 40]\n"
+    "str q17, [x20, x28]\n"
+    "fmax v12.4s, v12.4s, v29.4s\n"
+    "str q0, [x21, x28]\n"
+    "fmla v23.4s, v28.4s, v6.4s\n"
+    "str q13, [x22, x28]\n"
+    "fmax v11.4s, v11.4s, v29.4s\n"
+    "fmax v24.4s, v24.4s, v29.4s\n"
+    "ldr x20, [%[outptrs], 24]\n"
+    "fmax v14.4s, v14.4s, v29.4s\n"
+    "ldr x21, [%[outptrs], 48]\n"
+    "str q1, [x20, x28]\n"
+    "fmla v23.4s, v30.4s, v3.4s\n"
+    "str q15, [x21, x28]\n"
+    "fmax v10.4s, v10.4s, v29.4s\n"
+    "str q14, [x23, x28]\n"
+    "fmax v20.4s, v20.4s, v29.4s\n"
+    "ldr x21, [%[outptrs], 56]\n"
+    "ldr x22, [%[outptrs], 72]\n"
+    "ldr x23, [%[outptrs], 104]\n"
+    "fmax v23.4s, v23.4s, v29.4s\n"
+    "str q21, [x21, x28]\n"
+    "str q12, [x22, x28]\n"
+    "str q10, [x23, x28]\n"
+    "ldr x22, [%[outptrs], 80]\n"
+    "ldr x23, [%[outptrs], 112]\n"
+    "str q11, [x22, x28]\n"
+    "str q20, [x23, x28]\n"
+    "ldr x22, [%[outptrs], 88]\n"
+    "ldr x23, [%[outptrs], 120]\n"
+    "str q24, [x22, x28]\n"
+    "str q23, [x23, x28]\n"
+    "add x28, x28, #16\n"
+    "4:\n"
+    "cbz x19, 7f\n"
+    "ldr s25, [%[wbptr]]\n"
+    "mov v2.16b, v25.16b\n"
+    "ldr s22, [%[wbptr], #4]\n"
+    "mov v16.16b, v25.16b\n"
+    "ldr s9, [%[wbptr], #8]\n"
+    "mov v18.16b, v25.16b\n"
+    "ldr s8, [%[wbptr], #12]\n"
+    "mov v13.16b, v25.16b\n"
+    "ldr s19, [%[wbptr], #16]\n"
+    "mov v0.16b, v25.16b\n"
+    "ldr s7, [%[wbptr], #20]\n"
+    "mov v17.16b, v25.16b\n"
+    "ldr s6, [%[wbptr], #24]\n"
+    "mov v14.16b, v25.16b\n"
+    "ldr s5, [%[wbptr], #28]\n"
+    "mov v12.16b, v25.16b\n"
+    "ldr s4, [%[wbptr], #32]\n"
+    "mov v15.16b, v25.16b\n"
+    "ldr s3, [%[wbptr], #36]\n"
+    "ldr x25, [%[inptrs], 0]\n"
+    "ldr x17, [%[inptrs], 48]\n"
+    "ldr x24, [%[inptrs], 96]\n"
+    "ldr x18, [%[inptrs], 144]\n"
+    "subs x19, x19, #1\n"
+    "ldr s27, [x25, x27]\n"
+    "fmla v2.4s, v27.4s, v22.4s\n"
+    "ldr s26, [x17, x27]\n"
+    "fmla v16.4s, v26.4s, v22.4s\n"
+    "ldr s28, [x24, x27]\n"
+    "ldr s27, [x18, x27]\n"
+    "ldr x25, [%[inptrs], 8]\n"
+    "ldr x17, [%[inptrs], 56]\n"
+    "ldr x24, [%[inptrs], 104]\n"
+    "ldr s31, [x25, x27]\n"
+    "fmla v2.4s, v26.4s, v19.4s\n"
+    "ldr s29, [x17, x27]\n"
+    "ldr s21, [x24, x27]\n"
+    "ldr x25, [%[inptrs], 16]\n"
+    "ldr s30, [x25, x27]\n"
+    "fmla v2.4s, v31.4s, v9.4s\n"
+    "beq 6f\n"
+    "5:\n"
+    "mov v1.16b, v25.16b\n"
+    "ldr x17, [%[inptrs], 64]\n"
+    "mov v10.16b, v25.16b\n"
+    "ldr x25, [%[inptrs], 24]\n"
+    "fmla v18.4s, v31.4s, v22.4s\n"
+    "ldr s23, [x17, x27]\n"
+    "fmla v2.4s, v28.4s, v5.4s\n"
+    "ldr x15, [%[inptrs], 192]\n"
+    "fmla v16.4s, v28.4s, v19.4s\n"
+    "ldr x18, [%[inptrs], 152]\n"
+    "fmla v13.4s, v28.4s, v22.4s\n"
+    "ldr s26, [x25, x27]\n"
+    "fmla v18.4s, v29.4s, v19.4s\n"
+    "ldr x24, [%[inptrs], 112]\n"
+    "fmla v2.4s, v29.4s, v7.4s\n"
+    "ldr x17, [%[inptrs], 72]\n"
+    "fmla v16.4s, v29.4s, v9.4s\n"
+    "ldr x25, [%[inptrs], 32]\n"
+    "fmla v0.4s, v29.4s, v22.4s\n"
+    "ldr s28, [x15, x27]\n"
+    "fmla v18.4s, v30.4s, v9.4s\n"
+    "ldr x16, [%[inptrs], 240]\n"
+    "fmla v2.4s, v30.4s, v8.4s\n"
+    "ldr x15, [%[inptrs], 200]\n"
+    "fmla v17.4s, v30.4s, v22.4s\n"
+    "ldr s29, [x18, x27]\n"
+    "fmla v16.4s, v27.4s, v5.4s\n"
+    "ldr x18, [%[inptrs], 160]\n"
+    "fmla v13.4s, v27.4s, v19.4s\n"
+    "ldr x20, [%[outptrs], 0]\n"
+    "fmla v14.4s, v27.4s, v22.4s\n"
+    "ldr s20, [x24, x27]\n"
+    "fmla v2.4s, v21.4s, v4.4s\n"
+    "ldr x24, [%[inptrs], 120]\n"
+    "fmla v16.4s, v21.4s, v7.4s\n"
+    "ldr x21, [%[outptrs], 32]\n"
+    "fmla v18.4s, v21.4s, v5.4s\n"
+    "ldr x22, [%[outptrs], 64]\n"
+    "fmla v13.4s, v21.4s, v9.4s\n"
+    "ldr x23, [%[outptrs], 96]\n"
+    "fmla v0.4s, v21.4s, v19.4s\n"
+    "add %[wbptr], %[wbptr], #40\n"
+    "fmla v12.4s, v21.4s, v22.4s\n"
+    "ldr s24, [x17, x27]\n"
+    "fmla v2.4s, v23.4s, v6.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v16.4s, v23.4s, v8.4s\n"
+    "ldr x17, [%[inptrs], 80]\n"
+    "fmla v18.4s, v23.4s, v7.4s\n"
+    "subs x19, x19, #1\n"
+    "fmla v0.4s, v23.4s, v9.4s\n"
+    "fmla v17.4s, v23.4s, v19.4s\n"
+    "fmla v15.4s, v23.4s, v22.4s\n"
+    "ldr s23, [x25, x27]\n"
+    "fmla v1.4s, v26.4s, v22.4s\n"
+    "ldr x25, [%[inptrs], 40]\n"
+    "fmla v18.4s, v26.4s, v8.4s\n"
+    "fmla v13.4s, v28.4s, v5.4s\n"
+    "fmla v17.4s, v26.4s, v9.4s\n"
+    "ldr s30, [x16, x27]\n"
+    "fmla v14.4s, v28.4s, v19.4s\n"
+    "ldr s26, [x15, x27]\n"
+    "fmla v16.4s, v29.4s, v4.4s\n"
+    "ldr x16, [%[inptrs], 248]\n"
+    "fmla v13.4s, v29.4s, v7.4s\n"
+    "ldr x15, [%[inptrs], 208]\n"
+    "fmla v0.4s, v29.4s, v5.4s\n"
+    "fmla v12.4s, v29.4s, v19.4s\n"
+    "fmla v14.4s, v29.4s, v9.4s\n"
+    "fmla v10.4s, v29.4s, v22.4s\n"
+    "mov v11.16b, v25.16b\n"
+    "fmla v2.4s, v20.4s, v3.4s\n"
+    "fmla v16.4s, v20.4s, v6.4s\n"
+    "fmla v18.4s, v20.4s, v4.4s\n"
+    "fmla v13.4s, v20.4s, v8.4s\n"
+    "fmla v0.4s, v20.4s, v7.4s\n"
+    "fmla v17.4s, v20.4s, v5.4s\n"
+    "fmla v12.4s, v20.4s, v9.4s\n"
+    "fmla v15.4s, v20.4s, v19.4s\n"
+    "fmla v11.4s, v20.4s, v22.4s\n"
+    "mov v21.16b, v25.16b\n"
+    "fmla v18.4s, v24.4s, v6.4s\n"
+    "fmla v0.4s, v24.4s, v8.4s\n"
+    "fmla v1.4s, v24.4s, v19.4s\n"
+    "fmla v17.4s, v24.4s, v7.4s\n"
+    "fmla v14.4s, v30.4s, v5.4s\n"
+    "mov v20.16b, v25.16b\n"
+    "fmla v15.4s, v24.4s, v9.4s\n"
+    "fmla v21.4s, v24.4s, v22.4s\n"
+    "ldr s27, [x18, x27]\n"
+    "fmla v1.4s, v23.4s, v9.4s\n"
+    "ldr x18, [%[inptrs], 168]\n"
+    "fmla v17.4s, v23.4s, v8.4s\n"
+    "ldr s30, [x24, x27]\n"
+    "fmla v13.4s, v26.4s, v4.4s\n"
+    "ldr x24, [%[inptrs], 128]\n"
+    "fmla v14.4s, v26.4s, v7.4s\n"
+    "fmla v12.4s, v26.4s, v5.4s\n"
+    "fmla v10.4s, v26.4s, v19.4s\n"
+    "ldr s31, [x17, x27]\n"
+    "fmla v16.4s, v27.4s, v3.4s\n"
+    "ldr x17, [%[inptrs], 88]\n"
+    "fmla v13.4s, v27.4s, v6.4s\n"
+    "fmla v0.4s, v27.4s, v4.4s\n"
+    "fmla v14.4s, v27.4s, v8.4s\n"
+    "fmla v12.4s, v27.4s, v7.4s\n"
+    "fmla v15.4s, v27.4s, v5.4s\n"
+    "fmla v10.4s, v27.4s, v9.4s\n"
+    "fmla v11.4s, v27.4s, v19.4s\n"
+    "fmla v20.4s, v27.4s, v22.4s\n"
+    "mov v24.16b, v25.16b\n"
+    "mov v23.16b, v25.16b\n"
+    "fmla v18.4s, v30.4s, v3.4s\n"
+    "fmla v0.4s, v30.4s, v6.4s\n"
+    "fmla v17.4s, v30.4s, v4.4s\n"
+    "fmla v12.4s, v30.4s, v8.4s\n"
+    "fmla v15.4s, v30.4s, v7.4s\n"
+    "fmla v1.4s, v30.4s, v5.4s\n"
+    "fmla v11.4s, v30.4s, v9.4s\n"
+    "fmla v21.4s, v30.4s, v19.4s\n"
+    "fmla v24.4s, v30.4s, v22.4s\n"
+    "ldr s25, [x25, x27]\n"
+    "fmla v17.4s, v31.4s, v6.4s\n"
+    "ldr x25, [%[inptrs], 0]\n"
+    "fmla v15.4s, v31.4s, v8.4s\n"
+    "fmla v1.4s, v31.4s, v7.4s\n"
+    "fmla v21.4s, v31.4s, v9.4s\n"
+    "ldr s26, [x16, x27]\n"
+    "fmla v14.4s, v26.4s, v4.4s\n"
+    "ldr x16, [%[inptrs], 256]\n"
+    "fmla v10.4s, v26.4s, v5.4s\n"
+    "ldr s31, [x15, x27]\n"
+    "fmla v1.4s, v25.4s, v8.4s\n"
+    "ldr s29, [x18, x27]\n"
+    "fmla v13.4s, v31.4s, v3.4s\n"
+    "ldr x15, [%[inptrs], 216]\n"
+    "fmla v14.4s, v31.4s, v6.4s\n"
+    "ldr x18, [%[inptrs], 176]\n"
+    "fmla v12.4s, v31.4s, v4.4s\n"
+    "fmla v10.4s, v31.4s, v7.4s\n"
+    "fmla v11.4s, v31.4s, v5.4s\n"
+    "fmla v20.4s, v31.4s, v19.4s\n"
+    "fmla v0.4s, v29.4s, v3.4s\n"
+    "ldr s28, [x24, x27]\n"
+    "fmla v15.4s, v29.4s, v4.4s\n"
+    "ldr x24, [%[inptrs], 136]\n"
+    "fmla v12.4s, v29.4s, v6.4s\n"
+    "fmla v10.4s, v29.4s, v8.4s\n"
+    "fmla v11.4s, v29.4s, v7.4s\n"
+    "fmla v21.4s, v29.4s, v5.4s\n"
+    "fmla v20.4s, v29.4s, v9.4s\n"
+    "fmla v24.4s, v29.4s, v19.4s\n"
+    "fmla v23.4s, v29.4s, v22.4s\n"
+    "ldr s25, [x17, x27]\n"
+    "fmla v17.4s, v28.4s, v3.4s\n"
+    "ldr s29, [x16, x27]\n"
+    "fmla v15.4s, v28.4s, v6.4s\n"
+    "ldr x16, [%[inptrs], 264]\n"
+    "fmla v1.4s, v28.4s, v4.4s\n"
+    "ldr x17, [%[inptrs], 48]\n"
+    "fmla v11.4s, v28.4s, v8.4s\n"
+    "fmla v21.4s, v28.4s, v7.4s\n"
+    "fmla v24.4s, v28.4s, v9.4s\n"
+    "ldr s22, [x15, x27]\n"
+    "fmla v14.4s, v29.4s, v3.4s\n"
+    "ldr x15, [%[inptrs], 224]\n"
+    "fmla v1.4s, v25.4s, v6.4s\n"
+    "fmla v10.4s, v29.4s, v4.4s\n"
+    "fmla v21.4s, v25.4s, v8.4s\n"
+    "ldr s27, [x18, x27]\n"
+    "fmla v20.4s, v29.4s, v5.4s\n"
+    "ldr s26, [x24, x27]\n"
+    "fmla v12.4s, v22.4s, v3.4s\n"
+    "ldr x18, [%[inptrs], 184]\n"
+    "fmla v10.4s, v22.4s, v6.4s\n"
+    "ldr x24, [%[inptrs], 96]\n"
+    "fmla v11.4s, v22.4s, v4.4s\n"
+    "fmla v24.4s, v22.4s, v5.4s\n"
+    "fmla v20.4s, v22.4s, v7.4s\n"
+    "fmla v23.4s, v22.4s, v19.4s\n"
+    "fmla v15.4s, v27.4s, v3.4s\n"
+    "ldr s25, [x16, x27]\n"
+    "fmla v21.4s, v27.4s, v4.4s\n"
+    "ldr s31, [x15, x27]\n"
+    "fmla v11.4s, v27.4s, v6.4s\n"
+    "ldr x16, [%[inptrs], 272]\n"
+    "fmla v20.4s, v27.4s, v8.4s\n"
+    "ldr x15, [%[inptrs], 232]\n"
+    "fmla v24.4s, v27.4s, v7.4s\n"
+    "fmla v23.4s, v27.4s, v9.4s\n"
+    "fmla v1.4s, v26.4s, v3.4s\n"
+    "ldr s22, [x18, x27]\n"
+    "fmla v21.4s, v26.4s, v6.4s\n"
+    "ldr s19, [x16, x27]\n"
+    "fmla v10.4s, v25.4s, v3.4s\n"
+    "ldr x16, [%[inptrs], 280]\n"
+    "fmla v24.4s, v26.4s, v8.4s\n"
+    "ldr s28, [x15, x27]\n"
+    "fmla v20.4s, v25.4s, v4.4s\n"
+    "ldr x18, [%[inptrs], 144]\n"
+    "fmla v23.4s, v25.4s, v5.4s\n"
+    "ldr s30, [x16, x27]\n"
+    "fmla v11.4s, v31.4s, v3.4s\n"
+    "add x27, x27, #4\n"
+    "fmla v24.4s, v31.4s, v4.4s\n"
+    "ldr s27, [x25, x27]\n"
+    "fmla v20.4s, v31.4s, v6.4s\n"
+    "ldr x25, [%[inptrs], 8]\n"
+    "fmla v23.4s, v31.4s, v7.4s\n"
+    "movi v29.16b, #0\n"
+    "fmla v21.4s, v22.4s, v3.4s\n"
+    "ldr s26, [x17, x27]\n"
+    "fmla v24.4s, v22.4s, v6.4s\n"
+    "ldr x17, [%[inptrs], 56]\n"
+    "fmla v20.4s, v19.4s, v3.4s\n"
+    "fmax v2.4s, v2.4s, v29.4s\n"
+    "fmla v23.4s, v22.4s, v8.4s\n"
+    "ldr s25, [%[wbptr]]\n"
+    "fmax v18.4s, v18.4s, v29.4s\n"
+    "ldr s22, [%[wbptr], #4]\n"
+    "str s2, [x20, x28]\n"
+    "fmla v24.4s, v28.4s, v3.4s\n"
+    "fmax v17.4s, v17.4s, v29.4s\n"
+    "ldr s9, [%[wbptr], #8]\n"
+    "fmla v23.4s, v19.4s, v4.4s\n"
+    "ldr s8, [%[wbptr], #12]\n"
+    "fmax v1.4s, v1.4s, v29.4s\n"
+    "ldr s19, [%[wbptr], #16]\n"
+    "fmax v16.4s, v16.4s, v29.4s\n"
+    "ldr x20, [%[outptrs], 8]\n"
+    "fmax v0.4s, v0.4s, v29.4s\n"
+    "fmax v15.4s, v15.4s, v29.4s\n"
+    "str s18, [x20, x28]\n"
+    "fmla v23.4s, v28.4s, v6.4s\n"
+    "str s16, [x21, x28]\n"
+    "fmax v21.4s, v21.4s, v29.4s\n"
+    "fmax v13.4s, v13.4s, v29.4s\n"
+    "ldr s7, [%[wbptr], #20]\n"
+    "fmax v12.4s, v12.4s, v29.4s\n"
+    "ldr s5, [%[wbptr], #28]\n"
+    "fmla v23.4s, v30.4s, v3.4s\n"
+    "ldr s6, [%[wbptr], #24]\n"
+    "str s13, [x22, x28]\n"
+    "fmax v11.4s, v11.4s, v29.4s\n"
+    "fmax v24.4s, v24.4s, v29.4s\n"
+    "ldr s4, [%[wbptr], #32]\n"
+    "fmax v14.4s, v14.4s, v29.4s\n"
+    "ldr s31, [x25, x27]\n"
+    "fmax v10.4s, v10.4s, v29.4s\n"
+    "ldr s3, [%[wbptr], #36]\n"
+    "fmax v20.4s, v20.4s, v29.4s\n"
+    "ldr s28, [x24, x27]\n"
+    "str s14, [x23, x28]\n"
+    "fmax v23.4s, v23.4s, v29.4s\n"
+    "mov v2.16b, v25.16b\n"
+    "ldr s29, [x17, x27]\n"
+    "ldr x20, [%[outptrs], 16]\n"
+    "ldr x21, [%[outptrs], 40]\n"
+    "ldr x22, [%[outptrs], 72]\n"
+    "ldr x23, [%[outptrs], 104]\n"
+    "ldr x25, [%[inptrs], 16]\n"
+    "ldr x24, [%[inptrs], 104]\n"
+    "str s17, [x20, x28]\n"
+    "mov v16.16b, v25.16b\n"
+    "str s0, [x21, x28]\n"
+    "mov v18.16b, v25.16b\n"
+    "str s12, [x22, x28]\n"
+    "mov v13.16b, v25.16b\n"
+    "str s10, [x23, x28]\n"
+    "mov v0.16b, v25.16b\n"
+    "fmla v2.4s, v27.4s, v22.4s\n"
+    "ldr s30, [x25, x27]\n"
+    "fmla v16.4s, v26.4s, v22.4s\n"
+    "ldr x20, [%[outptrs], 24]\n"
+    "mov v17.16b, v25.16b\n"
+    "ldr x21, [%[outptrs], 48]\n"
+    "str s1, [x20, x28]\n"
+    "mov v14.16b, v25.16b\n"
+    "str s15, [x21, x28]\n"
+    "mov v12.16b, v25.16b\n"
+    "mov v15.16b, v25.16b\n"
+    "ldr x21, [%[outptrs], 56]\n"
+    "fmla v2.4s, v26.4s, v19.4s\n"
+    "ldr s27, [x18, x27]\n"
+    "str s21, [x21, x28]\n"
+    "ldr x22, [%[outptrs], 80]\n"
+    "ldr s21, [x24, x27]\n"
+    "ldr x23, [%[outptrs], 112]\n"
+    "str s11, [x22, x28]\n"
+    "fmla v2.4s, v31.4s, v9.4s\n"
+    "str s20, [x23, x28]\n"
+    "ldr x22, [%[outptrs], 88]\n"
+    "ldr x23, [%[outptrs], 120]\n"
+    "str s24, [x22, x28]\n"
+    "str s23, [x23, x28]\n"
+    "add x28, x28, #4\n"
+    "bne 5b\n"
+    "6:\n"
+    "mov v1.16b, v25.16b\n"
+    "ldr x17, [%[inptrs], 64]\n"
+    "mov v10.16b, v25.16b\n"
+    "ldr x25, [%[inptrs], 24]\n"
+    "mov v11.16b, v25.16b\n"
+    "ldr x15, [%[inptrs], 192]\n"
+    "fmla v18.4s, v31.4s, v22.4s\n"
+    "ldr s23, [x17, x27]\n"
+    "fmla v2.4s, v28.4s, v5.4s\n"
+    "ldr x18, [%[inptrs], 152]\n"
+    "fmla v16.4s, v28.4s, v19.4s\n"
+    "ldr x24, [%[inptrs], 112]\n"
+    "fmla v13.4s, v28.4s, v22.4s\n"
+    "ldr s26, [x25, x27]\n"
+    "fmla v18.4s, v29.4s, v19.4s\n"
+    "ldr x17, [%[inptrs], 72]\n"
+    "fmla v2.4s, v29.4s, v7.4s\n"
+    "ldr x25, [%[inptrs], 32]\n"
+    "fmla v16.4s, v29.4s, v9.4s\n"
+    "ldr x16, [%[inptrs], 240]\n"
+    "fmla v0.4s, v29.4s, v22.4s\n"
+    "ldr s28, [x15, x27]\n"
+    "fmla v18.4s, v30.4s, v9.4s\n"
+    "ldr x15, [%[inptrs], 200]\n"
+    "fmla v2.4s, v30.4s, v8.4s\n"
+    "ldr x20, [%[outptrs], 0]\n"
+    "fmla v17.4s, v30.4s, v22.4s\n"
+    "ldr s29, [x18, x27]\n"
+    "fmla v16.4s, v27.4s, v5.4s\n"
+    "ldr x18, [%[inptrs], 160]\n"
+    "fmla v13.4s, v27.4s, v19.4s\n"
+    "ldr x21, [%[outptrs], 32]\n"
+    "fmla v14.4s, v27.4s, v22.4s\n"
+    "ldr s20, [x24, x27]\n"
+    "fmla v2.4s, v21.4s, v4.4s\n"
+    "ldr x24, [%[inptrs], 120]\n"
+    "fmla v16.4s, v21.4s, v7.4s\n"
+    "ldr x22, [%[outptrs], 64]\n"
+    "fmla v18.4s, v21.4s, v5.4s\n"
+    "ldr x23, [%[outptrs], 96]\n"
+    "fmla v13.4s, v21.4s, v9.4s\n"
+    "add %[wbptr], %[wbptr], #40\n"
+    "fmla v0.4s, v21.4s, v19.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v12.4s, v21.4s, v22.4s\n"
+    "ldr s24, [x17, x27]\n"
+    "fmla v2.4s, v23.4s, v6.4s\n"
+    "ldr x17, [%[inptrs], 80]\n"
+    "fmla v16.4s, v23.4s, v8.4s\n"
+    "fmla v18.4s, v23.4s, v7.4s\n"
+    "fmla v0.4s, v23.4s, v9.4s\n"
+    "fmla v17.4s, v23.4s, v19.4s\n"
+    "fmla v15.4s, v23.4s, v22.4s\n"
+    "ldr s23, [x25, x27]\n"
+    "fmla v1.4s, v26.4s, v22.4s\n"
+    "ldr x25, [%[inptrs], 40]\n"
+    "fmla v18.4s, v26.4s, v8.4s\n"
+    "fmla v13.4s, v28.4s, v5.4s\n"
+    "fmla v17.4s, v26.4s, v9.4s\n"
+    "ldr s30, [x16, x27]\n"
+    "fmla v14.4s, v28.4s, v19.4s\n"
+    "ldr s26, [x15, x27]\n"
+    "fmla v16.4s, v29.4s, v4.4s\n"
+    "ldr x16, [%[inptrs], 248]\n"
+    "fmla v13.4s, v29.4s, v7.4s\n"
+    "ldr x15, [%[inptrs], 208]\n"
+    "fmla v0.4s, v29.4s, v5.4s\n"
+    "fmla v12.4s, v29.4s, v19.4s\n"
+    "fmla v14.4s, v29.4s, v9.4s\n"
+    "fmla v10.4s, v29.4s, v22.4s\n"
+    "mov v21.16b, v25.16b\n"
+    "fmla v2.4s, v20.4s, v3.4s\n"
+    "fmla v16.4s, v20.4s, v6.4s\n"
+    "fmla v18.4s, v20.4s, v4.4s\n"
+    "fmla v13.4s, v20.4s, v8.4s\n"
+    "fmla v0.4s, v20.4s, v7.4s\n"
+    "fmla v17.4s, v20.4s, v5.4s\n"
+    "fmla v12.4s, v20.4s, v9.4s\n"
+    "fmla v15.4s, v20.4s, v19.4s\n"
+    "fmla v11.4s, v20.4s, v22.4s\n"
+    "mov v20.16b, v25.16b\n"
+    "fmla v18.4s, v24.4s, v6.4s\n"
+    "fmla v0.4s, v24.4s, v8.4s\n"
+    "fmla v1.4s, v24.4s, v19.4s\n"
+    "fmla v17.4s, v24.4s, v7.4s\n"
+    "fmla v21.4s, v24.4s, v22.4s\n"
+    "fmla v15.4s, v24.4s, v9.4s\n"
+    "ldr s27, [x18, x27]\n"
+    "fmla v14.4s, v30.4s, v5.4s\n"
+    "ldr s30, [x24, x27]\n"
+    "fmla v1.4s, v23.4s, v9.4s\n"
+    "ldr x18, [%[inptrs], 168]\n"
+    "fmla v17.4s, v23.4s, v8.4s\n"
+    "ldr s31, [x17, x27]\n"
+    "fmla v13.4s, v26.4s, v4.4s\n"
+    "ldr x24, [%[inptrs], 128]\n"
+    "fmla v14.4s, v26.4s, v7.4s\n"
+    "ldr x17, [%[inptrs], 88]\n"
+    "fmla v12.4s, v26.4s, v5.4s\n"
+    "fmla v10.4s, v26.4s, v19.4s\n"
+    "mov v24.16b, v25.16b\n"
+    "mov v23.16b, v25.16b\n"
+    "fmla v16.4s, v27.4s, v3.4s\n"
+    "fmla v13.4s, v27.4s, v6.4s\n"
+    "fmla v0.4s, v27.4s, v4.4s\n"
+    "fmla v14.4s, v27.4s, v8.4s\n"
+    "fmla v12.4s, v27.4s, v7.4s\n"
+    "fmla v15.4s, v27.4s, v5.4s\n"
+    "fmla v10.4s, v27.4s, v9.4s\n"
+    "fmla v11.4s, v27.4s, v19.4s\n"
+    "fmla v20.4s, v27.4s, v22.4s\n"
+    "ldr s25, [x25, x27]\n"
+    "fmla v18.4s, v30.4s, v3.4s\n"
+    "fmla v0.4s, v30.4s, v6.4s\n"
+    "fmla v17.4s, v30.4s, v4.4s\n"
+    "fmla v12.4s, v30.4s, v8.4s\n"
+    "fmla v15.4s, v30.4s, v7.4s\n"
+    "fmla v1.4s, v30.4s, v5.4s\n"
+    "fmla v11.4s, v30.4s, v9.4s\n"
+    "fmla v21.4s, v30.4s, v19.4s\n"
+    "fmla v24.4s, v30.4s, v22.4s\n"
+    "ldr s26, [x16, x27]\n"
+    "fmla v17.4s, v31.4s, v6.4s\n"
+    "ldr x16, [%[inptrs], 256]\n"
+    "fmla v15.4s, v31.4s, v8.4s\n"
+    "fmla v1.4s, v31.4s, v7.4s\n"
+    "fmla v21.4s, v31.4s, v9.4s\n"
+    "ldr s31, [x15, x27]\n"
+    "fmla v14.4s, v26.4s, v4.4s\n"
+    "ldr x15, [%[inptrs], 216]\n"
+    "fmla v10.4s, v26.4s, v5.4s\n"
+    "ldr s29, [x18, x27]\n"
+    "fmla v1.4s, v25.4s, v8.4s\n"
+    "ldr s28, [x24, x27]\n"
+    "fmla v13.4s, v31.4s, v3.4s\n"
+    "ldr x18, [%[inptrs], 176]\n"
+    "fmla v14.4s, v31.4s, v6.4s\n"
+    "ldr x24, [%[inptrs], 136]\n"
+    "fmla v12.4s, v31.4s, v4.4s\n"
+    "fmla v10.4s, v31.4s, v7.4s\n"
+    "fmla v11.4s, v31.4s, v5.4s\n"
+    "fmla v20.4s, v31.4s, v19.4s\n"
+    "fmla v0.4s, v29.4s, v3.4s\n"
+    "ldr s25, [x17, x27]\n"
+    "fmla v15.4s, v29.4s, v4.4s\n"
+    "fmla v21.4s, v29.4s, v5.4s\n"
+    "fmla v12.4s, v29.4s, v6.4s\n"
+    "fmla v10.4s, v29.4s, v8.4s\n"
+    "fmla v11.4s, v29.4s, v7.4s\n"
+    "fmla v20.4s, v29.4s, v9.4s\n"
+    "fmla v24.4s, v29.4s, v19.4s\n"
+    "fmla v23.4s, v29.4s, v22.4s\n"
+    "fmla v17.4s, v28.4s, v3.4s\n"
+    "ldr s29, [x16, x27]\n"
+    "fmla v15.4s, v28.4s, v6.4s\n"
+    "ldr s22, [x15, x27]\n"
+    "fmla v1.4s, v28.4s, v4.4s\n"
+    "ldr x16, [%[inptrs], 264]\n"
+    "fmla v11.4s, v28.4s, v8.4s\n"
+    "ldr x15, [%[inptrs], 224]\n"
+    "fmla v21.4s, v28.4s, v7.4s\n"
+    "fmla v24.4s, v28.4s, v9.4s\n"
+    "fmla v14.4s, v29.4s, v3.4s\n"
+    "ldr s27, [x18, x27]\n"
+    "fmla v1.4s, v25.4s, v6.4s\n"
+    "ldr x18, [%[inptrs], 184]\n"
+    "fmla v10.4s, v29.4s, v4.4s\n"
+    "fmla v20.4s, v29.4s, v5.4s\n"
+    "fmla v21.4s, v25.4s, v8.4s\n"
+    "ldr s26, [x24, x27]\n"
+    "fmla v12.4s, v22.4s, v3.4s\n"
+    "ldr s25, [x16, x27]\n"
+    "fmla v11.4s, v22.4s, v4.4s\n"
+    "ldr x16, [%[inptrs], 272]\n"
+    "fmla v10.4s, v22.4s, v6.4s\n"
+    "fmla v20.4s, v22.4s, v7.4s\n"
+    "fmla v24.4s, v22.4s, v5.4s\n"
+    "fmla v23.4s, v22.4s, v19.4s\n"
+    "fmla v15.4s, v27.4s, v3.4s\n"
+    "ldr s31, [x15, x27]\n"
+    "fmla v11.4s, v27.4s, v6.4s\n"
+    "ldr s22, [x18, x27]\n"
+    "fmla v21.4s, v27.4s, v4.4s\n"
+    "ldr x15, [%[inptrs], 232]\n"
+    "fmla v20.4s, v27.4s, v8.4s\n"
+    "fmla v24.4s, v27.4s, v7.4s\n"
+    "fmla v23.4s, v27.4s, v9.4s\n"
+    "ldr s19, [x16, x27]\n"
+    "fmla v1.4s, v26.4s, v3.4s\n"
+    "ldr s28, [x15, x27]\n"
+    "fmla v21.4s, v26.4s, v6.4s\n"
+    "ldr x16, [%[inptrs], 280]\n"
+    "fmla v24.4s, v26.4s, v8.4s\n"
+    "fmla v10.4s, v25.4s, v3.4s\n"
+    "fmla v20.4s, v25.4s, v4.4s\n"
+    "ldr s30, [x16, x27]\n"
+    "fmla v23.4s, v25.4s, v5.4s\n"
+    "add x27, x27, #4\n"
+    "fmla v11.4s, v31.4s, v3.4s\n"
+    "fmla v21.4s, v22.4s, v3.4s\n"
+    "fmla v24.4s, v31.4s, v4.4s\n"
+    "movi v29.16b, #0\n"
+    "fmla v20.4s, v31.4s, v6.4s\n"
+    "fmla v23.4s, v31.4s, v7.4s\n"
+    "fmax v2.4s, v2.4s, v29.4s\n"
+    "fmax v18.4s, v18.4s, v29.4s\n"
+    "fmla v24.4s, v22.4s, v6.4s\n"
+    "fmax v17.4s, v17.4s, v29.4s\n"
+    "fmla v20.4s, v19.4s, v3.4s\n"
+    "fmax v1.4s, v1.4s, v29.4s\n"
+    "str s2, [x20, x28]\n"
+    "fmla v23.4s, v22.4s, v8.4s\n"
+    "fmax v16.4s, v16.4s, v29.4s\n"
+    "ldr x20, [%[outptrs], 8]\n"
+    "fmla v24.4s, v28.4s, v3.4s\n"
+    "fmax v0.4s, v0.4s, v29.4s\n"
+    "str s18, [x20, x28]\n"
+    "fmax v15.4s, v15.4s, v29.4s\n"
+    "str s16, [x21, x28]\n"
+    "fmla v23.4s, v19.4s, v4.4s\n"
+    "fmax v21.4s, v21.4s, v29.4s\n"
+    "ldr x20, [%[outptrs], 16]\n"
+    "fmax v13.4s, v13.4s, v29.4s\n"
+    "ldr x21, [%[outptrs], 40]\n"
+    "str s17, [x20, x28]\n"
+    "fmax v12.4s, v12.4s, v29.4s\n"
+    "str s0, [x21, x28]\n"
+    "fmla v23.4s, v28.4s, v6.4s\n"
+    "str s13, [x22, x28]\n"
+    "fmax v11.4s, v11.4s, v29.4s\n"
+    "fmax v24.4s, v24.4s, v29.4s\n"
+    "ldr x20, [%[outptrs], 24]\n"
+    "fmax v14.4s, v14.4s, v29.4s\n"
+    "ldr x21, [%[outptrs], 48]\n"
+    "str s1, [x20, x28]\n"
+    "fmla v23.4s, v30.4s, v3.4s\n"
+    "str s15, [x21, x28]\n"
+    "fmax v10.4s, v10.4s, v29.4s\n"
+    "str s14, [x23, x28]\n"
+    "fmax v20.4s, v20.4s, v29.4s\n"
+    "ldr x21, [%[outptrs], 56]\n"
+    "ldr x22, [%[outptrs], 72]\n"
+    "ldr x23, [%[outptrs], 104]\n"
+    "fmax v23.4s, v23.4s, v29.4s\n"
+    "str s21, [x21, x28]\n"
+    "str s12, [x22, x28]\n"
+    "str s10, [x23, x28]\n"
+    "ldr x22, [%[outptrs], 80]\n"
+    "ldr x23, [%[outptrs], 112]\n"
+    "str s11, [x22, x28]\n"
+    "str s20, [x23, x28]\n"
+    "ldr x22, [%[outptrs], 88]\n"
+    "ldr x23, [%[outptrs], 120]\n"
+    "str s24, [x22, x28]\n"
+    "str s23, [x23, x28]\n"
+    "add x28, x28, #4\n"
+    "7:\n"
+    : [wbptr] "+r" (weight_bias_ptr)
+    : [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs), [inptrs] "r" (inptrs)
+    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+  );
+}
 
-      "vptr1 .req x5\n"
-      "vptr2 .req x6\n"
-      "vptr3 .req x7\n"
-
-      "wptr1 .req x8\n"
-      "wptr2 .req x9\n"
-
-      // Prepare pointers and strides
-      "add uptr1, %x[uptr0], %x[u_row_stride]\n"
-      "add uptr2,    uptr1 , %x[u_row_stride]\n"
-      "add uptr3,    uptr2 , %x[u_row_stride]\n"
-      "add uptr4,    uptr3 , %x[u_row_stride]\n"
-      "add uptr5,    uptr4 , %x[u_row_stride]\n"
-
-      "add vptr1, %x[vptr0], %x[v_row_stride]\n"
-      "add vptr2,    vptr1 , %x[v_row_stride]\n"
-      "add vptr3,    vptr2 , %x[v_row_stride]\n"
-
-      "add wptr1, %x[wptr0], %x[w_row_stride]\n"
-      "add wptr2,    wptr1 , %x[w_row_stride]\n"
-
-      // Load initial operands
-      "ldr qU16, [%x[uptr0], %x[uvw_col_stride5]]\n"
-      "ldr qW13, [%x[wptr0], %x[uvw_col_stride2]]\n"
-      "subs %x[c4_rem], %x[c4_rem], #1\n"
-      "ldr qU15, [%x[uptr0], %x[uvw_col_stride4]]\n"
-      "ldr qW23, [wptr1, %x[uvw_col_stride2]]\n"
-      "ldr qU14, [%x[uptr0], %x[uvw_col_stride3]]\n"
-      "ldr qW33, [wptr2, %x[uvw_col_stride2]]\n"
-      "ldr qU26, [uptr1, %x[uvw_col_stride5]]\n"
-      "ldr qW12, [%x[wptr0], %x[uvw_col_stride1]]\n"
-      "ldr qU25, [uptr1, %x[uvw_col_stride4]]\n"
-      "ldr qW22, [wptr1, %x[uvw_col_stride1]]\n"
-      "ldr qU36, [uptr2, %x[uvw_col_stride5]]\n"
-      "ldr qW32, [wptr2, %x[uvw_col_stride1]]\n"
-      "ldr qW11, [%x[wptr0]], #0x10\n"
-      "fmul vV14.4s, vU16.4s, vW13.4s\n"
-      "ldr qU24, [uptr1, %x[uvw_col_stride3]]\n"
-      "fmul vV13.4s, vU15.4s, vW13.4s\n"
-      "ldr qW31, [wptr2], #0x10\n"
-      "fmla vV14.4s, vU15.4s, vW12.4s\n"
-      "ldr qW21, [wptr1], #0x10\n"
-      "fmul vV12.4s, vU14.4s, vW13.4s\n"
-      "ldr qU34, [uptr2, %x[uvw_col_stride3]]\n"
-      "fmla vV13.4s, vU14.4s, vW12.4s\n"
-      "ldr qU46, [uptr3, %x[uvw_col_stride5]]\n"
-      "fmla vV14.4s, vU14.4s, vW11.4s\n"
-      "ldr qU45, [uptr3, %x[uvw_col_stride4]]\n"
-      "fmla vV14.4s, vU26.4s, vW23.4s\n"
-      "ldr qU35, [uptr2, %x[uvw_col_stride4]]\n"
-      "fmul vV24.4s, vU26.4s, vW13.4s\n"
-      "ldr qU44, [uptr3, %x[uvw_col_stride3]]\n"
-      "fmla vV13.4s, vU25.4s, vW23.4s\n"
-      "beq 2f\n"  // Single iteration only
-
-      "1:"  // Loop body
-        "fmla vV14.4s, vU25.4s, vW22.4s\n"
-        "prfm pldl1keep, [%x[wptr0], %[prftch]]\n"
-        "fmul vV23.4s, vU25.4s, vW13.4s\n"
-        "prfm pldl1keep, [%x[wptr0], %x[prftch_uvw_col_stride1]]\n"
-        "fmla vV24.4s, vU25.4s, vW12.4s\n"
-        "ldr qU56, [uptr4, %x[uvw_col_stride5]]\n"
-        "fmla vV12.4s, vU24.4s, vW23.4s\n"
-        "prfm pldl1keep, [%x[wptr0], %x[prftch_uvw_col_stride2] ]\n"
-        "fmla vV13.4s, vU24.4s, vW22.4s\n"
-        "prfm pldl1keep, [   wptr1 , %[prftch]]\n"
-        "fmla vV14.4s, vU24.4s, vW21.4s\n"
-        "prfm pldl1keep, [   wptr1 , %x[prftch_uvw_col_stride1]]\n"
-        "fmul vV22.4s, vU24.4s, vW13.4s\n"
-        "prfm pldl1keep, [   wptr1 , %x[prftch_uvw_col_stride2] ]\n"
-        "fmla vV23.4s, vU24.4s, vW12.4s\n"
-        "prfm pldl1keep, [   wptr2 , %x[prftch]]\n"
-        "fmla vV24.4s, vU24.4s, vW11.4s\n"
-        "ldr qU55, [uptr4, %x[uvw_col_stride4]]\n"
-        "fmla vV14.4s, vU36.4s, vW33.4s\n"
-        "prfm pldl1keep, [   wptr2 , %x[prftch_uvw_col_stride1]]\n"
-        "fmla vV24.4s, vU36.4s, vW23.4s\n"
-        "prfm pldl1keep, [   wptr2 , %x[prftch_uvw_col_stride2] ]\n"
-        "fmul vV34.4s, vU36.4s, vW13.4s\n"
-        "ldr qU54, [uptr4, %x[uvw_col_stride3]]\n"
-        "fmla vV13.4s, vU35.4s, vW33.4s\n"
-        "prfm pldl1keep, [   uptr2 , %x[prftch_uvw_col_stride1]]\n"
-        "fmla vV14.4s, vU35.4s, vW32.4s\n"
-        "prfm pldl1keep, [   uptr2 , %x[prftch_uvw_col_stride2] ]\n"
-        "fmla vV23.4s, vU35.4s, vW23.4s\n"
-        "prfm pldl1keep, [   uptr2 , %x[prftch_uvw_col_stride3] ]\n"
-        "fmla vV24.4s, vU35.4s, vW22.4s\n"
-        "prfm pldl1keep, [   uptr2 , %x[prftch_uvw_col_stride4] ]\n"
-        "fmul vV33.4s, vU35.4s, vW13.4s\n"
-        "prfm pldl1keep, [   uptr2 , %x[prftch_uvw_col_stride5] ]\n"
-        "fmla vV34.4s, vU35.4s, vW12.4s\n"
-        "ldr qU66, [uptr5, %x[uvw_col_stride5]]\n"
-        "fmla vV12.4s, vU34.4s, vW33.4s\n"
-        "prfm pldl1keep, [   uptr3 , %[prftch]]\n"
-        "fmla vV13.4s, vU34.4s, vW32.4s\n"
-        "prfm pldl1keep, [   uptr3 , %x[prftch_uvw_col_stride1]]\n"
-        "fmla vV14.4s, vU34.4s, vW31.4s\n"
-        "str qV14, [%x[vptr0], %x[uvw_col_stride3]]\n"
-        "fmla vV22.4s, vU34.4s, vW23.4s\n"
-        "prfm pldl1keep, [   uptr3 , %x[prftch_uvw_col_stride2] ]\n"
-        "fmla vV23.4s, vU34.4s, vW22.4s\n"
-        "prfm pldl1keep, [   uptr3 , %x[prftch_uvw_col_stride3] ]\n"
-        "fmla vV24.4s, vU34.4s, vW21.4s\n"
-        "prfm pldl1keep, [   uptr3 , %x[prftch_uvw_col_stride4] ]\n"
-        "fmul vV32.4s, vU34.4s, vW13.4s\n"
-        "prfm pldl1keep, [   uptr3 , %x[prftch_uvw_col_stride5] ]\n"
-        "fmla vV33.4s, vU34.4s, vW12.4s\n"
-        "prfm pldl1keep, [   uptr4 , %[prftch]]\n"
-        "fmla vV34.4s, vU34.4s, vW11.4s\n"
-        "ldr qU65, [uptr5, %x[uvw_col_stride4]]\n"
-        "fmla vV24.4s, vU46.4s, vW33.4s\n"
-        "prfm pldl1keep, [   uptr4 , %x[prftch_uvw_col_stride1]]\n"
-        "fmla vV34.4s, vU46.4s, vW23.4s\n"
-        "prfm pldl1keep, [   uptr4 , %x[prftch_uvw_col_stride2] ]\n"
-        "fmul vV44.4s, vU46.4s, vW13.4s\n"
-        "ldr qU64, [uptr5, %x[uvw_col_stride3]]\n"
-        "fmla vV23.4s, vU45.4s, vW33.4s\n"
-        "prfm pldl1keep, [   uptr4 , %x[prftch_uvw_col_stride3] ]\n"
-        "fmla vV24.4s, vU45.4s, vW32.4s\n"
-        "prfm pldl1keep, [   uptr4 , %x[prftch_uvw_col_stride4] ]\n"
-        "fmla vV33.4s, vU45.4s, vW23.4s\n"
-        "prfm pldl1keep, [   uptr4 , %x[prftch_uvw_col_stride5] ]\n"
-        "fmla vV34.4s, vU45.4s, vW22.4s\n"
-        "prfm pldl1keep, [   uptr5 , %[prftch]]\n"
-        "fmul vV43.4s, vU45.4s, vW13.4s\n"
-        "prfm pldl1keep, [   uptr5 , %x[prftch_uvw_col_stride1]]\n"
-        "fmla vV44.4s, vU45.4s, vW12.4s\n"
-        "ldr qU13, [%x[uptr0], %x[uvw_col_stride2]]\n"
-        "fmla vV22.4s, vU44.4s, vW33.4s\n"
-        "prfm pldl1keep, [   uptr5 , %x[prftch_uvw_col_stride2] ]\n"
-        "fmla vV23.4s, vU44.4s, vW32.4s\n"
-        "prfm pldl1keep, [   uptr5 , %x[prftch_uvw_col_stride3] ]\n"
-        "fmla vV24.4s, vU44.4s, vW31.4s\n"
-        "str qV24, [vptr1, %x[uvw_col_stride3]]\n"
-        "fmla vV32.4s, vU44.4s, vW23.4s\n"
-        "prfm pldl1keep, [   uptr5 , %x[prftch_uvw_col_stride4] ]\n"
-        "fmla vV33.4s, vU44.4s, vW22.4s\n"
-        "prfm pldl1keep, [   uptr5 , %x[prftch_uvw_col_stride5] ]\n"
-        "fmla vV34.4s, vU44.4s, vW21.4s\n"
-        "prfm pstl1keep, [%x[vptr0], %[prftch]]\n"
-        "fmul vV42.4s, vU44.4s, vW13.4s\n"
-        "prfm pstl1keep, [%x[vptr0], %x[prftch_uvw_col_stride1]]\n"
-        "fmla vV43.4s, vU44.4s, vW12.4s\n"
-        "prfm pstl1keep, [%x[vptr0], %x[prftch_uvw_col_stride2] ]\n"
-        "fmla vV44.4s, vU44.4s, vW11.4s\n"
-        "ldr qU23, [uptr1, %x[uvw_col_stride2]]\n"
-        "fmla vV34.4s, vU56.4s, vW33.4s\n"
-        "prfm pstl1keep, [%x[vptr0], %x[prftch_uvw_col_stride3] ]\n"
-        "fmla vV44.4s, vU56.4s, vW23.4s\n"
-        "ldr qU33, [uptr2, %x[uvw_col_stride2]]\n"
-        "fmla vV33.4s, vU55.4s, vW33.4s\n"
-        "prfm pstl1keep, [   vptr1 , %[prftch]]\n"
-        "fmla vV34.4s, vU55.4s, vW32.4s\n"
-        "prfm pstl1keep, [   vptr1 , %x[prftch_uvw_col_stride1]]\n"
-        "fmla vV43.4s, vU55.4s, vW23.4s\n"
-        "prfm pstl1keep, [   vptr1 , %x[prftch_uvw_col_stride2] ]\n"
-        "fmla vV44.4s, vU55.4s, vW22.4s\n"
-        "ldr qU43, [uptr3, %x[uvw_col_stride2]]\n"
-        "fmla vV32.4s, vU54.4s, vW33.4s\n"
-        "prfm pstl1keep, [   vptr1 , %x[prftch_uvw_col_stride3] ]\n"
-        "fmla vV33.4s, vU54.4s, vW32.4s\n"
-        "prfm pstl1keep, [   vptr2 , %[prftch]]\n"
-        "fmla vV34.4s, vU54.4s, vW31.4s\n"
-        "str qV34, [vptr2, %x[uvw_col_stride3]]\n"
-        "fmla vV42.4s, vU54.4s, vW23.4s\n"
-        "prfm pstl1keep, [   vptr2 , %x[prftch_uvw_col_stride1]]\n"
-        "fmla vV43.4s, vU54.4s, vW22.4s\n"
-        "prfm pstl1keep, [   vptr2 , %x[prftch_uvw_col_stride2] ]\n"
-        "fmla vV44.4s, vU54.4s, vW21.4s\n"
-        "ldr qU53, [uptr4, %x[uvw_col_stride2]]\n"
-        "fmla vV44.4s, vU66.4s, vW33.4s\n"
-        "ldr qU63, [uptr5, %x[uvw_col_stride2]]\n"
-        "fmla vV43.4s, vU65.4s, vW33.4s\n"
-        "prfm pstl1keep, [   vptr2 , %x[prftch_uvw_col_stride3] ]\n"
-        "fmla vV44.4s, vU65.4s, vW32.4s\n"
-        "ldr qU12, [%x[uptr0], %x[uvw_col_stride1]]\n"
-        "fmla vV42.4s, vU64.4s, vW33.4s\n"
-        "prfm pstl1keep, [   vptr3 , %[prftch]]\n"
-        "fmla vV43.4s, vU64.4s, vW32.4s\n"
-        "prfm pstl1keep, [   vptr3 , %x[prftch_uvw_col_stride1]]\n"
-        "fmla vV44.4s, vU64.4s, vW31.4s\n"
-        "str qV44, [vptr3, %x[uvw_col_stride3]]\n"
-        "fmul vV11.4s, vU13.4s, vW13.4s\n"
-        "ldr qU22, [uptr1, %x[uvw_col_stride1]]\n"
-        "fmla vV12.4s, vU13.4s, vW12.4s\n"
-        "prfm pstl1keep, [   vptr3 , %x[prftch_uvw_col_stride2] ]\n"
-        "fmla vV13.4s, vU13.4s, vW11.4s\n"
-        "ldr qU32, [uptr2, %x[uvw_col_stride1]]\n"
-        "fmla vV11.4s, vU23.4s, vW23.4s\n"
-        "prfm pstl1keep, [   vptr3 , %x[prftch_uvw_col_stride3] ]\n"
-        "fmla vV12.4s, vU23.4s, vW22.4s\n"
-        "fmla vV13.4s, vU23.4s, vW21.4s\n"
-        "fmul vV21.4s, vU23.4s, vW13.4s\n"
-        "fmla vV22.4s, vU23.4s, vW12.4s\n"
-        "fmla vV23.4s, vU23.4s, vW11.4s\n"
-        "ldr qU42, [uptr3, %x[uvw_col_stride1]]\n"
-        "fmla vV11.4s, vU33.4s, vW33.4s\n"
-        "fmla vV12.4s, vU33.4s, vW32.4s\n"
-        "fmla vV13.4s, vU33.4s, vW31.4s\n"
-        "str qV13, [%x[vptr0], %x[uvw_col_stride2]]\n"
-        "fmla vV21.4s, vU33.4s, vW23.4s\n"
-        "fmla vV22.4s, vU33.4s, vW22.4s\n"
-        "fmla vV23.4s, vU33.4s, vW21.4s\n"
-        "fmul vV31.4s, vU33.4s, vW13.4s\n"
-        "fmla vV32.4s, vU33.4s, vW12.4s\n"
-        "fmla vV33.4s, vU33.4s, vW11.4s\n"
-        "ldr qU52, [uptr4, %x[uvw_col_stride1]]\n"
-        "fmla vV21.4s, vU43.4s, vW33.4s\n"
-        "fmla vV22.4s, vU43.4s, vW32.4s\n"
-        "fmla vV23.4s, vU43.4s, vW31.4s\n"
-        "str qV23, [vptr1, %x[uvw_col_stride2]]\n"
-        "fmla vV31.4s, vU43.4s, vW23.4s\n"
-        "fmla vV32.4s, vU43.4s, vW22.4s\n"
-        "fmla vV33.4s, vU43.4s, vW21.4s\n"
-        "fmul vV41.4s, vU43.4s, vW13.4s\n"
-        "ldr qW13, [%x[wptr0], %x[uvw_col_stride2]]\n"
-        "fmla vV42.4s, vU43.4s, vW12.4s\n"
-        "fmla vV43.4s, vU43.4s, vW11.4s\n"
-        "ldr qU62, [uptr5, %x[uvw_col_stride1]]\n"
-        "fmla vV31.4s, vU53.4s, vW33.4s\n"
-        "fmla vV32.4s, vU53.4s, vW32.4s\n"
-        "fmla vV33.4s, vU53.4s, vW31.4s\n"
-        "str qV33, [vptr2, %x[uvw_col_stride2]]\n"
-        "fmla vV41.4s, vU53.4s, vW23.4s\n"
-        "ldr qW23, [wptr1, %x[uvw_col_stride2]]\n"
-        "fmla vV42.4s, vU53.4s, vW22.4s\n"
-        "fmla vV43.4s, vU53.4s, vW21.4s\n"
-        "ldr qU11, [%x[uptr0]], #0x10\n"
-        "fmla vV41.4s, vU63.4s, vW33.4s\n"
-        "ldr qW33, [wptr2, %x[uvw_col_stride2]]\n"
-        "fmla vV42.4s, vU63.4s, vW32.4s\n"
-        "prfm pldl1keep, [%x[uptr0], %[prftch]]\n"
-        "fmla vV43.4s, vU63.4s, vW31.4s\n"
-        "str qV43, [vptr3, %x[uvw_col_stride2]]\n"
-        "fmla vV11.4s, vU12.4s, vW12.4s\n"
-        "ldr qU21, [uptr1], #0x10\n"
-        "fmla vV12.4s, vU12.4s, vW11.4s\n"
-        "ldr qU31, [uptr2], #0x10\n"
-        "fmla vV11.4s, vU22.4s, vW22.4s\n"
-        "prfm pldl1keep, [%x[uptr0], %x[prftch_uvw_col_stride1]]\n"
-        "fmla vV12.4s, vU22.4s, vW21.4s\n"
-        "prfm pldl1keep, [%x[uptr0], %x[prftch_uvw_col_stride2] ]\n"
-        "fmla vV21.4s, vU22.4s, vW12.4s\n"
-        "prfm pldl1keep, [%x[uptr0], %x[prftch_uvw_col_stride3] ]\n"
-        "fmla vV22.4s, vU22.4s, vW11.4s\n"
-        "ldr qU41, [uptr3], #0x10\n"
-        "fmla vV11.4s, vU32.4s, vW32.4s\n"
-        "prfm pldl1keep, [%x[uptr0], %x[prftch_uvw_col_stride4] ]\n"
-        "fmla vV12.4s, vU32.4s, vW31.4s\n"
-        "str qV12, [%x[vptr0], %x[uvw_col_stride1]]\n"
-        "fmla vV21.4s, vU32.4s, vW22.4s\n"
-        "prfm pldl1keep, [%x[uptr0], %x[prftch_uvw_col_stride5] ]\n"
-        "fmla vV22.4s, vU32.4s, vW21.4s\n"
-        "prfm pldl1keep, [   uptr1 , %[prftch]]\n"
-        "fmla vV31.4s, vU32.4s, vW12.4s\n"
-        "prfm pldl1keep, [   uptr1 , %x[prftch_uvw_col_stride1]]\n"
-        "fmla vV32.4s, vU32.4s, vW11.4s\n"
-        "ldr qU51, [uptr4], #0x10\n"
-        "fmla vV21.4s, vU42.4s, vW32.4s\n"
-        "prfm pldl1keep, [   uptr1 , %x[prftch_uvw_col_stride2] ]\n"
-        "fmla vV22.4s, vU42.4s, vW31.4s\n"
-        "str qV22, [vptr1, %x[uvw_col_stride1]]\n"
-        "fmla vV31.4s, vU42.4s, vW22.4s\n"
-        "prfm pldl1keep, [   uptr1 , %x[prftch_uvw_col_stride3] ]\n"
-        "fmla vV32.4s, vU42.4s, vW21.4s\n"
-        "subs %x[c4_rem], %x[c4_rem], #1\n"
-        "fmla vV41.4s, vU42.4s, vW12.4s\n"
-        "ldr qW12, [%x[wptr0], %x[uvw_col_stride1]]\n"
-        "fmla vV42.4s, vU42.4s, vW11.4s\n"
-        "ldr qU61, [uptr5], #0x10\n"
-        "fmla vV31.4s, vU52.4s, vW32.4s\n"
-        "prfm pldl1keep, [   uptr1 , %x[prftch_uvw_col_stride4] ]\n"
-        "fmla vV32.4s, vU52.4s, vW31.4s\n"
-        "str qV32, [vptr2, %x[uvw_col_stride1]]\n"
-        "fmla vV41.4s, vU52.4s, vW22.4s\n"
-        "ldr qW22, [wptr1, %x[uvw_col_stride1]]\n"
-        "fmla vV42.4s, vU52.4s, vW21.4s\n"
-        "ldr qU16, [%x[uptr0], %x[uvw_col_stride5]]\n"
-        "fmla vV41.4s, vU62.4s, vW32.4s\n"
-        "ldr qW32, [wptr2, %x[uvw_col_stride1]]\n"
-        "fmla vV42.4s, vU62.4s, vW31.4s\n"
-        "str qV42, [vptr3, %x[uvw_col_stride1]]\n"
-        "fmla vV11.4s, vU11.4s, vW11.4s\n"
-        "ldr qU15, [%x[uptr0], %x[uvw_col_stride4]]\n"
-        "fmla vV11.4s, vU21.4s, vW21.4s\n"
-        "ldr qU14, [%x[uptr0], %x[uvw_col_stride3]]\n"
-        "fmla vV21.4s, vU21.4s, vW11.4s\n"
-        "ldr qU26, [uptr1, %x[uvw_col_stride5]]\n"
-        "fmla vV11.4s, vU31.4s, vW31.4s\n"
-        "str qV11, [%x[vptr0]], #0x10\n"
-        "fmla vV21.4s, vU31.4s, vW21.4s\n"
-        "prfm pldl1keep, [   uptr1 , %x[prftch_uvw_col_stride5] ]\n"
-        "fmla vV31.4s, vU31.4s, vW11.4s\n"
-        "ldr qU25, [uptr1, %x[uvw_col_stride4]]\n"
-        "fmla vV21.4s, vU41.4s, vW31.4s\n"
-        "str qV21, [vptr1], #0x10\n"
-        "fmla vV31.4s, vU41.4s, vW21.4s\n"
-        "prfm pldl1keep, [   uptr2 , %[prftch]]\n"
-        "fmla vV41.4s, vU41.4s, vW11.4s\n"
-        "ldr qW11, [%x[wptr0]], #0x10\n"
-        "fmla vV31.4s, vU51.4s, vW31.4s\n"
-        "str qV31, [vptr2], #0x10\n"
-        "fmla vV41.4s, vU51.4s, vW21.4s\n"
-        "ldr qU36, [uptr2, %x[uvw_col_stride5]]\n"
-        "fmla vV41.4s, vU61.4s, vW31.4s\n"
-        "str qV41, [vptr3], #0x10\n"
-        "fmul vV14.4s, vU16.4s, vW13.4s\n"
-        "ldr qU24, [uptr1, %x[uvw_col_stride3]]\n"
-        "fmul vV13.4s, vU15.4s, vW13.4s\n"
-        "ldr qW31, [wptr2], #0x10\n"
-        "fmla vV14.4s, vU15.4s, vW12.4s\n"
-        "ldr qW21, [wptr1], #0x10\n"
-        "fmul vV12.4s, vU14.4s, vW13.4s\n"
-        "ldr qU34, [uptr2, %x[uvw_col_stride3]]\n"
-        "fmla vV13.4s, vU14.4s, vW12.4s\n"
-        "ldr qU46, [uptr3, %x[uvw_col_stride5]]\n"
-        "fmla vV14.4s, vU14.4s, vW11.4s\n"
-        "ldr qU45, [uptr3, %x[uvw_col_stride4]]\n"
-        "fmla vV14.4s, vU26.4s, vW23.4s\n"
-        "ldr qU35, [uptr2, %x[uvw_col_stride4]]\n"
-        "fmul vV24.4s, vU26.4s, vW13.4s\n"
-        "ldr qU44, [uptr3, %x[uvw_col_stride3]]\n"
-        "fmla vV13.4s, vU25.4s, vW23.4s\n"
-        "bne 1b\n"
-
-      "2:"  // Final iteration
-        "fmla vV14.4s, vU25.4s, vW22.4s\n"
-        "fmul vV23.4s, vU25.4s, vW13.4s\n"
-        "fmla vV24.4s, vU25.4s, vW12.4s\n"
-        "ldr qU56, [uptr4, %x[uvw_col_stride5]]\n"
-        "fmla vV12.4s, vU24.4s, vW23.4s\n"
-        "fmla vV13.4s, vU24.4s, vW22.4s\n"
-        "fmla vV14.4s, vU24.4s, vW21.4s\n"
-        "fmul vV22.4s, vU24.4s, vW13.4s\n"
-        "fmla vV23.4s, vU24.4s, vW12.4s\n"
-        "fmla vV24.4s, vU24.4s, vW11.4s\n"
-        "ldr qU55, [uptr4, %x[uvw_col_stride4]]\n"
-        "fmla vV14.4s, vU36.4s, vW33.4s\n"
-        "fmla vV24.4s, vU36.4s, vW23.4s\n"
-        "fmul vV34.4s, vU36.4s, vW13.4s\n"
-        "ldr qU54, [uptr4, %x[uvw_col_stride3]]\n"
-        "fmla vV13.4s, vU35.4s, vW33.4s\n"
-        "fmla vV14.4s, vU35.4s, vW32.4s\n"
-        "fmla vV23.4s, vU35.4s, vW23.4s\n"
-        "fmla vV24.4s, vU35.4s, vW22.4s\n"
-        "fmul vV33.4s, vU35.4s, vW13.4s\n"
-        "fmla vV34.4s, vU35.4s, vW12.4s\n"
-        "ldr qU66, [uptr5, %x[uvw_col_stride5]]\n"
-        "fmla vV12.4s, vU34.4s, vW33.4s\n"
-        "fmla vV13.4s, vU34.4s, vW32.4s\n"
-        "fmla vV14.4s, vU34.4s, vW31.4s\n"
-        "str qV14, [%x[vptr0], %x[uvw_col_stride3]]\n"
-        "fmla vV22.4s, vU34.4s, vW23.4s\n"
-        "fmla vV23.4s, vU34.4s, vW22.4s\n"
-        "fmla vV24.4s, vU34.4s, vW21.4s\n"
-        "fmul vV32.4s, vU34.4s, vW13.4s\n"
-        "fmla vV33.4s, vU34.4s, vW12.4s\n"
-        "fmla vV34.4s, vU34.4s, vW11.4s\n"
-        "ldr qU65, [uptr5, %x[uvw_col_stride4]]\n"
-        "fmla vV24.4s, vU46.4s, vW33.4s\n"
-        "fmla vV34.4s, vU46.4s, vW23.4s\n"
-        "fmul vV44.4s, vU46.4s, vW13.4s\n"
-        "ldr qU64, [uptr5, %x[uvw_col_stride3]]\n"
-        "fmla vV23.4s, vU45.4s, vW33.4s\n"
-        "fmla vV24.4s, vU45.4s, vW32.4s\n"
-        "fmla vV33.4s, vU45.4s, vW23.4s\n"
-        "fmla vV34.4s, vU45.4s, vW22.4s\n"
-        "fmul vV43.4s, vU45.4s, vW13.4s\n"
-        "fmla vV44.4s, vU45.4s, vW12.4s\n"
-        "ldr qU13, [%x[uptr0], %x[uvw_col_stride2]]\n"
-        "fmla vV22.4s, vU44.4s, vW33.4s\n"
-        "fmla vV23.4s, vU44.4s, vW32.4s\n"
-        "fmla vV24.4s, vU44.4s, vW31.4s\n"
-        "str qV24, [vptr1, %x[uvw_col_stride3]]\n"
-        "fmla vV32.4s, vU44.4s, vW23.4s\n"
-        "fmla vV33.4s, vU44.4s, vW22.4s\n"
-        "fmla vV34.4s, vU44.4s, vW21.4s\n"
-        "fmul vV42.4s, vU44.4s, vW13.4s\n"
-        "fmla vV43.4s, vU44.4s, vW12.4s\n"
-        "fmla vV44.4s, vU44.4s, vW11.4s\n"
-        "ldr qU23, [uptr1, %x[uvw_col_stride2]]\n"
-        "fmla vV34.4s, vU56.4s, vW33.4s\n"
-        "fmla vV44.4s, vU56.4s, vW23.4s\n"
-        "ldr qU33, [uptr2, %x[uvw_col_stride2]]\n"
-        "fmla vV33.4s, vU55.4s, vW33.4s\n"
-        "fmla vV34.4s, vU55.4s, vW32.4s\n"
-        "fmla vV43.4s, vU55.4s, vW23.4s\n"
-        "fmla vV44.4s, vU55.4s, vW22.4s\n"
-        "ldr qU43, [uptr3, %x[uvw_col_stride2]]\n"
-        "fmla vV32.4s, vU54.4s, vW33.4s\n"
-        "fmla vV33.4s, vU54.4s, vW32.4s\n"
-        "fmla vV34.4s, vU54.4s, vW31.4s\n"
-        "str qV34, [vptr2, %x[uvw_col_stride3]]\n"
-        "fmla vV42.4s, vU54.4s, vW23.4s\n"
-        "fmla vV43.4s, vU54.4s, vW22.4s\n"
-        "fmla vV44.4s, vU54.4s, vW21.4s\n"
-        "ldr qU53, [uptr4, %x[uvw_col_stride2]]\n"
-        "fmla vV44.4s, vU66.4s, vW33.4s\n"
-        "ldr qU63, [uptr5, %x[uvw_col_stride2]]\n"
-        "fmla vV43.4s, vU65.4s, vW33.4s\n"
-        "fmla vV44.4s, vU65.4s, vW32.4s\n"
-        "ldr qU12, [%x[uptr0], %x[uvw_col_stride1]]\n"
-        "fmla vV42.4s, vU64.4s, vW33.4s\n"
-        "fmla vV43.4s, vU64.4s, vW32.4s\n"
-        "fmla vV44.4s, vU64.4s, vW31.4s\n"
-        "str qV44, [vptr3, %x[uvw_col_stride3]]\n"
-        "fmul vV11.4s, vU13.4s, vW13.4s\n"
-        "ldr qU22, [uptr1, %x[uvw_col_stride1]]\n"
-        "fmla vV12.4s, vU13.4s, vW12.4s\n"
-        "fmla vV13.4s, vU13.4s, vW11.4s\n"
-        "ldr qU32, [uptr2, %x[uvw_col_stride1]]\n"
-        "fmla vV11.4s, vU23.4s, vW23.4s\n"
-        "fmla vV12.4s, vU23.4s, vW22.4s\n"
-        "fmla vV13.4s, vU23.4s, vW21.4s\n"
-        "fmul vV21.4s, vU23.4s, vW13.4s\n"
-        "fmla vV22.4s, vU23.4s, vW12.4s\n"
-        "fmla vV23.4s, vU23.4s, vW11.4s\n"
-        "ldr qU42, [uptr3, %x[uvw_col_stride1]]\n"
-        "fmla vV11.4s, vU33.4s, vW33.4s\n"
-        "fmla vV12.4s, vU33.4s, vW32.4s\n"
-        "fmla vV13.4s, vU33.4s, vW31.4s\n"
-        "str qV13, [%x[vptr0], %x[uvw_col_stride2]]\n"
-        "fmla vV21.4s, vU33.4s, vW23.4s\n"
-        "fmla vV22.4s, vU33.4s, vW22.4s\n"
-        "fmla vV23.4s, vU33.4s, vW21.4s\n"
-        "fmul vV31.4s, vU33.4s, vW13.4s\n"
-        "fmla vV32.4s, vU33.4s, vW12.4s\n"
-        "fmla vV33.4s, vU33.4s, vW11.4s\n"
-        "ldr qU52, [uptr4, %x[uvw_col_stride1]]\n"
-        "fmla vV21.4s, vU43.4s, vW33.4s\n"
-        "fmla vV22.4s, vU43.4s, vW32.4s\n"
-        "fmla vV23.4s, vU43.4s, vW31.4s\n"
-        "str qV23, [vptr1, %x[uvw_col_stride2]]\n"
-        "fmla vV31.4s, vU43.4s, vW23.4s\n"
-        "fmla vV32.4s, vU43.4s, vW22.4s\n"
-        "fmla vV33.4s, vU43.4s, vW21.4s\n"
-        "fmul vV41.4s, vU43.4s, vW13.4s\n"
-        "fmla vV42.4s, vU43.4s, vW12.4s\n"
-        "fmla vV43.4s, vU43.4s, vW11.4s\n"
-        "ldr qU62, [uptr5, %x[uvw_col_stride1]]\n"
-        "fmla vV31.4s, vU53.4s, vW33.4s\n"
-        "fmla vV32.4s, vU53.4s, vW32.4s\n"
-        "fmla vV33.4s, vU53.4s, vW31.4s\n"
-        "str qV33, [vptr2, %x[uvw_col_stride2]]\n"
-        "fmla vV41.4s, vU53.4s, vW23.4s\n"
-        "fmla vV42.4s, vU53.4s, vW22.4s\n"
-        "fmla vV43.4s, vU53.4s, vW21.4s\n"
-        "ldr qU11, [%x[uptr0]], #0x10\n"
-        "fmla vV41.4s, vU63.4s, vW33.4s\n"
-        "fmla vV42.4s, vU63.4s, vW32.4s\n"
-        "fmla vV43.4s, vU63.4s, vW31.4s\n"
-        "str qV43, [vptr3, %x[uvw_col_stride2]]\n"
-        "fmla vV11.4s, vU12.4s, vW12.4s\n"
-        "ldr qU21, [uptr1], #0x10\n"
-        "fmla vV12.4s, vU12.4s, vW11.4s\n"
-        "ldr qU31, [uptr2], #0x10\n"
-        "fmla vV11.4s, vU22.4s, vW22.4s\n"
-        "fmla vV12.4s, vU22.4s, vW21.4s\n"
-        "fmla vV21.4s, vU22.4s, vW12.4s\n"
-        "fmla vV22.4s, vU22.4s, vW11.4s\n"
-        "ldr qU41, [uptr3], #0x10\n"
-        "fmla vV11.4s, vU32.4s, vW32.4s\n"
-        "fmla vV12.4s, vU32.4s, vW31.4s\n"
-        "str qV12, [%x[vptr0], %x[uvw_col_stride1]]\n"
-        "fmla vV21.4s, vU32.4s, vW22.4s\n"
-        "fmla vV22.4s, vU32.4s, vW21.4s\n"
-        "fmla vV31.4s, vU32.4s, vW12.4s\n"
-        "fmla vV32.4s, vU32.4s, vW11.4s\n"
-        "ldr qU51, [uptr4], #0x10\n"
-        "fmla vV21.4s, vU42.4s, vW32.4s\n"
-        "fmla vV22.4s, vU42.4s, vW31.4s\n"
-        "str qV22, [vptr1, %x[uvw_col_stride1]]\n"
-        "fmla vV31.4s, vU42.4s, vW22.4s\n"
-        "fmla vV32.4s, vU42.4s, vW21.4s\n"
-        "subs %x[c4_rem], %x[c4_rem], #1\n"
-        "fmla vV41.4s, vU42.4s, vW12.4s\n"
-        "fmla vV42.4s, vU42.4s, vW11.4s\n"
-        "ldr qU61, [uptr5], #0x10\n"
-        "fmla vV31.4s, vU52.4s, vW32.4s\n"
-        "fmla vV32.4s, vU52.4s, vW31.4s\n"
-        "str qV32, [vptr2, %x[uvw_col_stride1]]\n"
-        "fmla vV41.4s, vU52.4s, vW22.4s\n"
-        "fmla vV42.4s, vU52.4s, vW21.4s\n"
-        "fmla vV41.4s, vU62.4s, vW32.4s\n"
-        "fmla vV42.4s, vU62.4s, vW31.4s\n"
-        "str qV42, [vptr3, %x[uvw_col_stride1]]\n"
-        "fmla vV11.4s, vU11.4s, vW11.4s\n"
-        "fmla vV11.4s, vU21.4s, vW21.4s\n"
-        "fmla vV21.4s, vU21.4s, vW11.4s\n"
-        "fmla vV11.4s, vU31.4s, vW31.4s\n"
-        "str qV11, [%x[vptr0]], #0x10\n"
-        "fmla vV21.4s, vU31.4s, vW21.4s\n"
-        "fmla vV31.4s, vU31.4s, vW11.4s\n"
-        "fmla vV21.4s, vU41.4s, vW31.4s\n"
-        "str qV21, [vptr1], #0x10\n"
-        "fmla vV31.4s, vU41.4s, vW21.4s\n"
-        "fmla vV41.4s, vU41.4s, vW11.4s\n"
-        "fmla vV31.4s, vU51.4s, vW31.4s\n"
-        "str qV31, [vptr2], #0x10\n"
-        "fmla vV41.4s, vU51.4s, vW21.4s\n"
-        "fmla vV41.4s, vU61.4s, vW31.4s\n"
-        "str qV41, [vptr3], #0x10\n"
-
-      ".unreq qW22\n" ".unreq qU64\n" ".unreq qU35\n" ".unreq qV41\n"
-      ".unreq qU34\n" ".unreq qU21\n" ".unreq qV43\n" ".unreq qW21\n"
-      ".unreq qU24\n" ".unreq qU54\n" ".unreq qV31\n" ".unreq qV12\n"
-      ".unreq qU61\n" ".unreq qU26\n" ".unreq qV32\n"
-      ".unreq qU36\n" ".unreq qU51\n" ".unreq qU66\n" ".unreq qU12\n"
-      ".unreq qV14\n" ".unreq qV11\n" ".unreq qU65\n"
-      ".unreq qU15\n" ".unreq qU22\n" ".unreq qU45\n"
-      ".unreq qV22\n" ".unreq qU14\n"
-      ".unreq qU44\n" ".unreq qU43\n" ".unreq qU11\n"
-      ".unreq qV24\n" ".unreq qV42\n" ".unreq qW31\n" ".unreq qW13\n"
-      ".unreq qU33\n" ".unreq qU62\n" ".unreq qU25\n" ".unreq qU56\n"
-      ".unreq qW33\n"
-      ".unreq qU42\n" ".unreq qU16\n" ".unreq qV44\n"
-      ".unreq qU63\n" ".unreq qU31\n" ".unreq qV34\n"
-      ".unreq qW11\n" ".unreq qU41\n" ".unreq qV13\n" ".unreq qV33\n"
-      ".unreq qU46\n" ".unreq qU32\n" ".unreq qU13\n"
-      ".unreq qW23\n" ".unreq qV23\n" ".unreq qV21\n" ".unreq qU55\n"
-      ".unreq qW12\n" ".unreq qW32\n" ".unreq qU23\n" ".unreq qU52\n"
-      ".unreq qU53\n" ".unreq vW22\n"
-      ".unreq vU64\n" ".unreq vU35\n" ".unreq vV41\n"
-      ".unreq vU34\n" ".unreq vU21\n" ".unreq vV43\n" ".unreq vW21\n"
-      ".unreq vU24\n" ".unreq vU54\n" ".unreq vV31\n"
-      ".unreq vV12\n" ".unreq vU61\n"
-      ".unreq vU26\n" ".unreq vV32\n"
-      ".unreq vU36\n" ".unreq vU51\n" ".unreq vU66\n" ".unreq vU12\n"
-      ".unreq vV14\n" ".unreq vV11\n" ".unreq vU65\n"
-      ".unreq vU15\n" ".unreq vU22\n" ".unreq vU45\n"
-      ".unreq vV22\n" ".unreq vU14\n"
-      ".unreq vU44\n" ".unreq vU43\n" ".unreq vU11\n"
-      ".unreq vV24\n" ".unreq vV42\n" ".unreq vW31\n" ".unreq vW13\n"
-      ".unreq vU33\n" ".unreq vU62\n" ".unreq vU25\n" ".unreq vU56\n"
-      ".unreq vW33\n" ".unreq vU42\n" ".unreq vU16\n" ".unreq vV44\n"
-      ".unreq vU63\n" ".unreq vU31\n" ".unreq vV34\n" ".unreq vW11\n"
-      ".unreq vU41\n" ".unreq vV13\n" ".unreq vV33\n"
-      ".unreq vU46\n" ".unreq vU32\n" ".unreq vU13\n" ".unreq vW23\n"
-      ".unreq vV23\n" ".unreq vV21\n" ".unreq vU55\n" ".unreq vW12\n"
-      ".unreq vW32\n" ".unreq vU23\n" ".unreq vU52\n" ".unreq vU53\n"
-      : [uptr0] "+r" (uptr0), [vptr0] "+r" (vptr0), [wptr0] "+r" (wptr0),
-        [c4_rem] "+r" (c4_rem)
-      : [u_row_stride] "r" (in_row_stride * sizeof(float)),
-        [v_row_stride] "r" (out_row_stride * sizeof(float)),
-        [w_row_stride] "r" (weight_row_stride * sizeof(float)),
-        [uvw_col_stride1] "r" (1 * in_col_stride * sizeof(float)),
-        [uvw_col_stride2] "r" (2 * in_col_stride * sizeof(float)),
-        [uvw_col_stride3] "r" (3 * in_col_stride * sizeof(float)),
-        [uvw_col_stride4] "r" (4 * in_col_stride * sizeof(float)),
-        [uvw_col_stride5] "r" (5 * in_col_stride * sizeof(float)),
-        [prftch] "i" (prefetch_depth * sizeof(float)),
-        [prftch_uvw_col_stride1] "r" ((prefetch_depth + 1 * in_col_stride) * sizeof(float)),
-        [prftch_uvw_col_stride2] "r" ((prefetch_depth + 2 * in_col_stride) * sizeof(float)),
-        [prftch_uvw_col_stride3] "r" ((prefetch_depth + 3 * in_col_stride) * sizeof(float)),
-        [prftch_uvw_col_stride4] "r" ((prefetch_depth + 4 * in_col_stride) * sizeof(float)),
-        [prftch_uvw_col_stride5] "r" ((prefetch_depth + 5 * in_col_stride) * sizeof(float))
-      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
-        "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
-        "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x0",
-        "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
-    );
-  }
-  else if (channels_remaining >= 4)
-  {
-    int c4_rem = channels_remaining / 4;
-    channels_remaining %= 4;
-
-    asm volatile (
-      "qW22 .req q0\n" "vW22 .req v0\n"
-      "qU64 .req q1\n" "qU35 .req q1\n" "qV41 .req q1\n"
-      "vU64 .req v1\n" "vU35 .req v1\n" "vV41 .req v1\n"
-      "qU34 .req q2\n" "qU21 .req q2\n" "qV43 .req q2\n"
-      "vU34 .req v2\n" "vU21 .req v2\n" "vV43 .req v2\n"
-      "qW21 .req q3\n" "vW21 .req v3\n"
-      "qU24 .req q4\n" "qU54 .req q4\n" "qV31 .req q4\n"
-      "vU24 .req v4\n" "vU54 .req v4\n" "vV31 .req v4\n"
-      "qV12 .req q5\n" "qU61 .req q5\n" "vV12 .req v5\n" "vU61 .req v5\n"
-      "qU26 .req q6\n" "qV32 .req q6\n" "vU26 .req v6\n" "vV32 .req v6\n"
-      "qU36 .req q7\n" "qU51 .req q7\n" "qU66 .req q7\n" "qU12 .req q7\n"
-      "vU36 .req v7\n" "vU51 .req v7\n" "vU66 .req v7\n" "vU12 .req v7\n"
-      "qV14 .req q8\n" "qV11 .req q8\n" "qU65 .req q8\n"
-      "vV14 .req v8\n" "vV11 .req v8\n" "vU65 .req v8\n"
-      "qU15 .req q9\n" "qU22 .req q9\n" "qU45 .req q9\n"
-      "vU15 .req v9\n" "vU22 .req v9\n" "vU45 .req v9\n"
-      "qV22 .req q10\n" "qU14 .req q10\n" "vV22 .req v10\n" "vU14 .req v10\n"
-      "qU44 .req q11\n" "qU43 .req q11\n" "qU11 .req q11\n"
-      "vU44 .req v11\n" "vU43 .req v11\n" "vU11 .req v11\n"
-      "qV24 .req q12\n" "qV42 .req q12\n" "vV24 .req v12\n" "vV42 .req v12\n"
-      "qW31 .req q13\n" "vW31 .req v13\n" "qW13 .req q14\n" "vW13 .req v14\n"
-      "qU33 .req q15\n" "qU62 .req q15\n" "qU25 .req q15\n" "qU56 .req q15\n"
-      "vU33 .req v15\n" "vU62 .req v15\n" "vU25 .req v15\n" "vU56 .req v15\n"
-      "qW33 .req q16\n" "vW33 .req v16\n"
-      "qU42 .req q17\n" "qU16 .req q17\n" "qV44 .req q17\n"
-      "vU42 .req v17\n" "vU16 .req v17\n" "vV44 .req v17\n"
-      "qU63 .req q18\n" "qU31 .req q18\n" "qV34 .req q18\n"
-      "vU63 .req v18\n" "vU31 .req v18\n" "vV34 .req v18\n"
-      "qW11 .req q19\n" "vW11 .req v19\n" "qU41 .req q20\n" "qV13 .req q20\n"
-      "vU41 .req v20\n" "vV13 .req v20\n" "qV33 .req q21\n" "vV33 .req v21\n"
-      "qU46 .req q22\n" "qU32 .req q22\n" "qU13 .req q22\n"
-      "vU46 .req v22\n" "vU32 .req v22\n" "vU13 .req v22\n" "qW23 .req q23\n"
-      "vW23 .req v23\n" "qV23 .req q24\n" "vV23 .req v24\n"
-      "qV21 .req q25\n" "qU55 .req q25\n" "vV21 .req v25\n" "vU55 .req v25\n"
-      "qW12 .req q26\n" "vW12 .req v26\n" "qW32 .req q27\n" "vW32 .req v27\n"
-      "qU23 .req q28\n" "qU52 .req q28\n"
-      "vU23 .req v28\n" "vU52 .req v28\n" "qU53 .req q29\n" "vU53 .req v29\n"
-
-      "uptr1 .req x0\n"
-      "uptr2 .req x1\n"
-      "uptr3 .req x2\n"
-      "uptr4 .req x3\n"
-      "uptr5 .req x4\n"
-
-      "vptr1 .req x5\n"
-      "vptr2 .req x6\n"
-      "vptr3 .req x7\n"
-
-      "wptr1 .req x8\n"
-      "wptr2 .req x9\n"
-
-      "u_col_stride2 .req x10\n"
-      "u_col_stride3 .req x11\n"
-      "u_col_stride4 .req x12\n"
-      "u_col_stride5 .req x13\n"
-
-      "v_col_stride2 .req x14\n"
-      "v_col_stride3 .req x15\n"
-
-      "w_col_stride2 .req x16\n"
-
-      // Prepare pointers and strides
-      "add uptr1, %x[uptr0], %x[u_row_stride]\n"
-      "add uptr2,    uptr1 , %x[u_row_stride]\n"
-      "add uptr3,    uptr2 , %x[u_row_stride]\n"
-      "add uptr4,    uptr3 , %x[u_row_stride]\n"
-      "add uptr5,    uptr4 , %x[u_row_stride]\n"
-
-      "add vptr1, %x[vptr0], %x[v_row_stride]\n"
-      "add vptr2,    vptr1 , %x[v_row_stride]\n"
-      "add vptr3,    vptr2 , %x[v_row_stride]\n"
-
-      "add wptr1, %x[wptr0], %x[w_row_stride]\n"
-      "add wptr2,    wptr1 , %x[w_row_stride]\n"
-
-      "add u_col_stride2, %x[u_col_stride1], %x[u_col_stride1]\n"
-      "add u_col_stride3,    u_col_stride2 , %x[u_col_stride1]\n"
-      "add u_col_stride4,    u_col_stride3 , %x[u_col_stride1]\n"
-      "add u_col_stride5,    u_col_stride4 , %x[u_col_stride1]\n"
-
-      "add v_col_stride2, %x[v_col_stride1], %x[v_col_stride1]\n"
-      "add v_col_stride3,    v_col_stride2 , %x[v_col_stride1]\n"
-
-      "add w_col_stride2, %x[w_col_stride1], %x[w_col_stride1]\n"
-
-      // Load initial operands
-      "ldr qU16, [%x[uptr0], u_col_stride5]\n"
-      "ldr qW13, [%x[wptr0], w_col_stride2]\n"
-      "subs %x[c4_rem], %x[c4_rem], #1\n"
-      "ldr qU15, [%x[uptr0], u_col_stride4]\n"
-      "ldr qW23, [wptr1, w_col_stride2]\n"
-      "ldr qU14, [%x[uptr0], u_col_stride3]\n"
-      "ldr qW33, [wptr2, w_col_stride2]\n"
-      "ldr qU26, [uptr1, u_col_stride5]\n"
-      "ldr qW12, [%x[wptr0], %x[w_col_stride1]]\n"
-      "ldr qU25, [uptr1, u_col_stride4]\n"
-      "ldr qW22, [wptr1, %x[w_col_stride1]]\n"
-      "ldr qU36, [uptr2, u_col_stride5]\n"
-      "ldr qW32, [wptr2, %x[w_col_stride1]]\n"
-      "ldr qW11, [%x[wptr0]], #0x10\n"
-      "fmul vV14.4s, vU16.4s, vW13.4s\n"
-      "ldr qU24, [uptr1, u_col_stride3]\n"
-      "fmul vV13.4s, vU15.4s, vW13.4s\n"
-      "ldr qW31, [wptr2], #0x10\n"
-      "fmla vV14.4s, vU15.4s, vW12.4s\n"
-      "ldr qW21, [wptr1], #0x10\n"
-      "fmul vV12.4s, vU14.4s, vW13.4s\n"
-      "ldr qU34, [uptr2, u_col_stride3]\n"
-      "fmla vV13.4s, vU14.4s, vW12.4s\n"
-      "ldr qU46, [uptr3, u_col_stride5]\n"
-      "fmla vV14.4s, vU14.4s, vW11.4s\n"
-      "ldr qU45, [uptr3, u_col_stride4]\n"
-      "fmla vV14.4s, vU26.4s, vW23.4s\n"
-      "ldr qU35, [uptr2, u_col_stride4]\n"
-      "fmul vV24.4s, vU26.4s, vW13.4s\n"
-      "ldr qU44, [uptr3, u_col_stride3]\n"
-      "fmla vV13.4s, vU25.4s, vW23.4s\n"
-      "beq 2f\n"  // Single iteration only
-
-      "1:"  // Loop body
-        "fmla vV14.4s, vU25.4s, vW22.4s\n"
-        "prfm pldl1keep, [%x[wptr0]]\n"
-        "fmul vV23.4s, vU25.4s, vW13.4s\n"
-        "prfm pldl1keep, [%x[wptr0], %x[w_col_stride1]]\n"
-        "fmla vV24.4s, vU25.4s, vW12.4s\n"
-        "ldr qU56, [uptr4, u_col_stride5]\n"
-        "fmla vV12.4s, vU24.4s, vW23.4s\n"
-        "prfm pldl1keep, [%x[wptr0],    w_col_stride2 ]\n"
-        "fmla vV13.4s, vU24.4s, vW22.4s\n"
-        "prfm pldl1keep, [   wptr1 ]\n"
-        "fmla vV14.4s, vU24.4s, vW21.4s\n"
-        "prfm pldl1keep, [   wptr1 , %x[w_col_stride1]]\n"
-        "fmul vV22.4s, vU24.4s, vW13.4s\n"
-        "prfm pldl1keep, [   wptr1 ,    w_col_stride2 ]\n"
-        "fmla vV23.4s, vU24.4s, vW12.4s\n"
-        "prfm pldl1keep, [   wptr2 ]\n"
-        "fmla vV24.4s, vU24.4s, vW11.4s\n"
-        "ldr qU55, [uptr4, u_col_stride4]\n"
-        "fmla vV14.4s, vU36.4s, vW33.4s\n"
-        "prfm pldl1keep, [   wptr2 , %x[w_col_stride1]]\n"
-        "fmla vV24.4s, vU36.4s, vW23.4s\n"
-        "prfm pldl1keep, [   wptr2 ,    w_col_stride2 ]\n"
-        "fmul vV34.4s, vU36.4s, vW13.4s\n"
-        "ldr qU54, [uptr4, u_col_stride3]\n"
-        "fmla vV13.4s, vU35.4s, vW33.4s\n"
-        "prfm pldl1keep, [   uptr2 , %x[u_col_stride1]]\n"
-        "fmla vV14.4s, vU35.4s, vW32.4s\n"
-        "prfm pldl1keep, [   uptr2 ,    u_col_stride2 ]\n"
-        "fmla vV23.4s, vU35.4s, vW23.4s\n"
-        "prfm pldl1keep, [   uptr2 ,    u_col_stride3 ]\n"
-        "fmla vV24.4s, vU35.4s, vW22.4s\n"
-        "prfm pldl1keep, [   uptr2 ,    u_col_stride4 ]\n"
-        "fmul vV33.4s, vU35.4s, vW13.4s\n"
-        "prfm pldl1keep, [   uptr2 ,    u_col_stride5 ]\n"
-        "fmla vV34.4s, vU35.4s, vW12.4s\n"
-        "ldr qU66, [uptr5, u_col_stride5]\n"
-        "fmla vV12.4s, vU34.4s, vW33.4s\n"
-        "prfm pldl1keep, [   uptr3 ]\n"
-        "fmla vV13.4s, vU34.4s, vW32.4s\n"
-        "prfm pldl1keep, [   uptr3 , %x[u_col_stride1]]\n"
-        "fmla vV14.4s, vU34.4s, vW31.4s\n"
-        "str qV14, [%x[vptr0], v_col_stride3]\n"
-        "fmla vV22.4s, vU34.4s, vW23.4s\n"
-        "prfm pldl1keep, [   uptr3 ,    u_col_stride2 ]\n"
-        "fmla vV23.4s, vU34.4s, vW22.4s\n"
-        "prfm pldl1keep, [   uptr3 ,    u_col_stride3 ]\n"
-        "fmla vV24.4s, vU34.4s, vW21.4s\n"
-        "prfm pldl1keep, [   uptr3 ,    u_col_stride4 ]\n"
-        "fmul vV32.4s, vU34.4s, vW13.4s\n"
-        "prfm pldl1keep, [   uptr3 ,    u_col_stride5 ]\n"
-        "fmla vV33.4s, vU34.4s, vW12.4s\n"
-        "prfm pldl1keep, [   uptr4 ]\n"
-        "fmla vV34.4s, vU34.4s, vW11.4s\n"
-        "ldr qU65, [uptr5, u_col_stride4]\n"
-        "fmla vV24.4s, vU46.4s, vW33.4s\n"
-        "prfm pldl1keep, [   uptr4 , %x[u_col_stride1]]\n"
-        "fmla vV34.4s, vU46.4s, vW23.4s\n"
-        "prfm pldl1keep, [   uptr4 ,    u_col_stride2 ]\n"
-        "fmul vV44.4s, vU46.4s, vW13.4s\n"
-        "ldr qU64, [uptr5, u_col_stride3]\n"
-        "fmla vV23.4s, vU45.4s, vW33.4s\n"
-        "prfm pldl1keep, [   uptr4 ,    u_col_stride3 ]\n"
-        "fmla vV24.4s, vU45.4s, vW32.4s\n"
-        "prfm pldl1keep, [   uptr4 ,    u_col_stride4 ]\n"
-        "fmla vV33.4s, vU45.4s, vW23.4s\n"
-        "prfm pldl1keep, [   uptr4 ,    u_col_stride5 ]\n"
-        "fmla vV34.4s, vU45.4s, vW22.4s\n"
-        "prfm pldl1keep, [   uptr5 ]\n"
-        "fmul vV43.4s, vU45.4s, vW13.4s\n"
-        "prfm pldl1keep, [   uptr5 , %x[u_col_stride1]]\n"
-        "fmla vV44.4s, vU45.4s, vW12.4s\n"
-        "ldr qU13, [%x[uptr0], u_col_stride2]\n"
-        "fmla vV22.4s, vU44.4s, vW33.4s\n"
-        "prfm pldl1keep, [   uptr5 ,    u_col_stride2 ]\n"
-        "fmla vV23.4s, vU44.4s, vW32.4s\n"
-        "prfm pldl1keep, [   uptr5 ,    u_col_stride3 ]\n"
-        "fmla vV24.4s, vU44.4s, vW31.4s\n"
-        "str qV24, [vptr1, v_col_stride3]\n"
-        "fmla vV32.4s, vU44.4s, vW23.4s\n"
-        "prfm pldl1keep, [   uptr5 ,    u_col_stride4 ]\n"
-        "fmla vV33.4s, vU44.4s, vW22.4s\n"
-        "prfm pldl1keep, [   uptr5 ,    u_col_stride5 ]\n"
-        "fmla vV34.4s, vU44.4s, vW21.4s\n"
-        "prfm pstl1keep, [%x[vptr0]]\n"
-        "fmul vV42.4s, vU44.4s, vW13.4s\n"
-        "prfm pstl1keep, [%x[vptr0], %x[v_col_stride1]]\n"
-        "fmla vV43.4s, vU44.4s, vW12.4s\n"
-        "prfm pstl1keep, [%x[vptr0],    v_col_stride2 ]\n"
-        "fmla vV44.4s, vU44.4s, vW11.4s\n"
-        "ldr qU23, [uptr1, u_col_stride2]\n"
-        "fmla vV34.4s, vU56.4s, vW33.4s\n"
-        "prfm pstl1keep, [%x[vptr0],    v_col_stride3 ]\n"
-        "fmla vV44.4s, vU56.4s, vW23.4s\n"
-        "ldr qU33, [uptr2, u_col_stride2]\n"
-        "fmla vV33.4s, vU55.4s, vW33.4s\n"
-        "prfm pstl1keep, [   vptr1 ]\n"
-        "fmla vV34.4s, vU55.4s, vW32.4s\n"
-        "prfm pstl1keep, [   vptr1 , %x[v_col_stride1]]\n"
-        "fmla vV43.4s, vU55.4s, vW23.4s\n"
-        "prfm pstl1keep, [   vptr1 ,    v_col_stride2 ]\n"
-        "fmla vV44.4s, vU55.4s, vW22.4s\n"
-        "ldr qU43, [uptr3, u_col_stride2]\n"
-        "fmla vV32.4s, vU54.4s, vW33.4s\n"
-        "prfm pstl1keep, [   vptr1 ,    v_col_stride3 ]\n"
-        "fmla vV33.4s, vU54.4s, vW32.4s\n"
-        "prfm pstl1keep, [   vptr2 ]\n"
-        "fmla vV34.4s, vU54.4s, vW31.4s\n"
-        "str qV34, [vptr2, v_col_stride3]\n"
-        "fmla vV42.4s, vU54.4s, vW23.4s\n"
-        "prfm pstl1keep, [   vptr2 , %x[v_col_stride1]]\n"
-        "fmla vV43.4s, vU54.4s, vW22.4s\n"
-        "prfm pstl1keep, [   vptr2 ,    v_col_stride2 ]\n"
-        "fmla vV44.4s, vU54.4s, vW21.4s\n"
-        "ldr qU53, [uptr4, u_col_stride2]\n"
-        "fmla vV44.4s, vU66.4s, vW33.4s\n"
-        "ldr qU63, [uptr5, u_col_stride2]\n"
-        "fmla vV43.4s, vU65.4s, vW33.4s\n"
-        "prfm pstl1keep, [   vptr2 ,    v_col_stride3 ]\n"
-        "fmla vV44.4s, vU65.4s, vW32.4s\n"
-        "ldr qU12, [%x[uptr0], %x[u_col_stride1]]\n"
-        "fmla vV42.4s, vU64.4s, vW33.4s\n"
-        "prfm pstl1keep, [   vptr3 ]\n"
-        "fmla vV43.4s, vU64.4s, vW32.4s\n"
-        "prfm pstl1keep, [   vptr3 , %x[v_col_stride1]]\n"
-        "fmla vV44.4s, vU64.4s, vW31.4s\n"
-        "str qV44, [vptr3, v_col_stride3]\n"
-        "fmul vV11.4s, vU13.4s, vW13.4s\n"
-        "ldr qU22, [uptr1, %x[u_col_stride1]]\n"
-        "fmla vV12.4s, vU13.4s, vW12.4s\n"
-        "prfm pstl1keep, [   vptr3 ,    v_col_stride2 ]\n"
-        "fmla vV13.4s, vU13.4s, vW11.4s\n"
-        "ldr qU32, [uptr2, %x[u_col_stride1]]\n"
-        "fmla vV11.4s, vU23.4s, vW23.4s\n"
-        "prfm pstl1keep, [   vptr3 ,    v_col_stride3 ]\n"
-        "fmla vV12.4s, vU23.4s, vW22.4s\n"
-        "fmla vV13.4s, vU23.4s, vW21.4s\n"
-        "fmul vV21.4s, vU23.4s, vW13.4s\n"
-        "fmla vV22.4s, vU23.4s, vW12.4s\n"
-        "fmla vV23.4s, vU23.4s, vW11.4s\n"
-        "ldr qU42, [uptr3, %x[u_col_stride1]]\n"
-        "fmla vV11.4s, vU33.4s, vW33.4s\n"
-        "fmla vV12.4s, vU33.4s, vW32.4s\n"
-        "fmla vV13.4s, vU33.4s, vW31.4s\n"
-        "str qV13, [%x[vptr0], v_col_stride2]\n"
-        "fmla vV21.4s, vU33.4s, vW23.4s\n"
-        "fmla vV22.4s, vU33.4s, vW22.4s\n"
-        "fmla vV23.4s, vU33.4s, vW21.4s\n"
-        "fmul vV31.4s, vU33.4s, vW13.4s\n"
-        "fmla vV32.4s, vU33.4s, vW12.4s\n"
-        "fmla vV33.4s, vU33.4s, vW11.4s\n"
-        "ldr qU52, [uptr4, %x[u_col_stride1]]\n"
-        "fmla vV21.4s, vU43.4s, vW33.4s\n"
-        "fmla vV22.4s, vU43.4s, vW32.4s\n"
-        "fmla vV23.4s, vU43.4s, vW31.4s\n"
-        "str qV23, [vptr1, v_col_stride2]\n"
-        "fmla vV31.4s, vU43.4s, vW23.4s\n"
-        "fmla vV32.4s, vU43.4s, vW22.4s\n"
-        "fmla vV33.4s, vU43.4s, vW21.4s\n"
-        "fmul vV41.4s, vU43.4s, vW13.4s\n"
-        "ldr qW13, [%x[wptr0], w_col_stride2]\n"
-        "fmla vV42.4s, vU43.4s, vW12.4s\n"
-        "fmla vV43.4s, vU43.4s, vW11.4s\n"
-        "ldr qU62, [uptr5, %x[u_col_stride1]]\n"
-        "fmla vV31.4s, vU53.4s, vW33.4s\n"
-        "fmla vV32.4s, vU53.4s, vW32.4s\n"
-        "fmla vV33.4s, vU53.4s, vW31.4s\n"
-        "str qV33, [vptr2, v_col_stride2]\n"
-        "fmla vV41.4s, vU53.4s, vW23.4s\n"
-        "ldr qW23, [wptr1, w_col_stride2]\n"
-        "fmla vV42.4s, vU53.4s, vW22.4s\n"
-        "fmla vV43.4s, vU53.4s, vW21.4s\n"
-        "ldr qU11, [%x[uptr0]], #0x10\n"
-        "fmla vV41.4s, vU63.4s, vW33.4s\n"
-        "ldr qW33, [wptr2, w_col_stride2]\n"
-        "fmla vV42.4s, vU63.4s, vW32.4s\n"
-        "prfm pldl1keep, [%x[uptr0]]\n"
-        "fmla vV43.4s, vU63.4s, vW31.4s\n"
-        "str qV43, [vptr3, v_col_stride2]\n"
-        "fmla vV11.4s, vU12.4s, vW12.4s\n"
-        "ldr qU21, [uptr1], #0x10\n"
-        "fmla vV12.4s, vU12.4s, vW11.4s\n"
-        "ldr qU31, [uptr2], #0x10\n"
-        "fmla vV11.4s, vU22.4s, vW22.4s\n"
-        "prfm pldl1keep, [%x[uptr0], %x[u_col_stride1]]\n"
-        "fmla vV12.4s, vU22.4s, vW21.4s\n"
-        "prfm pldl1keep, [%x[uptr0],    u_col_stride2 ]\n"
-        "fmla vV21.4s, vU22.4s, vW12.4s\n"
-        "prfm pldl1keep, [%x[uptr0],    u_col_stride3 ]\n"
-        "fmla vV22.4s, vU22.4s, vW11.4s\n"
-        "ldr qU41, [uptr3], #0x10\n"
-        "fmla vV11.4s, vU32.4s, vW32.4s\n"
-        "prfm pldl1keep, [%x[uptr0],    u_col_stride4 ]\n"
-        "fmla vV12.4s, vU32.4s, vW31.4s\n"
-        "str qV12, [%x[vptr0], %x[v_col_stride1]]\n"
-        "fmla vV21.4s, vU32.4s, vW22.4s\n"
-        "prfm pldl1keep, [%x[uptr0],    u_col_stride5 ]\n"
-        "fmla vV22.4s, vU32.4s, vW21.4s\n"
-        "prfm pldl1keep, [   uptr1 ]\n"
-        "fmla vV31.4s, vU32.4s, vW12.4s\n"
-        "prfm pldl1keep, [   uptr1 , %x[u_col_stride1]]\n"
-        "fmla vV32.4s, vU32.4s, vW11.4s\n"
-        "ldr qU51, [uptr4], #0x10\n"
-        "fmla vV21.4s, vU42.4s, vW32.4s\n"
-        "prfm pldl1keep, [   uptr1 ,    u_col_stride2 ]\n"
-        "fmla vV22.4s, vU42.4s, vW31.4s\n"
-        "str qV22, [vptr1, %x[v_col_stride1]]\n"
-        "fmla vV31.4s, vU42.4s, vW22.4s\n"
-        "prfm pldl1keep, [   uptr1 ,    u_col_stride3 ]\n"
-        "fmla vV32.4s, vU42.4s, vW21.4s\n"
-        "subs %x[c4_rem], %x[c4_rem], #1\n"
-        "fmla vV41.4s, vU42.4s, vW12.4s\n"
-        "ldr qW12, [%x[wptr0], %x[w_col_stride1]]\n"
-        "fmla vV42.4s, vU42.4s, vW11.4s\n"
-        "ldr qU61, [uptr5], #0x10\n"
-        "fmla vV31.4s, vU52.4s, vW32.4s\n"
-        "prfm pldl1keep, [   uptr1 ,    u_col_stride4 ]\n"
-        "fmla vV32.4s, vU52.4s, vW31.4s\n"
-        "str qV32, [vptr2, %x[v_col_stride1]]\n"
-        "fmla vV41.4s, vU52.4s, vW22.4s\n"
-        "ldr qW22, [wptr1, %x[w_col_stride1]]\n"
-        "fmla vV42.4s, vU52.4s, vW21.4s\n"
-        "ldr qU16, [%x[uptr0], u_col_stride5]\n"
-        "fmla vV41.4s, vU62.4s, vW32.4s\n"
-        "ldr qW32, [wptr2, %x[w_col_stride1]]\n"
-        "fmla vV42.4s, vU62.4s, vW31.4s\n"
-        "str qV42, [vptr3, %x[v_col_stride1]]\n"
-        "fmla vV11.4s, vU11.4s, vW11.4s\n"
-        "ldr qU15, [%x[uptr0], u_col_stride4]\n"
-        "fmla vV11.4s, vU21.4s, vW21.4s\n"
-        "ldr qU14, [%x[uptr0], u_col_stride3]\n"
-        "fmla vV21.4s, vU21.4s, vW11.4s\n"
-        "ldr qU26, [uptr1, u_col_stride5]\n"
-        "fmla vV11.4s, vU31.4s, vW31.4s\n"
-        "str qV11, [%x[vptr0]], #0x10\n"
-        "fmla vV21.4s, vU31.4s, vW21.4s\n"
-        "prfm pldl1keep, [   uptr1 ,    u_col_stride5 ]\n"
-        "fmla vV31.4s, vU31.4s, vW11.4s\n"
-        "ldr qU25, [uptr1, u_col_stride4]\n"
-        "fmla vV21.4s, vU41.4s, vW31.4s\n"
-        "str qV21, [vptr1], #0x10\n"
-        "fmla vV31.4s, vU41.4s, vW21.4s\n"
-        "prfm pldl1keep, [   uptr2 ]\n"
-        "fmla vV41.4s, vU41.4s, vW11.4s\n"
-        "ldr qW11, [%x[wptr0]], #0x10\n"
-        "fmla vV31.4s, vU51.4s, vW31.4s\n"
-        "str qV31, [vptr2], #0x10\n"
-        "fmla vV41.4s, vU51.4s, vW21.4s\n"
-        "ldr qU36, [uptr2, u_col_stride5]\n"
-        "fmla vV41.4s, vU61.4s, vW31.4s\n"
-        "str qV41, [vptr3], #0x10\n"
-        "fmul vV14.4s, vU16.4s, vW13.4s\n"
-        "ldr qU24, [uptr1, u_col_stride3]\n"
-        "fmul vV13.4s, vU15.4s, vW13.4s\n"
-        "ldr qW31, [wptr2], #0x10\n"
-        "fmla vV14.4s, vU15.4s, vW12.4s\n"
-        "ldr qW21, [wptr1], #0x10\n"
-        "fmul vV12.4s, vU14.4s, vW13.4s\n"
-        "ldr qU34, [uptr2, u_col_stride3]\n"
-        "fmla vV13.4s, vU14.4s, vW12.4s\n"
-        "ldr qU46, [uptr3, u_col_stride5]\n"
-        "fmla vV14.4s, vU14.4s, vW11.4s\n"
-        "ldr qU45, [uptr3, u_col_stride4]\n"
-        "fmla vV14.4s, vU26.4s, vW23.4s\n"
-        "ldr qU35, [uptr2, u_col_stride4]\n"
-        "fmul vV24.4s, vU26.4s, vW13.4s\n"
-        "ldr qU44, [uptr3, u_col_stride3]\n"
-        "fmla vV13.4s, vU25.4s, vW23.4s\n"
-        "bne 1b\n"
-
-      "2:"  // Final iteration
-        "fmla vV14.4s, vU25.4s, vW22.4s\n"
-        "fmul vV23.4s, vU25.4s, vW13.4s\n"
-        "fmla vV24.4s, vU25.4s, vW12.4s\n"
-        "ldr qU56, [uptr4, u_col_stride5]\n"
-        "fmla vV12.4s, vU24.4s, vW23.4s\n"
-        "fmla vV13.4s, vU24.4s, vW22.4s\n"
-        "fmla vV14.4s, vU24.4s, vW21.4s\n"
-        "fmul vV22.4s, vU24.4s, vW13.4s\n"
-        "fmla vV23.4s, vU24.4s, vW12.4s\n"
-        "fmla vV24.4s, vU24.4s, vW11.4s\n"
-        "ldr qU55, [uptr4, u_col_stride4]\n"
-        "fmla vV14.4s, vU36.4s, vW33.4s\n"
-        "fmla vV24.4s, vU36.4s, vW23.4s\n"
-        "fmul vV34.4s, vU36.4s, vW13.4s\n"
-        "ldr qU54, [uptr4, u_col_stride3]\n"
-        "fmla vV13.4s, vU35.4s, vW33.4s\n"
-        "fmla vV14.4s, vU35.4s, vW32.4s\n"
-        "fmla vV23.4s, vU35.4s, vW23.4s\n"
-        "fmla vV24.4s, vU35.4s, vW22.4s\n"
-        "fmul vV33.4s, vU35.4s, vW13.4s\n"
-        "fmla vV34.4s, vU35.4s, vW12.4s\n"
-        "ldr qU66, [uptr5, u_col_stride5]\n"
-        "fmla vV12.4s, vU34.4s, vW33.4s\n"
-        "fmla vV13.4s, vU34.4s, vW32.4s\n"
-        "fmla vV14.4s, vU34.4s, vW31.4s\n"
-        "str qV14, [%x[vptr0], v_col_stride3]\n"
-        "fmla vV22.4s, vU34.4s, vW23.4s\n"
-        "fmla vV23.4s, vU34.4s, vW22.4s\n"
-        "fmla vV24.4s, vU34.4s, vW21.4s\n"
-        "fmul vV32.4s, vU34.4s, vW13.4s\n"
-        "fmla vV33.4s, vU34.4s, vW12.4s\n"
-        "fmla vV34.4s, vU34.4s, vW11.4s\n"
-        "ldr qU65, [uptr5, u_col_stride4]\n"
-        "fmla vV24.4s, vU46.4s, vW33.4s\n"
-        "fmla vV34.4s, vU46.4s, vW23.4s\n"
-        "fmul vV44.4s, vU46.4s, vW13.4s\n"
-        "ldr qU64, [uptr5, u_col_stride3]\n"
-        "fmla vV23.4s, vU45.4s, vW33.4s\n"
-        "fmla vV24.4s, vU45.4s, vW32.4s\n"
-        "fmla vV33.4s, vU45.4s, vW23.4s\n"
-        "fmla vV34.4s, vU45.4s, vW22.4s\n"
-        "fmul vV43.4s, vU45.4s, vW13.4s\n"
-        "fmla vV44.4s, vU45.4s, vW12.4s\n"
-        "ldr qU13, [%x[uptr0], u_col_stride2]\n"
-        "fmla vV22.4s, vU44.4s, vW33.4s\n"
-        "fmla vV23.4s, vU44.4s, vW32.4s\n"
-        "fmla vV24.4s, vU44.4s, vW31.4s\n"
-        "str qV24, [vptr1, v_col_stride3]\n"
-        "fmla vV32.4s, vU44.4s, vW23.4s\n"
-        "fmla vV33.4s, vU44.4s, vW22.4s\n"
-        "fmla vV34.4s, vU44.4s, vW21.4s\n"
-        "fmul vV42.4s, vU44.4s, vW13.4s\n"
-        "fmla vV43.4s, vU44.4s, vW12.4s\n"
-        "fmla vV44.4s, vU44.4s, vW11.4s\n"
-        "ldr qU23, [uptr1, u_col_stride2]\n"
-        "fmla vV34.4s, vU56.4s, vW33.4s\n"
-        "fmla vV44.4s, vU56.4s, vW23.4s\n"
-        "ldr qU33, [uptr2, u_col_stride2]\n"
-        "fmla vV33.4s, vU55.4s, vW33.4s\n"
-        "fmla vV34.4s, vU55.4s, vW32.4s\n"
-        "fmla vV43.4s, vU55.4s, vW23.4s\n"
-        "fmla vV44.4s, vU55.4s, vW22.4s\n"
-        "ldr qU43, [uptr3, u_col_stride2]\n"
-        "fmla vV32.4s, vU54.4s, vW33.4s\n"
-        "fmla vV33.4s, vU54.4s, vW32.4s\n"
-        "fmla vV34.4s, vU54.4s, vW31.4s\n"
-        "str qV34, [vptr2, v_col_stride3]\n"
-        "fmla vV42.4s, vU54.4s, vW23.4s\n"
-        "fmla vV43.4s, vU54.4s, vW22.4s\n"
-        "fmla vV44.4s, vU54.4s, vW21.4s\n"
-        "ldr qU53, [uptr4, u_col_stride2]\n"
-        "fmla vV44.4s, vU66.4s, vW33.4s\n"
-        "ldr qU63, [uptr5, u_col_stride2]\n"
-        "fmla vV43.4s, vU65.4s, vW33.4s\n"
-        "fmla vV44.4s, vU65.4s, vW32.4s\n"
-        "ldr qU12, [%x[uptr0], %x[u_col_stride1]]\n"
-        "fmla vV42.4s, vU64.4s, vW33.4s\n"
-        "fmla vV43.4s, vU64.4s, vW32.4s\n"
-        "fmla vV44.4s, vU64.4s, vW31.4s\n"
-        "str qV44, [vptr3, v_col_stride3]\n"
-        "fmul vV11.4s, vU13.4s, vW13.4s\n"
-        "ldr qU22, [uptr1, %x[u_col_stride1]]\n"
-        "fmla vV12.4s, vU13.4s, vW12.4s\n"
-        "fmla vV13.4s, vU13.4s, vW11.4s\n"
-        "ldr qU32, [uptr2, %x[u_col_stride1]]\n"
-        "fmla vV11.4s, vU23.4s, vW23.4s\n"
-        "fmla vV12.4s, vU23.4s, vW22.4s\n"
-        "fmla vV13.4s, vU23.4s, vW21.4s\n"
-        "fmul vV21.4s, vU23.4s, vW13.4s\n"
-        "fmla vV22.4s, vU23.4s, vW12.4s\n"
-        "fmla vV23.4s, vU23.4s, vW11.4s\n"
-        "ldr qU42, [uptr3, %x[u_col_stride1]]\n"
-        "fmla vV11.4s, vU33.4s, vW33.4s\n"
-        "fmla vV12.4s, vU33.4s, vW32.4s\n"
-        "fmla vV13.4s, vU33.4s, vW31.4s\n"
-        "str qV13, [%x[vptr0], v_col_stride2]\n"
-        "fmla vV21.4s, vU33.4s, vW23.4s\n"
-        "fmla vV22.4s, vU33.4s, vW22.4s\n"
-        "fmla vV23.4s, vU33.4s, vW21.4s\n"
-        "fmul vV31.4s, vU33.4s, vW13.4s\n"
-        "fmla vV32.4s, vU33.4s, vW12.4s\n"
-        "fmla vV33.4s, vU33.4s, vW11.4s\n"
-        "ldr qU52, [uptr4, %x[u_col_stride1]]\n"
-        "fmla vV21.4s, vU43.4s, vW33.4s\n"
-        "fmla vV22.4s, vU43.4s, vW32.4s\n"
-        "fmla vV23.4s, vU43.4s, vW31.4s\n"
-        "str qV23, [vptr1, v_col_stride2]\n"
-        "fmla vV31.4s, vU43.4s, vW23.4s\n"
-        "fmla vV32.4s, vU43.4s, vW22.4s\n"
-        "fmla vV33.4s, vU43.4s, vW21.4s\n"
-        "fmul vV41.4s, vU43.4s, vW13.4s\n"
-        "fmla vV42.4s, vU43.4s, vW12.4s\n"
-        "fmla vV43.4s, vU43.4s, vW11.4s\n"
-        "ldr qU62, [uptr5, %x[u_col_stride1]]\n"
-        "fmla vV31.4s, vU53.4s, vW33.4s\n"
-        "fmla vV32.4s, vU53.4s, vW32.4s\n"
-        "fmla vV33.4s, vU53.4s, vW31.4s\n"
-        "str qV33, [vptr2, v_col_stride2]\n"
-        "fmla vV41.4s, vU53.4s, vW23.4s\n"
-        "fmla vV42.4s, vU53.4s, vW22.4s\n"
-        "fmla vV43.4s, vU53.4s, vW21.4s\n"
-        "ldr qU11, [%x[uptr0]], #0x10\n"
-        "fmla vV41.4s, vU63.4s, vW33.4s\n"
-        "fmla vV42.4s, vU63.4s, vW32.4s\n"
-        "fmla vV43.4s, vU63.4s, vW31.4s\n"
-        "str qV43, [vptr3, v_col_stride2]\n"
-        "fmla vV11.4s, vU12.4s, vW12.4s\n"
-        "ldr qU21, [uptr1], #0x10\n"
-        "fmla vV12.4s, vU12.4s, vW11.4s\n"
-        "ldr qU31, [uptr2], #0x10\n"
-        "fmla vV11.4s, vU22.4s, vW22.4s\n"
-        "fmla vV12.4s, vU22.4s, vW21.4s\n"
-        "fmla vV21.4s, vU22.4s, vW12.4s\n"
-        "fmla vV22.4s, vU22.4s, vW11.4s\n"
-        "ldr qU41, [uptr3], #0x10\n"
-        "fmla vV11.4s, vU32.4s, vW32.4s\n"
-        "fmla vV12.4s, vU32.4s, vW31.4s\n"
-        "str qV12, [%x[vptr0], %x[v_col_stride1]]\n"
-        "fmla vV21.4s, vU32.4s, vW22.4s\n"
-        "fmla vV22.4s, vU32.4s, vW21.4s\n"
-        "fmla vV31.4s, vU32.4s, vW12.4s\n"
-        "fmla vV32.4s, vU32.4s, vW11.4s\n"
-        "ldr qU51, [uptr4], #0x10\n"
-        "fmla vV21.4s, vU42.4s, vW32.4s\n"
-        "fmla vV22.4s, vU42.4s, vW31.4s\n"
-        "str qV22, [vptr1, %x[v_col_stride1]]\n"
-        "fmla vV31.4s, vU42.4s, vW22.4s\n"
-        "fmla vV32.4s, vU42.4s, vW21.4s\n"
-        "subs %x[c4_rem], %x[c4_rem], #1\n"
-        "fmla vV41.4s, vU42.4s, vW12.4s\n"
-        "fmla vV42.4s, vU42.4s, vW11.4s\n"
-        "ldr qU61, [uptr5], #0x10\n"
-        "fmla vV31.4s, vU52.4s, vW32.4s\n"
-        "fmla vV32.4s, vU52.4s, vW31.4s\n"
-        "str qV32, [vptr2, %x[v_col_stride1]]\n"
-        "fmla vV41.4s, vU52.4s, vW22.4s\n"
-        "fmla vV42.4s, vU52.4s, vW21.4s\n"
-        "fmla vV41.4s, vU62.4s, vW32.4s\n"
-        "fmla vV42.4s, vU62.4s, vW31.4s\n"
-        "str qV42, [vptr3, %x[v_col_stride1]]\n"
-        "fmla vV11.4s, vU11.4s, vW11.4s\n"
-        "fmla vV11.4s, vU21.4s, vW21.4s\n"
-        "fmla vV21.4s, vU21.4s, vW11.4s\n"
-        "fmla vV11.4s, vU31.4s, vW31.4s\n"
-        "str qV11, [%x[vptr0]], #0x10\n"
-        "fmla vV21.4s, vU31.4s, vW21.4s\n"
-        "fmla vV31.4s, vU31.4s, vW11.4s\n"
-        "fmla vV21.4s, vU41.4s, vW31.4s\n"
-        "str qV21, [vptr1], #0x10\n"
-        "fmla vV31.4s, vU41.4s, vW21.4s\n"
-        "fmla vV41.4s, vU41.4s, vW11.4s\n"
-        "fmla vV31.4s, vU51.4s, vW31.4s\n"
-        "str qV31, [vptr2], #0x10\n"
-        "fmla vV41.4s, vU51.4s, vW21.4s\n"
-        "fmla vV41.4s, vU61.4s, vW31.4s\n"
-        "str qV41, [vptr3], #0x10\n"
-
-      ".unreq qW22\n" ".unreq qU64\n" ".unreq qU35\n" ".unreq qV41\n"
-      ".unreq qU34\n" ".unreq qU21\n" ".unreq qV43\n" ".unreq qW21\n"
-      ".unreq qU24\n" ".unreq qU54\n" ".unreq qV31\n" ".unreq qV12\n"
-      ".unreq qU61\n" ".unreq qU26\n" ".unreq qV32\n"
-      ".unreq qU36\n" ".unreq qU51\n" ".unreq qU66\n" ".unreq qU12\n"
-      ".unreq qV14\n" ".unreq qV11\n" ".unreq qU65\n"
-      ".unreq qU15\n" ".unreq qU22\n" ".unreq qU45\n"
-      ".unreq qV22\n" ".unreq qU14\n"
-      ".unreq qU44\n" ".unreq qU43\n" ".unreq qU11\n"
-      ".unreq qV24\n" ".unreq qV42\n" ".unreq qW31\n" ".unreq qW13\n"
-      ".unreq qU33\n" ".unreq qU62\n" ".unreq qU25\n" ".unreq qU56\n"
-      ".unreq qW33\n"
-      ".unreq qU42\n" ".unreq qU16\n" ".unreq qV44\n"
-      ".unreq qU63\n" ".unreq qU31\n" ".unreq qV34\n"
-      ".unreq qW11\n" ".unreq qU41\n" ".unreq qV13\n" ".unreq qV33\n"
-      ".unreq qU46\n" ".unreq qU32\n" ".unreq qU13\n"
-      ".unreq qW23\n" ".unreq qV23\n" ".unreq qV21\n" ".unreq qU55\n"
-      ".unreq qW12\n" ".unreq qW32\n" ".unreq qU23\n" ".unreq qU52\n"
-      ".unreq qU53\n" ".unreq vW22\n"
-      ".unreq vU64\n" ".unreq vU35\n" ".unreq vV41\n"
-      ".unreq vU34\n" ".unreq vU21\n" ".unreq vV43\n" ".unreq vW21\n"
-      ".unreq vU24\n" ".unreq vU54\n" ".unreq vV31\n"
-      ".unreq vV12\n" ".unreq vU61\n"
-      ".unreq vU26\n" ".unreq vV32\n"
-      ".unreq vU36\n" ".unreq vU51\n" ".unreq vU66\n" ".unreq vU12\n"
-      ".unreq vV14\n" ".unreq vV11\n" ".unreq vU65\n"
-      ".unreq vU15\n" ".unreq vU22\n" ".unreq vU45\n"
-      ".unreq vV22\n" ".unreq vU14\n"
-      ".unreq vU44\n" ".unreq vU43\n" ".unreq vU11\n"
-      ".unreq vV24\n" ".unreq vV42\n" ".unreq vW31\n" ".unreq vW13\n"
-      ".unreq vU33\n" ".unreq vU62\n" ".unreq vU25\n" ".unreq vU56\n"
-      ".unreq vW33\n" ".unreq vU42\n" ".unreq vU16\n" ".unreq vV44\n"
-      ".unreq vU63\n" ".unreq vU31\n" ".unreq vV34\n" ".unreq vW11\n"
-      ".unreq vU41\n" ".unreq vV13\n" ".unreq vV33\n"
-      ".unreq vU46\n" ".unreq vU32\n" ".unreq vU13\n" ".unreq vW23\n"
-      ".unreq vV23\n" ".unreq vV21\n" ".unreq vU55\n" ".unreq vW12\n"
-      ".unreq vW32\n" ".unreq vU23\n" ".unreq vU52\n" ".unreq vU53\n"
-      : [uptr0] "+r" (uptr0), [vptr0] "+r" (vptr0), [wptr0] "+r" (wptr0),
-        [c4_rem] "+r" (c4_rem)
-      : [u_row_stride] "r" (in_row_stride * sizeof(float)),
-        [u_col_stride1] "r" (in_col_stride * sizeof(float)),
-        [v_row_stride] "r" (out_row_stride * sizeof(float)),
-        [v_col_stride1] "r" (out_col_stride * sizeof(float)),
-        [w_row_stride] "r" (weight_row_stride * sizeof(float)),
-        [w_col_stride1] "r" (weight_col_stride * sizeof(float))
-      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
-        "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
-        "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x0",
-        "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11",
-        "x12", "x13", "x14", "x15", "x16", "cc", "memory"
-    );
-  }
-  for (; channels_remaining; channels_remaining--)
-  {
-    // Load input tile
-    float u[inner_tile_rows][inner_tile_cols];
-    for (int i = 0; i < inner_tile_rows; i++)
-    {
-      const float* const inptr_row = uptr0 + (i - in_pad_top)*in_row_stride;
-      for (int j = 0; j < inner_tile_cols; j++)
-      {
-        if (i < in_pad_top || in_cells_i <= i ||
-            j < in_pad_left || in_cells_j <= j)
-        {
-          u[i][j] = static_cast<float>(0);
-        }
-        else
-        {
-          u[i][j] = *(inptr_row + (j - in_pad_left)*in_col_stride);
-        }
-      }
-    }
-    uptr0++;
-
-    // Load weights tile
-    float w[kernel_rows][kernel_cols];
-    for (int i = 0; i < kernel_rows; i++)
-    {
-      const float* const wptr_row = wptr0 + i*weight_row_stride;
-      for (int j = 0; j < kernel_cols; j++)
-      {
-        w[i][j] = *(wptr_row + j*weight_col_stride);
-      }
-    }
-    wptr0++;
-
-    // Perform the convolution
-    float v[output_tile_rows][output_tile_cols];
-    for (int out_i = 0; out_i < out_cells_i; out_i++)
-    {
-      for (int out_j = 0; out_j < out_cells_j; out_j++)
-      {
-        // Clear the accumulator
-        v[out_i][out_j] = static_cast<float>(0);
-
-        // Base co-ordinate
-        const int base_i = out_i * stride_rows;
-        const int base_j = out_j * stride_cols;
-
-        // Fill the accumulator
-        for (int in_i = 0; in_i < kernel_rows; in_i++)
-        {
-          const int i = base_i + in_i;
-          for (int in_j = 0; in_j < kernel_cols; in_j++)
-          {
-            const int j = base_j + in_j;
-            v[out_i][out_j] += w[in_i][in_j] * u[i][j];
-          }
-        }
-      }
-    }
-
-    // Store the output tile
-    for (int i = 0; i < out_cells_i; i++)
-    {
-      float* const outptr_row = vptr0 + i*out_row_stride;
-      for (int j = 0; j < out_cells_j; j++)
-      {
-        *(outptr_row + j*out_col_stride) = v[i][j];
-      }
-    }
-    vptr0++;
-  }
+template <>
+template <>
+void Conv::execute_tile<ActivationFunction::ReLU6>(
+  int n_channels,
+  const void *weight_bias_ptr,
+  const float *input,
+  const unsigned int input_row_stride,
+  const unsigned int input_col_stride,
+  float *output,
+  const unsigned int output_row_stride,
+  const unsigned int output_col_stride
+)
+{
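+  // Overall structure: channels are processed four at a time in NEON
+  // q-registers, with a scalar tail for the n_channels % 4 leftovers. Each
+  // vector pass accumulates a 4x4 output tile of a 3x3 convolution (nine
+  // weight vectors plus a bias) and clamps the results to [0, 6] (the fused
+  // ReLU6 activation) before storing them.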
+  __asm __volatile(
+    "add x24, %[inptr0], %[input_row_stride]\n"
+    "add x13, %[input_col_stride1], %[input_col_stride1]\n"
+    "add x8, %[outptr0], %[output_row_stride]\n"
+    "add x9, x24, %[input_row_stride]\n"
+    "add x10, x13, #64\n"
+    "add x19, x13, %[input_col_stride1]\n"
+    "add x20, x9, %[input_row_stride]\n"
+    "add x21, x19, #64\n"
+    "add x17, x19, %[input_col_stride1]\n"
+    "add x22, x20, %[input_row_stride]\n"
+    "add x18, x17, #64\n"
+    "add x11, x17, %[input_col_stride1]\n"
+    "add x23, x22, %[input_row_stride]\n"
+    "add x12, x11, #64\n"
+    "add x25, x8, %[output_row_stride]\n"
+    "add x26, x25, %[output_row_stride]\n"
+    "add x27, %[output_col_stride1], %[output_col_stride1]\n"
+    "and x14, %[n_channels], #3\n"
+    "add x28, x27, %[output_col_stride1]\n"
+    "lsr x15, %[n_channels], #2\n"
+    "cbz x15, 4f\n"
+    "1:\n"
+    "ldr q23, [%[wbptr]]\n"
+    "subs x15, x15, #1\n"
+    "mov v12.16b, v23.16b\n"
+    "ldr q20, [%[wbptr], #16]\n"
+    "mov v8.16b, v23.16b\n"
+    "ldr q6, [%[wbptr], #32]\n"
+    "mov v11.16b, v23.16b\n"
+    "ldr q5, [%[wbptr], #48]\n"
+    "mov v16.16b, v23.16b\n"
+    "ldr q19, [%[wbptr], #64]\n"
+    "mov v7.16b, v23.16b\n"
+    "ldr q4, [%[wbptr], #80]\n"
+    "mov v10.16b, v23.16b\n"
+    "ldr q3, [%[wbptr], #96]\n"
+    "mov v14.16b, v23.16b\n"
+    "ldr q2, [%[wbptr], #112]\n"
+    "mov v15.16b, v23.16b\n"
+    "ldr q1, [%[wbptr], #128]\n"
+    "mov v17.16b, v23.16b\n"
+    "ldr q0, [%[wbptr], #144]\n"
+    "mov v9.16b, v23.16b\n"
+    "ldr q28, [%[inptr0]]\n"
+    "fmla v12.4s, v28.4s, v20.4s\n"
+    "ldr q25, [x24]\n"
+    "fmla v8.4s, v25.4s, v20.4s\n"
+    "ldr q18, [%[inptr0], %[input_col_stride1]]\n"
+    "fmla v11.4s, v18.4s, v20.4s\n"
+    "ldr q30, [x9]\n"
+    "fmla v12.4s, v25.4s, v19.4s\n"
+    "ldr q29, [x24, %[input_col_stride1]]\n"
+    "fmla v8.4s, v30.4s, v19.4s\n"
+    "ldr q24, [%[inptr0], x13]\n"
+    "fmla v16.4s, v30.4s, v20.4s\n"
+    "ldr q27, [x20]\n"
+    "fmla v12.4s, v18.4s, v6.4s\n"
+    "ldr q22, [x9, %[input_col_stride1]]\n"
+    "fmla v8.4s, v29.4s, v6.4s\n"
+    "prfm pldl1keep, [%[inptr0], #64]\n"
+    "prfm pldl1keep, [x24, #64]\n"
+    "prfm pldl1keep, [%[inptr0], x16]\n"
+    "fmla v12.4s, v30.4s, v2.4s\n"
+    "prfm pldl1keep, [x9, #64]\n"
+    "prfm pldl1keep, [x24, x16]\n"
+    "prfm pldl1keep, [%[inptr0], x10]\n"
+    "prfm pldl1keep, [x20, #64]\n"
+    "prfm pldl1keep, [x9, x16]\n"
+    "fmla v12.4s, v29.4s, v4.4s\n"
+    "beq 3f\n"
+    "2:\n"
+    "mov v13.16b, v23.16b\n"
+    "ldr q21, [x24, x13]\n"
+    "mov v18.16b, v23.16b\n"
+    "prfm pldl1keep, [x24, x10]\n"
+    "fmla v11.4s, v29.4s, v19.4s\n"
+    "prfm pldl1keep, [%[inptr0], x21]\n"
+    "fmla v7.4s, v29.4s, v20.4s\n"
+    "ldr q25, [%[inptr0], x19]\n"
+    "fmla v12.4s, v24.4s, v5.4s\n"
+    "prfm pldl1keep, [x22, #64]\n"
+    "fmla v11.4s, v24.4s, v6.4s\n"
+    "prfm pldl1keep, [x20, x16]\n"
+    "fmla v10.4s, v24.4s, v20.4s\n"
+    "ldr q24, [x22]\n"
+    "fmla v8.4s, v27.4s, v2.4s\n"
+    "prfm pldl1keep, [x9, x10]\n"
+    "fmla v16.4s, v27.4s, v19.4s\n"
+    "prfm pldl1keep, [x24, x21]\n"
+    "fmla v14.4s, v27.4s, v20.4s\n"
+    "ldr q26, [x20, %[input_col_stride1]]\n"
+    "fmla v12.4s, v22.4s, v1.4s\n"
+    "prfm pldl1keep, [%[inptr0], x18]\n"
+    "fmla v8.4s, v22.4s, v4.4s\n"
+    "prfm pldl1keep, [x23, #64]\n"
+    "fmla v11.4s, v22.4s, v2.4s\n"
+    "prfm pldl1keep, [x22, x16]\n"
+    "fmla v16.4s, v22.4s, v6.4s\n"
+    "prfm pldl1keep, [x20, x10]\n"
+    "fmla v7.4s, v22.4s, v19.4s\n"
+    "prfm pldl1keep, [x9, x21]\n"
+    "fmla v15.4s, v22.4s, v20.4s\n"
+    "ldr q30, [x9, x13]\n"
+    "fmla v12.4s, v21.4s, v3.4s\n"
+    "prfm pldl1keep, [x24, x18]\n"
+    "fmla v8.4s, v21.4s, v5.4s\n"
+    "prfm pldl1keep, [%[inptr0], x12]\n"
+    "fmla v11.4s, v21.4s, v4.4s\n"
+    "prfm pldl1keep, [x23, x16]\n"
+    "fmla v7.4s, v21.4s, v6.4s\n"
+    "prfm pldl1keep, [x22, x10]\n"
+    "fmla v10.4s, v21.4s, v19.4s\n"
+    "prfm pldl1keep, [x20, x21]\n"
+    "fmla v17.4s, v21.4s, v20.4s\n"
+    "ldr q22, [x24, x19]\n"
+    "fmla v11.4s, v25.4s, v5.4s\n"
+    "prfm pldl1keep, [x9, x18]\n"
+    "fmla v10.4s, v25.4s, v6.4s\n"
+    "prfm pldl1keep, [x24, x12]\n"
+    "fmla v9.4s, v25.4s, v20.4s\n"
+    "ldr q21, [%[inptr0], x17]\n"
+    "fmla v16.4s, v24.4s, v2.4s\n"
+    "prfm pldl1keep, [x23, x10]\n"
+    "fmla v14.4s, v24.4s, v19.4s\n"
+    "ldr q24, [x23]\n"
+    "fmla v8.4s, v26.4s, v1.4s\n"
+    "prfm pldl1keep, [x22, x21]\n"
+    "fmla v16.4s, v26.4s, v4.4s\n"
+    "prfm pldl1keep, [x20, x18]\n"
+    "fmla v7.4s, v26.4s, v2.4s\n"
+    "prfm pldl1keep, [x9, x12]\n"
+    "fmla v14.4s, v26.4s, v6.4s\n"
+    "prfm pldl1keep, [x23, x21]\n"
+    "fmla v15.4s, v26.4s, v19.4s\n"
+    "prfm pldl1keep, [x22, x18]\n"
+    "fmla v13.4s, v26.4s, v20.4s\n"
+    "ldr q26, [x22, %[input_col_stride1]]\n"
+    "fmla v12.4s, v30.4s, v0.4s\n"
+    "prfm pldl1keep, [x20, x12]\n"
+    "fmla v8.4s, v30.4s, v3.4s\n"
+    "prfm pldl1keep, [x23, x18]\n"
+    "fmla v11.4s, v30.4s, v1.4s\n"
+    "prfm pldl1keep, [x22, x12]\n"
+    "fmla v16.4s, v30.4s, v5.4s\n"
+    "prfm pldl1keep, [x23, x12]\n"
+    "fmla v7.4s, v30.4s, v4.4s\n"
+    "add %[wbptr], %[wbptr], #160\n"
+    "fmla v10.4s, v30.4s, v2.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v15.4s, v30.4s, v6.4s\n"
+    "subs x15, x15, #1\n"
+    "fmla v17.4s, v30.4s, v19.4s\n"
+    "fmla v18.4s, v30.4s, v20.4s\n"
+    "mov v25.16b, v23.16b\n"
+    "fmla v11.4s, v22.4s, v3.4s\n"
+    "fmla v7.4s, v22.4s, v5.4s\n"
+    "fmla v10.4s, v22.4s, v4.4s\n"
+    "fmla v17.4s, v22.4s, v6.4s\n"
+    "fmla v9.4s, v22.4s, v19.4s\n"
+    "fmla v25.4s, v22.4s, v20.4s\n"
+    "ldr q27, [x20, x13]\n"
+    "fmla v10.4s, v21.4s, v5.4s\n"
+    "fmla v14.4s, v24.4s, v2.4s\n"
+    "mov v22.16b, v23.16b\n"
+    "fmla v9.4s, v21.4s, v6.4s\n"
+    "mov v24.16b, v23.16b\n"
+    "mov v21.16b, v23.16b\n"
+    "fmla v16.4s, v26.4s, v1.4s\n"
+    "fmla v14.4s, v26.4s, v4.4s\n"
+    "fmla v15.4s, v26.4s, v2.4s\n"
+    "fmla v13.4s, v26.4s, v19.4s\n"
+    "fmla v8.4s, v27.4s, v0.4s\n"
+    "ldr q28, [x9, x19]\n"
+    "fmla v16.4s, v27.4s, v3.4s\n"
+    "fmla v7.4s, v27.4s, v1.4s\n"
+    "fmla v14.4s, v27.4s, v5.4s\n"
+    "fmla v15.4s, v27.4s, v4.4s\n"
+    "fmla v17.4s, v27.4s, v2.4s\n"
+    "fmla v13.4s, v27.4s, v6.4s\n"
+    "fmla v18.4s, v27.4s, v19.4s\n"
+    "fmla v22.4s, v27.4s, v20.4s\n"
+    "fmla v11.4s, v28.4s, v0.4s\n"
+    "ldr q29, [x24, x17]\n"
+    "fmla v7.4s, v28.4s, v3.4s\n"
+    "fmla v10.4s, v28.4s, v1.4s\n"
+    "fmla v15.4s, v28.4s, v5.4s\n"
+    "fmla v17.4s, v28.4s, v4.4s\n"
+    "fmla v9.4s, v28.4s, v2.4s\n"
+    "fmla v18.4s, v28.4s, v6.4s\n"
+    "fmla v25.4s, v28.4s, v19.4s\n"
+    "fmla v24.4s, v28.4s, v20.4s\n"
+    "fmla v10.4s, v29.4s, v3.4s\n"
+    "ldr q23, [%[inptr0], x11]\n"
+    "fmla v17.4s, v29.4s, v5.4s\n"
+    "add %[inptr0], %[inptr0], #16\n"
+    "fmla v9.4s, v29.4s, v4.4s\n"
+    "prfm pldl1keep, [%[inptr0], #64]\n"
+    "fmla v25.4s, v29.4s, v6.4s\n"
+    "ldr q30, [x23, %[input_col_stride1]]\n"
+    "fmla v14.4s, v30.4s, v1.4s\n"
+    "prfm pldl1keep, [%[inptr0], x16]\n"
+    "fmla v9.4s, v23.4s, v5.4s\n"
+    "ldr q23, [x22, x13]\n"
+    "fmla v13.4s, v30.4s, v2.4s\n"
+    "ldr q29, [x20, x19]\n"
+    "fmla v16.4s, v23.4s, v0.4s\n"
+    "prfm pldl1keep, [%[inptr0], x10]\n"
+    "fmla v14.4s, v23.4s, v3.4s\n"
+    "fmla v15.4s, v23.4s, v1.4s\n"
+    "fmla v13.4s, v23.4s, v4.4s\n"
+    "fmla v18.4s, v23.4s, v2.4s\n"
+    "fmla v22.4s, v23.4s, v19.4s\n"
+    "ldr q23, [x9, x17]\n"
+    "fmla v7.4s, v29.4s, v0.4s\n"
+    "fmla v15.4s, v29.4s, v3.4s\n"
+    "fmla v17.4s, v29.4s, v1.4s\n"
+    "fmla v13.4s, v29.4s, v5.4s\n"
+    "fmla v18.4s, v29.4s, v4.4s\n"
+    "fmla v25.4s, v29.4s, v2.4s\n"
+    "fmla v22.4s, v29.4s, v6.4s\n"
+    "fmla v24.4s, v29.4s, v19.4s\n"
+    "fmla v21.4s, v29.4s, v20.4s\n"
+    "ldr q26, [x24, x11]\n"
+    "fmla v10.4s, v23.4s, v0.4s\n"
+    "ldr q28, [x23, x13]\n"
+    "fmla v17.4s, v23.4s, v3.4s\n"
+    "add x24, x24, #16\n"
+    "fmla v9.4s, v23.4s, v1.4s\n"
+    "prfm pldl1keep, [x24, #64]\n"
+    "fmla v18.4s, v23.4s, v5.4s\n"
+    "prfm pldl1keep, [x24, x16]\n"
+    "fmla v25.4s, v23.4s, v4.4s\n"
+    "fmla v24.4s, v23.4s, v6.4s\n"
+    "fmla v9.4s, v26.4s, v3.4s\n"
+    "ldr q20, [x22, x19]\n"
+    "fmla v14.4s, v28.4s, v0.4s\n"
+    "fmla v13.4s, v28.4s, v1.4s\n"
+    "fmla v25.4s, v26.4s, v5.4s\n"
+    "ldr q26, [x20, x17]\n"
+    "fmla v22.4s, v28.4s, v2.4s\n"
+    "ldr q23, [x9, x11]\n"
+    "fmla v15.4s, v20.4s, v0.4s\n"
+    "add x9, x9, #16\n"
+    "fmla v13.4s, v20.4s, v3.4s\n"
+    "prfm pldl1keep, [x9, #64]\n"
+    "fmla v18.4s, v20.4s, v1.4s\n"
+    "prfm pldl1keep, [x9, x16]\n"
+    "fmla v22.4s, v20.4s, v4.4s\n"
+    "fmla v24.4s, v20.4s, v2.4s\n"
+    "fmla v21.4s, v20.4s, v19.4s\n"
+    "ldr q27, [x23, x19]\n"
+    "fmla v17.4s, v26.4s, v0.4s\n"
+    "ldr q20, [x22, x17]\n"
+    "fmla v18.4s, v26.4s, v3.4s\n"
+    "fmla v25.4s, v26.4s, v1.4s\n"
+    "fmla v22.4s, v26.4s, v5.4s\n"
+    "fmla v24.4s, v26.4s, v4.4s\n"
+    "fmla v21.4s, v26.4s, v6.4s\n"
+    "ldr q19, [x20, x11]\n"
+    "fmla v9.4s, v23.4s, v0.4s\n"
+    "ldr q28, [x23, x17]\n"
+    "fmla v25.4s, v23.4s, v3.4s\n"
+    "add x20, x20, #16\n"
+    "fmla v24.4s, v23.4s, v5.4s\n"
+    "ldr q29, [x22, x11]\n"
+    "fmla v13.4s, v27.4s, v0.4s\n"
+    "prfm pldl1keep, [x20, #64]\n"
+    "fmla v22.4s, v27.4s, v1.4s\n"
+    "add x22, x22, #16\n"
+    "fmla v21.4s, v27.4s, v2.4s\n"
+    "ldr q30, [x23, x11]\n"
+    "fmla v18.4s, v20.4s, v0.4s\n"
+    "ldr q23, [%[wbptr]]\n"
+    "fmla v22.4s, v20.4s, v3.4s\n"
+    "add x23, x23, #16\n"
+    "fmla v24.4s, v20.4s, v1.4s\n"
+    "fmla v21.4s, v20.4s, v4.4s\n"
+    "fmla v25.4s, v19.4s, v0.4s\n"
+    "ldr q20, [%[wbptr], #16]\n"
+    "fmla v22.4s, v28.4s, v0.4s\n"
+    "ldr q6, [%[wbptr], #32]\n"
+    "fmla v21.4s, v19.4s, v5.4s\n"
+    "movi v26.16b, #0\n"
+    "fmla v24.4s, v19.4s, v3.4s\n"
+    "ldr q19, [%[wbptr], #64]\n"
+    "fmax v12.4s, v12.4s, v26.4s\n"
+    "fmax v11.4s, v11.4s, v26.4s\n"
+    "fmla v21.4s, v28.4s, v1.4s\n"
+    "ldr q5, [%[wbptr], #48]\n"
+    "fmla v24.4s, v29.4s, v0.4s\n"
+    "ldr q4, [%[wbptr], #80]\n"
+    "fmax v10.4s, v10.4s, v26.4s\n"
+    "fmax v9.4s, v9.4s, v26.4s\n"
+    "fmla v21.4s, v29.4s, v3.4s\n"
+    "ldr q2, [%[wbptr], #112]\n"
+    "fmov v27.4s, #6.0\n"
+    "fmax v8.4s, v8.4s, v26.4s\n"
+    "fmax v7.4s, v7.4s, v26.4s\n"
+    "fmax v17.4s, v17.4s, v26.4s\n"
+    "fmla v21.4s, v30.4s, v0.4s\n"
+    "ldr q3, [%[wbptr], #96]\n"
+    "fmin v12.4s, v12.4s, v27.4s\n"
+    "ldr q1, [%[wbptr], #128]\n"
+    "fmin v11.4s, v11.4s, v27.4s\n"
+    "fmin v10.4s, v10.4s, v27.4s\n"
+    "str q12, [%[outptr0]]\n"
+    "fmin v9.4s, v9.4s, v27.4s\n"
+    "str q11, [%[outptr0], %[output_col_stride1]]\n"
+    "fmin v8.4s, v8.4s, v27.4s\n"
+    "str q10, [%[outptr0], x27]\n"
+    "fmin v7.4s, v7.4s, v27.4s\n"
+    "str q9, [%[outptr0], x28]\n"
+    "fmin v17.4s, v17.4s, v27.4s\n"
+    "str q8, [x8]\n"
+    "fmax v25.4s, v25.4s, v26.4s\n"
+    "str q7, [x8, %[output_col_stride1]]\n"
+    "fmax v16.4s, v16.4s, v26.4s\n"
+    "str q17, [x8, x27]\n"
+    "fmin v25.4s, v25.4s, v27.4s\n"
+    "fmin v16.4s, v16.4s, v27.4s\n"
+    "ldr q0, [%[wbptr], #144]\n"
+    "str q25, [x8, x28]\n"
+    "fmax v15.4s, v15.4s, v26.4s\n"
+    "str q16, [x25]\n"
+    "fmax v18.4s, v18.4s, v26.4s\n"
+    "fmin v15.4s, v15.4s, v27.4s\n"
+    "ldr q28, [%[inptr0]]\n"
+    "fmin v18.4s, v18.4s, v27.4s\n"
+    "ldr q25, [x24]\n"
+    "str q15, [x25, %[output_col_stride1]]\n"
+    "fmax v24.4s, v24.4s, v26.4s\n"
+    "str q18, [x25, x27]\n"
+    "fmax v14.4s, v14.4s, v26.4s\n"
+    "fmin v24.4s, v24.4s, v27.4s\n"
+    "ldr q18, [%[inptr0], %[input_col_stride1]]\n"
+    "fmin v14.4s, v14.4s, v27.4s\n"
+    "ldr q30, [x9]\n"
+    "str q24, [x25, x28]\n"
+    "fmax v13.4s, v13.4s, v26.4s\n"
+    "str q14, [x26]\n"
+    "fmax v22.4s, v22.4s, v26.4s\n"
+    "fmin v13.4s, v13.4s, v27.4s\n"
+    "ldr q29, [x24, %[input_col_stride1]]\n"
+    "fmin v22.4s, v22.4s, v27.4s\n"
+    "ldr q24, [%[inptr0], x13]\n"
+    "str q13, [x26, %[output_col_stride1]]\n"
+    "fmax v21.4s, v21.4s, v26.4s\n"
+    "str q22, [x26, x27]\n"
+    "mov v12.16b, v23.16b\n"
+    "fmin v21.4s, v21.4s, v27.4s\n"
+    "ldr q27, [x20]\n"
+    "mov v8.16b, v23.16b\n"
+    "ldr q22, [x9, %[input_col_stride1]]\n"
+    "str q21, [x26, x28]\n"
+    "mov v11.16b, v23.16b\n"
+    "mov v16.16b, v23.16b\n"
+    "add %[outptr0], %[outptr0], #16\n"
+    "mov v7.16b, v23.16b\n"
+    "add x8, x8, #16\n"
+    "mov v10.16b, v23.16b\n"
+    "add x25, x25, #16\n"
+    "mov v14.16b, v23.16b\n"
+    "add x26, x26, #16\n"
+    "mov v15.16b, v23.16b\n"
+    "mov v17.16b, v23.16b\n"
+    "mov v9.16b, v23.16b\n"
+    "fmla v12.4s, v28.4s, v20.4s\n"
+    "fmla v8.4s, v25.4s, v20.4s\n"
+    "fmla v11.4s, v18.4s, v20.4s\n"
+    "fmla v16.4s, v30.4s, v20.4s\n"
+    "fmla v12.4s, v25.4s, v19.4s\n"
+    "fmla v8.4s, v30.4s, v19.4s\n"
+    "fmla v12.4s, v18.4s, v6.4s\n"
+    "fmla v8.4s, v29.4s, v6.4s\n"
+    "fmla v12.4s, v30.4s, v2.4s\n"
+    "fmla v12.4s, v29.4s, v4.4s\n"
+    "bne 2b\n"
+    "3:\n"
+    "mov v13.16b, v23.16b\n"
+    "ldr q21, [x24, x13]\n"
+    "mov v18.16b, v23.16b\n"
+    "prfm pldl1keep, [x24, x10]\n"
+    "fmla v11.4s, v29.4s, v19.4s\n"
+    "prfm pldl1keep, [%[inptr0], x21]\n"
+    "fmla v7.4s, v29.4s, v20.4s\n"
+    "ldr q25, [%[inptr0], x19]\n"
+    "fmla v12.4s, v24.4s, v5.4s\n"
+    "prfm pldl1keep, [x22, #64]\n"
+    "fmla v11.4s, v24.4s, v6.4s\n"
+    "prfm pldl1keep, [x20, x16]\n"
+    "fmla v10.4s, v24.4s, v20.4s\n"
+    "ldr q24, [x22]\n"
+    "fmla v8.4s, v27.4s, v2.4s\n"
+    "prfm pldl1keep, [x9, x10]\n"
+    "fmla v16.4s, v27.4s, v19.4s\n"
+    "prfm pldl1keep, [x24, x21]\n"
+    "fmla v14.4s, v27.4s, v20.4s\n"
+    "ldr q26, [x20, %[input_col_stride1]]\n"
+    "fmla v12.4s, v22.4s, v1.4s\n"
+    "prfm pldl1keep, [%[inptr0], x18]\n"
+    "fmla v8.4s, v22.4s, v4.4s\n"
+    "prfm pldl1keep, [x23, #64]\n"
+    "fmla v11.4s, v22.4s, v2.4s\n"
+    "prfm pldl1keep, [x22, x16]\n"
+    "fmla v16.4s, v22.4s, v6.4s\n"
+    "prfm pldl1keep, [x20, x10]\n"
+    "fmla v7.4s, v22.4s, v19.4s\n"
+    "prfm pldl1keep, [x9, x21]\n"
+    "fmla v15.4s, v22.4s, v20.4s\n"
+    "ldr q30, [x9, x13]\n"
+    "fmla v12.4s, v21.4s, v3.4s\n"
+    "prfm pldl1keep, [x24, x18]\n"
+    "fmla v8.4s, v21.4s, v5.4s\n"
+    "prfm pldl1keep, [%[inptr0], x12]\n"
+    "fmla v11.4s, v21.4s, v4.4s\n"
+    "prfm pldl1keep, [x23, x16]\n"
+    "fmla v7.4s, v21.4s, v6.4s\n"
+    "prfm pldl1keep, [x22, x10]\n"
+    "fmla v10.4s, v21.4s, v19.4s\n"
+    "prfm pldl1keep, [x20, x21]\n"
+    "fmla v17.4s, v21.4s, v20.4s\n"
+    "ldr q22, [x24, x19]\n"
+    "fmla v11.4s, v25.4s, v5.4s\n"
+    "prfm pldl1keep, [x9, x18]\n"
+    "fmla v10.4s, v25.4s, v6.4s\n"
+    "prfm pldl1keep, [x24, x12]\n"
+    "fmla v9.4s, v25.4s, v20.4s\n"
+    "ldr q21, [%[inptr0], x17]\n"
+    "fmla v16.4s, v24.4s, v2.4s\n"
+    "prfm pldl1keep, [x23, x10]\n"
+    "fmla v14.4s, v24.4s, v19.4s\n"
+    "ldr q24, [x23]\n"
+    "fmla v8.4s, v26.4s, v1.4s\n"
+    "prfm pldl1keep, [x22, x21]\n"
+    "fmla v16.4s, v26.4s, v4.4s\n"
+    "prfm pldl1keep, [x20, x18]\n"
+    "fmla v7.4s, v26.4s, v2.4s\n"
+    "prfm pldl1keep, [x9, x12]\n"
+    "fmla v14.4s, v26.4s, v6.4s\n"
+    "prfm pldl1keep, [x23, x21]\n"
+    "fmla v15.4s, v26.4s, v19.4s\n"
+    "prfm pldl1keep, [x22, x18]\n"
+    "fmla v13.4s, v26.4s, v20.4s\n"
+    "ldr q26, [x22, %[input_col_stride1]]\n"
+    "fmla v12.4s, v30.4s, v0.4s\n"
+    "prfm pldl1keep, [x20, x12]\n"
+    "fmla v8.4s, v30.4s, v3.4s\n"
+    "prfm pldl1keep, [x23, x18]\n"
+    "fmla v11.4s, v30.4s, v1.4s\n"
+    "prfm pldl1keep, [x22, x12]\n"
+    "fmla v16.4s, v30.4s, v5.4s\n"
+    "prfm pldl1keep, [x23, x12]\n"
+    "fmla v7.4s, v30.4s, v4.4s\n"
+    "add %[wbptr], %[wbptr], #160\n"
+    "fmla v10.4s, v30.4s, v2.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v15.4s, v30.4s, v6.4s\n"
+    "fmla v17.4s, v30.4s, v19.4s\n"
+    "fmla v18.4s, v30.4s, v20.4s\n"
+    "ldr q27, [x20, x13]\n"
+    "fmla v11.4s, v22.4s, v3.4s\n"
+    "fmla v7.4s, v22.4s, v5.4s\n"
+    "fmla v10.4s, v22.4s, v4.4s\n"
+    "fmla v17.4s, v22.4s, v6.4s\n"
+    "fmla v9.4s, v22.4s, v19.4s\n"
+    "fmla v14.4s, v24.4s, v2.4s\n"
+    "mov v25.16b, v23.16b\n"
+    "fmla v16.4s, v26.4s, v1.4s\n"
+    "fmla v10.4s, v21.4s, v5.4s\n"
+    "fmla v15.4s, v26.4s, v2.4s\n"
+    "fmla v25.4s, v22.4s, v20.4s\n"
+    "ldr q28, [x9, x19]\n"
+    "fmla v9.4s, v21.4s, v6.4s\n"
+    "ldr q29, [x24, x17]\n"
+    "fmla v14.4s, v26.4s, v4.4s\n"
+    "fmla v13.4s, v26.4s, v19.4s\n"
+    "mov v22.16b, v23.16b\n"
+    "fmla v8.4s, v27.4s, v0.4s\n"
+    "fmla v16.4s, v27.4s, v3.4s\n"
+    "fmla v7.4s, v27.4s, v1.4s\n"
+    "fmla v14.4s, v27.4s, v5.4s\n"
+    "fmla v15.4s, v27.4s, v4.4s\n"
+    "fmla v17.4s, v27.4s, v2.4s\n"
+    "fmla v13.4s, v27.4s, v6.4s\n"
+    "fmla v18.4s, v27.4s, v19.4s\n"
+    "fmla v22.4s, v27.4s, v20.4s\n"
+    "mov v24.16b, v23.16b\n"
+    "mov v21.16b, v23.16b\n"
+    "fmla v11.4s, v28.4s, v0.4s\n"
+    "fmla v7.4s, v28.4s, v3.4s\n"
+    "fmla v10.4s, v28.4s, v1.4s\n"
+    "fmla v15.4s, v28.4s, v5.4s\n"
+    "fmla v17.4s, v28.4s, v4.4s\n"
+    "fmla v9.4s, v28.4s, v2.4s\n"
+    "fmla v18.4s, v28.4s, v6.4s\n"
+    "fmla v25.4s, v28.4s, v19.4s\n"
+    "fmla v24.4s, v28.4s, v20.4s\n"
+    "ldr q23, [%[inptr0], x11]\n"
+    "fmla v10.4s, v29.4s, v3.4s\n"
+    "add %[inptr0], %[inptr0], #16\n"
+    "fmla v17.4s, v29.4s, v5.4s\n"
+    "fmla v9.4s, v29.4s, v4.4s\n"
+    "fmla v25.4s, v29.4s, v6.4s\n"
+    "ldr q30, [x23, %[input_col_stride1]]\n"
+    "fmla v14.4s, v30.4s, v1.4s\n"
+    "fmla v13.4s, v30.4s, v2.4s\n"
+    "fmla v9.4s, v23.4s, v5.4s\n"
+    "ldr q23, [x22, x13]\n"
+    "fmla v16.4s, v23.4s, v0.4s\n"
+    "ldr q29, [x20, x19]\n"
+    "fmla v14.4s, v23.4s, v3.4s\n"
+    "fmla v15.4s, v23.4s, v1.4s\n"
+    "fmla v13.4s, v23.4s, v4.4s\n"
+    "fmla v18.4s, v23.4s, v2.4s\n"
+    "fmla v22.4s, v23.4s, v19.4s\n"
+    "ldr q23, [x9, x17]\n"
+    "fmla v7.4s, v29.4s, v0.4s\n"
+    "fmla v15.4s, v29.4s, v3.4s\n"
+    "fmla v17.4s, v29.4s, v1.4s\n"
+    "fmla v13.4s, v29.4s, v5.4s\n"
+    "fmla v18.4s, v29.4s, v4.4s\n"
+    "fmla v25.4s, v29.4s, v2.4s\n"
+    "fmla v22.4s, v29.4s, v6.4s\n"
+    "fmla v24.4s, v29.4s, v19.4s\n"
+    "fmla v21.4s, v29.4s, v20.4s\n"
+    "ldr q26, [x24, x11]\n"
+    "fmla v10.4s, v23.4s, v0.4s\n"
+    "ldr q28, [x23, x13]\n"
+    "fmla v17.4s, v23.4s, v3.4s\n"
+    "add x24, x24, #16\n"
+    "fmla v9.4s, v23.4s, v1.4s\n"
+    "fmla v18.4s, v23.4s, v5.4s\n"
+    "fmla v25.4s, v23.4s, v4.4s\n"
+    "fmla v24.4s, v23.4s, v6.4s\n"
+    "fmla v14.4s, v28.4s, v0.4s\n"
+    "ldr q20, [x22, x19]\n"
+    "fmla v9.4s, v26.4s, v3.4s\n"
+    "fmla v13.4s, v28.4s, v1.4s\n"
+    "fmla v25.4s, v26.4s, v5.4s\n"
+    "ldr q26, [x20, x17]\n"
+    "fmla v22.4s, v28.4s, v2.4s\n"
+    "ldr q23, [x9, x11]\n"
+    "fmla v15.4s, v20.4s, v0.4s\n"
+    "add x9, x9, #16\n"
+    "fmla v13.4s, v20.4s, v3.4s\n"
+    "fmla v18.4s, v20.4s, v1.4s\n"
+    "fmla v22.4s, v20.4s, v4.4s\n"
+    "fmla v24.4s, v20.4s, v2.4s\n"
+    "fmla v21.4s, v20.4s, v19.4s\n"
+    "ldr q27, [x23, x19]\n"
+    "fmla v17.4s, v26.4s, v0.4s\n"
+    "ldr q20, [x22, x17]\n"
+    "fmla v18.4s, v26.4s, v3.4s\n"
+    "fmla v25.4s, v26.4s, v1.4s\n"
+    "fmla v22.4s, v26.4s, v5.4s\n"
+    "fmla v24.4s, v26.4s, v4.4s\n"
+    "fmla v21.4s, v26.4s, v6.4s\n"
+    "ldr q19, [x20, x11]\n"
+    "fmla v9.4s, v23.4s, v0.4s\n"
+    "ldr q28, [x23, x17]\n"
+    "fmla v25.4s, v23.4s, v3.4s\n"
+    "add x20, x20, #16\n"
+    "fmla v24.4s, v23.4s, v5.4s\n"
+    "ldr q29, [x22, x11]\n"
+    "fmla v13.4s, v27.4s, v0.4s\n"
+    "add x22, x22, #16\n"
+    "fmla v22.4s, v27.4s, v1.4s\n"
+    "fmla v21.4s, v27.4s, v2.4s\n"
+    "fmla v18.4s, v20.4s, v0.4s\n"
+    "ldr q30, [x23, x11]\n"
+    "fmla v24.4s, v20.4s, v1.4s\n"
+    "add x23, x23, #16\n"
+    "fmla v22.4s, v20.4s, v3.4s\n"
+    "fmla v21.4s, v20.4s, v4.4s\n"
+    "fmla v25.4s, v19.4s, v0.4s\n"
+    "movi v26.16b, #0\n"
+    "fmla v24.4s, v19.4s, v3.4s\n"
+    "fmov v27.4s, #6.0\n"
+    "fmla v21.4s, v19.4s, v5.4s\n"
+    "fmla v22.4s, v28.4s, v0.4s\n"
+    "fmax v12.4s, v12.4s, v26.4s\n"
+    "fmax v11.4s, v11.4s, v26.4s\n"
+    "fmla v24.4s, v29.4s, v0.4s\n"
+    "fmax v10.4s, v10.4s, v26.4s\n"
+    "fmla v21.4s, v28.4s, v1.4s\n"
+    "fmin v12.4s, v12.4s, v27.4s\n"
+    "fmin v11.4s, v11.4s, v27.4s\n"
+    "fmin v10.4s, v10.4s, v27.4s\n"
+    "str q12, [%[outptr0]]\n"
+    "fmax v9.4s, v9.4s, v26.4s\n"
+    "str q11, [%[outptr0], %[output_col_stride1]]\n"
+    "fmla v21.4s, v29.4s, v3.4s\n"
+    "str q10, [%[outptr0], x27]\n"
+    "fmin v9.4s, v9.4s, v27.4s\n"
+    "fmax v8.4s, v8.4s, v26.4s\n"
+    "fmax v7.4s, v7.4s, v26.4s\n"
+    "str q9, [%[outptr0], x28]\n"
+    "fmla v21.4s, v30.4s, v0.4s\n"
+    "fmin v8.4s, v8.4s, v27.4s\n"
+    "add %[outptr0], %[outptr0], #16\n"
+    "fmin v7.4s, v7.4s, v27.4s\n"
+    "fmax v17.4s, v17.4s, v26.4s\n"
+    "str q8, [x8]\n"
+    "fmax v25.4s, v25.4s, v26.4s\n"
+    "str q7, [x8, %[output_col_stride1]]\n"
+    "fmin v17.4s, v17.4s, v27.4s\n"
+    "fmin v25.4s, v25.4s, v27.4s\n"
+    "fmax v16.4s, v16.4s, v26.4s\n"
+    "str q17, [x8, x27]\n"
+    "fmax v15.4s, v15.4s, v26.4s\n"
+    "str q25, [x8, x28]\n"
+    "fmin v16.4s, v16.4s, v27.4s\n"
+    "fmin v15.4s, v15.4s, v27.4s\n"
+    "add x8, x8, #16\n"
+    "str q16, [x25]\n"
+    "fmax v18.4s, v18.4s, v26.4s\n"
+    "str q15, [x25, %[output_col_stride1]]\n"
+    "fmax v24.4s, v24.4s, v26.4s\n"
+    "fmin v18.4s, v18.4s, v27.4s\n"
+    "fmax v14.4s, v14.4s, v26.4s\n"
+    "fmin v24.4s, v24.4s, v27.4s\n"
+    "fmax v13.4s, v13.4s, v26.4s\n"
+    "str q18, [x25, x27]\n"
+    "fmin v14.4s, v14.4s, v27.4s\n"
+    "str q24, [x25, x28]\n"
+    "fmin v13.4s, v13.4s, v27.4s\n"
+    "str q14, [x26]\n"
+    "fmax v22.4s, v22.4s, v26.4s\n"
+    "str q13, [x26, %[output_col_stride1]]\n"
+    "fmax v21.4s, v21.4s, v26.4s\n"
+    "fmin v22.4s, v22.4s, v27.4s\n"
+    "add x25, x25, #16\n"
+    "fmin v21.4s, v21.4s, v27.4s\n"
+    "str q22, [x26, x27]\n"
+    "str q21, [x26, x28]\n"
+    "add x26, x26, #16\n"
+    "4:\n"
+    "cbz x14, 7f\n"
+    "ldr s23, [%[wbptr]]\n"
+    "mov v12.16b, v23.16b\n"
+    "ldr s20, [%[wbptr], #4]\n"
+    "mov v8.16b, v23.16b\n"
+    "ldr s6, [%[wbptr], #8]\n"
+    "mov v11.16b, v23.16b\n"
+    "ldr s5, [%[wbptr], #12]\n"
+    "mov v16.16b, v23.16b\n"
+    "ldr s19, [%[wbptr], #16]\n"
+    "mov v7.16b, v23.16b\n"
+    "ldr s4, [%[wbptr], #20]\n"
+    "mov v10.16b, v23.16b\n"
+    "ldr s3, [%[wbptr], #24]\n"
+    "mov v14.16b, v23.16b\n"
+    "ldr s2, [%[wbptr], #28]\n"
+    "mov v15.16b, v23.16b\n"
+    "ldr s1, [%[wbptr], #32]\n"
+    "mov v17.16b, v23.16b\n"
+    "ldr s0, [%[wbptr], #36]\n"
+    "mov v9.16b, v23.16b\n"
+    "ldr s28, [%[inptr0]]\n"
+    "fmla v12.4s, v28.4s, v20.4s\n"
+    "ldr s25, [x24]\n"
+    "fmla v8.4s, v25.4s, v20.4s\n"
+    "ldr s18, [%[inptr0], %[input_col_stride1]]\n"
+    "fmla v11.4s, v18.4s, v20.4s\n"
+    "ldr s30, [x9]\n"
+    "fmla v12.4s, v25.4s, v19.4s\n"
+    "ldr s29, [x24, %[input_col_stride1]]\n"
+    "fmla v8.4s, v30.4s, v19.4s\n"
+    "ldr s24, [%[inptr0], x13]\n"
+    "fmla v16.4s, v30.4s, v20.4s\n"
+    "ldr s27, [x20]\n"
+    "fmla v12.4s, v18.4s, v6.4s\n"
+    "ldr s22, [x9, %[input_col_stride1]]\n"
+    "fmla v8.4s, v29.4s, v6.4s\n"
+    "prfm pldl1keep, [%[inptr0], #64]\n"
+    "prfm pldl1keep, [x24, #64]\n"
+    "subs x14, x14, #1\n"
+    "prfm pldl1keep, [%[inptr0], x16]\n"
+    "prfm pldl1keep, [x9, #64]\n"
+    "fmla v12.4s, v30.4s, v2.4s\n"
+    "prfm pldl1keep, [x24, x16]\n"
+    "prfm pldl1keep, [%[inptr0], x10]\n"
+    "prfm pldl1keep, [x20, #64]\n"
+    "prfm pldl1keep, [x9, x16]\n"
+    "fmla v12.4s, v29.4s, v4.4s\n"
+    "beq 6f\n"
+    "5:\n"
+    "mov v13.16b, v23.16b\n"
+    "ldr s21, [x24, x13]\n"
+    "mov v18.16b, v23.16b\n"
+    "prfm pldl1keep, [x24, x10]\n"
+    "fmla v11.4s, v29.4s, v19.4s\n"
+    "prfm pldl1keep, [%[inptr0], x21]\n"
+    "fmla v7.4s, v29.4s, v20.4s\n"
+    "ldr s25, [%[inptr0], x19]\n"
+    "fmla v12.4s, v24.4s, v5.4s\n"
+    "prfm pldl1keep, [x22, #64]\n"
+    "fmla v11.4s, v24.4s, v6.4s\n"
+    "prfm pldl1keep, [x20, x16]\n"
+    "fmla v10.4s, v24.4s, v20.4s\n"
+    "ldr s24, [x22]\n"
+    "fmla v8.4s, v27.4s, v2.4s\n"
+    "prfm pldl1keep, [x9, x10]\n"
+    "fmla v16.4s, v27.4s, v19.4s\n"
+    "prfm pldl1keep, [x24, x21]\n"
+    "fmla v14.4s, v27.4s, v20.4s\n"
+    "ldr s26, [x20, %[input_col_stride1]]\n"
+    "fmla v12.4s, v22.4s, v1.4s\n"
+    "prfm pldl1keep, [%[inptr0], x18]\n"
+    "fmla v8.4s, v22.4s, v4.4s\n"
+    "prfm pldl1keep, [x23, #64]\n"
+    "fmla v11.4s, v22.4s, v2.4s\n"
+    "prfm pldl1keep, [x22, x16]\n"
+    "fmla v16.4s, v22.4s, v6.4s\n"
+    "prfm pldl1keep, [x20, x10]\n"
+    "fmla v7.4s, v22.4s, v19.4s\n"
+    "prfm pldl1keep, [x9, x21]\n"
+    "fmla v15.4s, v22.4s, v20.4s\n"
+    "ldr s30, [x9, x13]\n"
+    "fmla v12.4s, v21.4s, v3.4s\n"
+    "prfm pldl1keep, [x24, x18]\n"
+    "fmla v8.4s, v21.4s, v5.4s\n"
+    "prfm pldl1keep, [%[inptr0], x12]\n"
+    "fmla v11.4s, v21.4s, v4.4s\n"
+    "prfm pldl1keep, [x23, x16]\n"
+    "fmla v7.4s, v21.4s, v6.4s\n"
+    "prfm pldl1keep, [x22, x10]\n"
+    "fmla v10.4s, v21.4s, v19.4s\n"
+    "prfm pldl1keep, [x20, x21]\n"
+    "fmla v17.4s, v21.4s, v20.4s\n"
+    "ldr s22, [x24, x19]\n"
+    "fmla v11.4s, v25.4s, v5.4s\n"
+    "prfm pldl1keep, [x9, x18]\n"
+    "fmla v10.4s, v25.4s, v6.4s\n"
+    "prfm pldl1keep, [x24, x12]\n"
+    "fmla v9.4s, v25.4s, v20.4s\n"
+    "ldr s21, [%[inptr0], x17]\n"
+    "fmla v16.4s, v24.4s, v2.4s\n"
+    "prfm pldl1keep, [x23, x10]\n"
+    "fmla v14.4s, v24.4s, v19.4s\n"
+    "ldr s24, [x23]\n"
+    "fmla v8.4s, v26.4s, v1.4s\n"
+    "prfm pldl1keep, [x22, x21]\n"
+    "fmla v16.4s, v26.4s, v4.4s\n"
+    "prfm pldl1keep, [x20, x18]\n"
+    "fmla v7.4s, v26.4s, v2.4s\n"
+    "prfm pldl1keep, [x9, x12]\n"
+    "fmla v14.4s, v26.4s, v6.4s\n"
+    "prfm pldl1keep, [x23, x21]\n"
+    "fmla v15.4s, v26.4s, v19.4s\n"
+    "prfm pldl1keep, [x22, x18]\n"
+    "fmla v13.4s, v26.4s, v20.4s\n"
+    "ldr s26, [x22, %[input_col_stride1]]\n"
+    "fmla v12.4s, v30.4s, v0.4s\n"
+    "prfm pldl1keep, [x20, x12]\n"
+    "fmla v8.4s, v30.4s, v3.4s\n"
+    "prfm pldl1keep, [x23, x18]\n"
+    "fmla v11.4s, v30.4s, v1.4s\n"
+    "prfm pldl1keep, [x22, x12]\n"
+    "fmla v16.4s, v30.4s, v5.4s\n"
+    "prfm pldl1keep, [x23, x12]\n"
+    "fmla v7.4s, v30.4s, v4.4s\n"
+    "add %[wbptr], %[wbptr], #40\n"
+    "fmla v10.4s, v30.4s, v2.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v15.4s, v30.4s, v6.4s\n"
+    "subs x14, x14, #1\n"
+    "fmla v17.4s, v30.4s, v19.4s\n"
+    "fmla v18.4s, v30.4s, v20.4s\n"
+    "mov v25.16b, v23.16b\n"
+    "fmla v11.4s, v22.4s, v3.4s\n"
+    "fmla v7.4s, v22.4s, v5.4s\n"
+    "fmla v10.4s, v22.4s, v4.4s\n"
+    "fmla v17.4s, v22.4s, v6.4s\n"
+    "fmla v9.4s, v22.4s, v19.4s\n"
+    "fmla v25.4s, v22.4s, v20.4s\n"
+    "ldr s27, [x20, x13]\n"
+    "fmla v10.4s, v21.4s, v5.4s\n"
+    "fmla v14.4s, v24.4s, v2.4s\n"
+    "mov v22.16b, v23.16b\n"
+    "fmla v9.4s, v21.4s, v6.4s\n"
+    "mov v24.16b, v23.16b\n"
+    "mov v21.16b, v23.16b\n"
+    "fmla v16.4s, v26.4s, v1.4s\n"
+    "fmla v14.4s, v26.4s, v4.4s\n"
+    "fmla v15.4s, v26.4s, v2.4s\n"
+    "fmla v13.4s, v26.4s, v19.4s\n"
+    "fmla v8.4s, v27.4s, v0.4s\n"
+    "ldr s28, [x9, x19]\n"
+    "fmla v16.4s, v27.4s, v3.4s\n"
+    "fmla v7.4s, v27.4s, v1.4s\n"
+    "fmla v14.4s, v27.4s, v5.4s\n"
+    "fmla v15.4s, v27.4s, v4.4s\n"
+    "fmla v17.4s, v27.4s, v2.4s\n"
+    "fmla v13.4s, v27.4s, v6.4s\n"
+    "fmla v18.4s, v27.4s, v19.4s\n"
+    "fmla v22.4s, v27.4s, v20.4s\n"
+    "fmla v11.4s, v28.4s, v0.4s\n"
+    "ldr s29, [x24, x17]\n"
+    "fmla v7.4s, v28.4s, v3.4s\n"
+    "fmla v10.4s, v28.4s, v1.4s\n"
+    "fmla v15.4s, v28.4s, v5.4s\n"
+    "fmla v17.4s, v28.4s, v4.4s\n"
+    "fmla v9.4s, v28.4s, v2.4s\n"
+    "fmla v18.4s, v28.4s, v6.4s\n"
+    "fmla v25.4s, v28.4s, v19.4s\n"
+    "fmla v24.4s, v28.4s, v20.4s\n"
+    "fmla v10.4s, v29.4s, v3.4s\n"
+    "ldr s23, [%[inptr0], x11]\n"
+    "fmla v17.4s, v29.4s, v5.4s\n"
+    "add %[inptr0], %[inptr0], #4\n"
+    "fmla v9.4s, v29.4s, v4.4s\n"
+    "prfm pldl1keep, [%[inptr0], #64]\n"
+    "fmla v25.4s, v29.4s, v6.4s\n"
+    "ldr s30, [x23, %[input_col_stride1]]\n"
+    "fmla v14.4s, v30.4s, v1.4s\n"
+    "prfm pldl1keep, [%[inptr0], x16]\n"
+    "fmla v9.4s, v23.4s, v5.4s\n"
+    "ldr s23, [x22, x13]\n"
+    "fmla v13.4s, v30.4s, v2.4s\n"
+    "ldr s29, [x20, x19]\n"
+    "fmla v16.4s, v23.4s, v0.4s\n"
+    "prfm pldl1keep, [%[inptr0], x10]\n"
+    "fmla v14.4s, v23.4s, v3.4s\n"
+    "fmla v15.4s, v23.4s, v1.4s\n"
+    "fmla v13.4s, v23.4s, v4.4s\n"
+    "fmla v18.4s, v23.4s, v2.4s\n"
+    "fmla v22.4s, v23.4s, v19.4s\n"
+    "ldr s23, [x9, x17]\n"
+    "fmla v7.4s, v29.4s, v0.4s\n"
+    "fmla v15.4s, v29.4s, v3.4s\n"
+    "fmla v17.4s, v29.4s, v1.4s\n"
+    "fmla v13.4s, v29.4s, v5.4s\n"
+    "fmla v18.4s, v29.4s, v4.4s\n"
+    "fmla v25.4s, v29.4s, v2.4s\n"
+    "fmla v22.4s, v29.4s, v6.4s\n"
+    "fmla v24.4s, v29.4s, v19.4s\n"
+    "fmla v21.4s, v29.4s, v20.4s\n"
+    "ldr s26, [x24, x11]\n"
+    "fmla v10.4s, v23.4s, v0.4s\n"
+    "ldr s28, [x23, x13]\n"
+    "fmla v17.4s, v23.4s, v3.4s\n"
+    "add x24, x24, #4\n"
+    "fmla v9.4s, v23.4s, v1.4s\n"
+    "prfm pldl1keep, [x24, #64]\n"
+    "fmla v18.4s, v23.4s, v5.4s\n"
+    "prfm pldl1keep, [x24, x16]\n"
+    "fmla v25.4s, v23.4s, v4.4s\n"
+    "fmla v24.4s, v23.4s, v6.4s\n"
+    "fmla v9.4s, v26.4s, v3.4s\n"
+    "ldr s20, [x22, x19]\n"
+    "fmla v14.4s, v28.4s, v0.4s\n"
+    "fmla v13.4s, v28.4s, v1.4s\n"
+    "fmla v25.4s, v26.4s, v5.4s\n"
+    "ldr s26, [x20, x17]\n"
+    "fmla v22.4s, v28.4s, v2.4s\n"
+    "ldr s23, [x9, x11]\n"
+    "fmla v15.4s, v20.4s, v0.4s\n"
+    "add x9, x9, #4\n"
+    "fmla v13.4s, v20.4s, v3.4s\n"
+    "prfm pldl1keep, [x9, #64]\n"
+    "fmla v18.4s, v20.4s, v1.4s\n"
+    "prfm pldl1keep, [x9, x16]\n"
+    "fmla v22.4s, v20.4s, v4.4s\n"
+    "fmla v24.4s, v20.4s, v2.4s\n"
+    "fmla v21.4s, v20.4s, v19.4s\n"
+    "ldr s27, [x23, x19]\n"
+    "fmla v17.4s, v26.4s, v0.4s\n"
+    "ldr s20, [x22, x17]\n"
+    "fmla v18.4s, v26.4s, v3.4s\n"
+    "fmla v25.4s, v26.4s, v1.4s\n"
+    "fmla v22.4s, v26.4s, v5.4s\n"
+    "fmla v24.4s, v26.4s, v4.4s\n"
+    "fmla v21.4s, v26.4s, v6.4s\n"
+    "ldr s19, [x20, x11]\n"
+    "fmla v9.4s, v23.4s, v0.4s\n"
+    "ldr s28, [x23, x17]\n"
+    "fmla v25.4s, v23.4s, v3.4s\n"
+    "add x20, x20, #4\n"
+    "fmla v24.4s, v23.4s, v5.4s\n"
+    "ldr s29, [x22, x11]\n"
+    "fmla v13.4s, v27.4s, v0.4s\n"
+    "prfm pldl1keep, [x20, #64]\n"
+    "fmla v22.4s, v27.4s, v1.4s\n"
+    "add x22, x22, #4\n"
+    "fmla v21.4s, v27.4s, v2.4s\n"
+    "ldr s30, [x23, x11]\n"
+    "fmla v18.4s, v20.4s, v0.4s\n"
+    "ldr s23, [%[wbptr]]\n"
+    "fmla v22.4s, v20.4s, v3.4s\n"
+    "add x23, x23, #4\n"
+    "fmla v24.4s, v20.4s, v1.4s\n"
+    "fmla v21.4s, v20.4s, v4.4s\n"
+    "fmla v25.4s, v19.4s, v0.4s\n"
+    "ldr s20, [%[wbptr], #4]\n"
+    "fmla v22.4s, v28.4s, v0.4s\n"
+    "ldr s6, [%[wbptr], #8]\n"
+    "fmla v21.4s, v19.4s, v5.4s\n"
+    "movi v26.16b, #0\n"
+    "fmla v24.4s, v19.4s, v3.4s\n"
+    "ldr s19, [%[wbptr], #16]\n"
+    "fmax v12.4s, v12.4s, v26.4s\n"
+    "fmax v11.4s, v11.4s, v26.4s\n"
+    "fmla v21.4s, v28.4s, v1.4s\n"
+    "ldr s5, [%[wbptr], #12]\n"
+    "fmla v24.4s, v29.4s, v0.4s\n"
+    "ldr s4, [%[wbptr], #20]\n"
+    "fmax v10.4s, v10.4s, v26.4s\n"
+    "fmax v9.4s, v9.4s, v26.4s\n"
+    "fmla v21.4s, v29.4s, v3.4s\n"
+    "ldr s2, [%[wbptr], #28]\n"
+    "fmov v27.4s, #6.0\n"
+    "fmax v8.4s, v8.4s, v26.4s\n"
+    "fmax v7.4s, v7.4s, v26.4s\n"
+    "fmax v17.4s, v17.4s, v26.4s\n"
+    "fmla v21.4s, v30.4s, v0.4s\n"
+    "ldr s3, [%[wbptr], #24]\n"
+    "fmin v12.4s, v12.4s, v27.4s\n"
+    "ldr s1, [%[wbptr], #32]\n"
+    "fmin v11.4s, v11.4s, v27.4s\n"
+    "fmin v10.4s, v10.4s, v27.4s\n"
+    "str s12, [%[outptr0]]\n"
+    "fmin v9.4s, v9.4s, v27.4s\n"
+    "str s11, [%[outptr0], %[output_col_stride1]]\n"
+    "fmin v8.4s, v8.4s, v27.4s\n"
+    "str s10, [%[outptr0], x27]\n"
+    "fmin v7.4s, v7.4s, v27.4s\n"
+    "str s9, [%[outptr0], x28]\n"
+    "fmin v17.4s, v17.4s, v27.4s\n"
+    "str s8, [x8]\n"
+    "fmax v25.4s, v25.4s, v26.4s\n"
+    "str s7, [x8, %[output_col_stride1]]\n"
+    "fmax v16.4s, v16.4s, v26.4s\n"
+    "str s17, [x8, x27]\n"
+    "fmin v25.4s, v25.4s, v27.4s\n"
+    "fmin v16.4s, v16.4s, v27.4s\n"
+    "ldr s0, [%[wbptr], #36]\n"
+    "str s25, [x8, x28]\n"
+    "fmax v15.4s, v15.4s, v26.4s\n"
+    "str s16, [x25]\n"
+    "fmax v18.4s, v18.4s, v26.4s\n"
+    "fmin v15.4s, v15.4s, v27.4s\n"
+    "ldr s28, [%[inptr0]]\n"
+    "fmin v18.4s, v18.4s, v27.4s\n"
+    "ldr s25, [x24]\n"
+    "str s15, [x25, %[output_col_stride1]]\n"
+    "fmax v24.4s, v24.4s, v26.4s\n"
+    "str s18, [x25, x27]\n"
+    "fmax v14.4s, v14.4s, v26.4s\n"
+    "fmin v24.4s, v24.4s, v27.4s\n"
+    "ldr s18, [%[inptr0], %[input_col_stride1]]\n"
+    "fmin v14.4s, v14.4s, v27.4s\n"
+    "ldr s30, [x9]\n"
+    "str s24, [x25, x28]\n"
+    "fmax v13.4s, v13.4s, v26.4s\n"
+    "str s14, [x26]\n"
+    "fmax v22.4s, v22.4s, v26.4s\n"
+    "fmin v13.4s, v13.4s, v27.4s\n"
+    "ldr s29, [x24, %[input_col_stride1]]\n"
+    "fmin v22.4s, v22.4s, v27.4s\n"
+    "ldr s24, [%[inptr0], x13]\n"
+    "str s13, [x26, %[output_col_stride1]]\n"
+    "fmax v21.4s, v21.4s, v26.4s\n"
+    "str s22, [x26, x27]\n"
+    "mov v12.16b, v23.16b\n"
+    "fmin v21.4s, v21.4s, v27.4s\n"
+    "ldr s27, [x20]\n"
+    "mov v8.16b, v23.16b\n"
+    "ldr s22, [x9, %[input_col_stride1]]\n"
+    "str s21, [x26, x28]\n"
+    "mov v11.16b, v23.16b\n"
+    "mov v16.16b, v23.16b\n"
+    "add %[outptr0], %[outptr0], #4\n"
+    "mov v7.16b, v23.16b\n"
+    "add x8, x8, #4\n"
+    "mov v10.16b, v23.16b\n"
+    "add x25, x25, #4\n"
+    "mov v14.16b, v23.16b\n"
+    "add x26, x26, #4\n"
+    "mov v15.16b, v23.16b\n"
+    "mov v17.16b, v23.16b\n"
+    "mov v9.16b, v23.16b\n"
+    "fmla v12.4s, v28.4s, v20.4s\n"
+    "fmla v8.4s, v25.4s, v20.4s\n"
+    "fmla v11.4s, v18.4s, v20.4s\n"
+    "fmla v16.4s, v30.4s, v20.4s\n"
+    "fmla v12.4s, v25.4s, v19.4s\n"
+    "fmla v8.4s, v30.4s, v19.4s\n"
+    "fmla v12.4s, v18.4s, v6.4s\n"
+    "fmla v8.4s, v29.4s, v6.4s\n"
+    "fmla v12.4s, v30.4s, v2.4s\n"
+    "fmla v12.4s, v29.4s, v4.4s\n"
+    "bne 5b\n"
+    "6:\n"
+    "mov v13.16b, v23.16b\n"
+    "ldr s21, [x24, x13]\n"
+    "mov v18.16b, v23.16b\n"
+    "prfm pldl1keep, [x24, x10]\n"
+    "fmla v11.4s, v29.4s, v19.4s\n"
+    "prfm pldl1keep, [%[inptr0], x21]\n"
+    "fmla v7.4s, v29.4s, v20.4s\n"
+    "ldr s25, [%[inptr0], x19]\n"
+    "fmla v12.4s, v24.4s, v5.4s\n"
+    "prfm pldl1keep, [x22, #64]\n"
+    "fmla v11.4s, v24.4s, v6.4s\n"
+    "prfm pldl1keep, [x20, x16]\n"
+    "fmla v10.4s, v24.4s, v20.4s\n"
+    "ldr s24, [x22]\n"
+    "fmla v8.4s, v27.4s, v2.4s\n"
+    "prfm pldl1keep, [x9, x10]\n"
+    "fmla v16.4s, v27.4s, v19.4s\n"
+    "prfm pldl1keep, [x24, x21]\n"
+    "fmla v14.4s, v27.4s, v20.4s\n"
+    "ldr s26, [x20, %[input_col_stride1]]\n"
+    "fmla v12.4s, v22.4s, v1.4s\n"
+    "prfm pldl1keep, [%[inptr0], x18]\n"
+    "fmla v8.4s, v22.4s, v4.4s\n"
+    "prfm pldl1keep, [x23, #64]\n"
+    "fmla v11.4s, v22.4s, v2.4s\n"
+    "prfm pldl1keep, [x22, x16]\n"
+    "fmla v16.4s, v22.4s, v6.4s\n"
+    "prfm pldl1keep, [x20, x10]\n"
+    "fmla v7.4s, v22.4s, v19.4s\n"
+    "prfm pldl1keep, [x9, x21]\n"
+    "fmla v15.4s, v22.4s, v20.4s\n"
+    "ldr s30, [x9, x13]\n"
+    "fmla v12.4s, v21.4s, v3.4s\n"
+    "prfm pldl1keep, [x24, x18]\n"
+    "fmla v8.4s, v21.4s, v5.4s\n"
+    "prfm pldl1keep, [%[inptr0], x12]\n"
+    "fmla v11.4s, v21.4s, v4.4s\n"
+    "prfm pldl1keep, [x23, x16]\n"
+    "fmla v7.4s, v21.4s, v6.4s\n"
+    "prfm pldl1keep, [x22, x10]\n"
+    "fmla v10.4s, v21.4s, v19.4s\n"
+    "prfm pldl1keep, [x20, x21]\n"
+    "fmla v17.4s, v21.4s, v20.4s\n"
+    "ldr s22, [x24, x19]\n"
+    "fmla v11.4s, v25.4s, v5.4s\n"
+    "prfm pldl1keep, [x9, x18]\n"
+    "fmla v10.4s, v25.4s, v6.4s\n"
+    "prfm pldl1keep, [x24, x12]\n"
+    "fmla v9.4s, v25.4s, v20.4s\n"
+    "ldr s21, [%[inptr0], x17]\n"
+    "fmla v16.4s, v24.4s, v2.4s\n"
+    "prfm pldl1keep, [x23, x10]\n"
+    "fmla v14.4s, v24.4s, v19.4s\n"
+    "ldr s24, [x23]\n"
+    "fmla v8.4s, v26.4s, v1.4s\n"
+    "prfm pldl1keep, [x22, x21]\n"
+    "fmla v16.4s, v26.4s, v4.4s\n"
+    "prfm pldl1keep, [x20, x18]\n"
+    "fmla v7.4s, v26.4s, v2.4s\n"
+    "prfm pldl1keep, [x9, x12]\n"
+    "fmla v14.4s, v26.4s, v6.4s\n"
+    "prfm pldl1keep, [x23, x21]\n"
+    "fmla v15.4s, v26.4s, v19.4s\n"
+    "prfm pldl1keep, [x22, x18]\n"
+    "fmla v13.4s, v26.4s, v20.4s\n"
+    "ldr s26, [x22, %[input_col_stride1]]\n"
+    "fmla v12.4s, v30.4s, v0.4s\n"
+    "prfm pldl1keep, [x20, x12]\n"
+    "fmla v8.4s, v30.4s, v3.4s\n"
+    "prfm pldl1keep, [x23, x18]\n"
+    "fmla v11.4s, v30.4s, v1.4s\n"
+    "prfm pldl1keep, [x22, x12]\n"
+    "fmla v16.4s, v30.4s, v5.4s\n"
+    "prfm pldl1keep, [x23, x12]\n"
+    "fmla v7.4s, v30.4s, v4.4s\n"
+    "add %[wbptr], %[wbptr], #40\n"
+    "fmla v10.4s, v30.4s, v2.4s\n"
+    "prfm pldl1keep, [%[wbptr], #64]\n"
+    "fmla v15.4s, v30.4s, v6.4s\n"
+    "fmla v17.4s, v30.4s, v19.4s\n"
+    "fmla v18.4s, v30.4s, v20.4s\n"
+    "ldr s27, [x20, x13]\n"
+    "fmla v11.4s, v22.4s, v3.4s\n"
+    "fmla v7.4s, v22.4s, v5.4s\n"
+    "fmla v10.4s, v22.4s, v4.4s\n"
+    "fmla v17.4s, v22.4s, v6.4s\n"
+    "fmla v9.4s, v22.4s, v19.4s\n"
+    "fmla v14.4s, v24.4s, v2.4s\n"
+    "mov v25.16b, v23.16b\n"
+    "fmla v16.4s, v26.4s, v1.4s\n"
+    "fmla v10.4s, v21.4s, v5.4s\n"
+    "fmla v15.4s, v26.4s, v2.4s\n"
+    "fmla v25.4s, v22.4s, v20.4s\n"
+    "ldr s28, [x9, x19]\n"
+    "fmla v9.4s, v21.4s, v6.4s\n"
+    "ldr s29, [x24, x17]\n"
+    "fmla v14.4s, v26.4s, v4.4s\n"
+    "fmla v13.4s, v26.4s, v19.4s\n"
+    "mov v22.16b, v23.16b\n"
+    "fmla v8.4s, v27.4s, v0.4s\n"
+    "fmla v16.4s, v27.4s, v3.4s\n"
+    "fmla v7.4s, v27.4s, v1.4s\n"
+    "fmla v14.4s, v27.4s, v5.4s\n"
+    "fmla v15.4s, v27.4s, v4.4s\n"
+    "fmla v17.4s, v27.4s, v2.4s\n"
+    "fmla v13.4s, v27.4s, v6.4s\n"
+    "fmla v18.4s, v27.4s, v19.4s\n"
+    "fmla v22.4s, v27.4s, v20.4s\n"
+    "mov v24.16b, v23.16b\n"
+    "mov v21.16b, v23.16b\n"
+    "fmla v11.4s, v28.4s, v0.4s\n"
+    "fmla v7.4s, v28.4s, v3.4s\n"
+    "fmla v10.4s, v28.4s, v1.4s\n"
+    "fmla v15.4s, v28.4s, v5.4s\n"
+    "fmla v17.4s, v28.4s, v4.4s\n"
+    "fmla v9.4s, v28.4s, v2.4s\n"
+    "fmla v18.4s, v28.4s, v6.4s\n"
+    "fmla v25.4s, v28.4s, v19.4s\n"
+    "fmla v24.4s, v28.4s, v20.4s\n"
+    "ldr s23, [%[inptr0], x11]\n"
+    "fmla v10.4s, v29.4s, v3.4s\n"
+    "add %[inptr0], %[inptr0], #4\n"
+    "fmla v17.4s, v29.4s, v5.4s\n"
+    "fmla v9.4s, v29.4s, v4.4s\n"
+    "fmla v25.4s, v29.4s, v6.4s\n"
+    "ldr s30, [x23, %[input_col_stride1]]\n"
+    "fmla v14.4s, v30.4s, v1.4s\n"
+    "fmla v13.4s, v30.4s, v2.4s\n"
+    "fmla v9.4s, v23.4s, v5.4s\n"
+    "ldr s23, [x22, x13]\n"
+    "fmla v16.4s, v23.4s, v0.4s\n"
+    "ldr s29, [x20, x19]\n"
+    "fmla v14.4s, v23.4s, v3.4s\n"
+    "fmla v15.4s, v23.4s, v1.4s\n"
+    "fmla v13.4s, v23.4s, v4.4s\n"
+    "fmla v18.4s, v23.4s, v2.4s\n"
+    "fmla v22.4s, v23.4s, v19.4s\n"
+    "ldr s23, [x9, x17]\n"
+    "fmla v7.4s, v29.4s, v0.4s\n"
+    "fmla v15.4s, v29.4s, v3.4s\n"
+    "fmla v17.4s, v29.4s, v1.4s\n"
+    "fmla v13.4s, v29.4s, v5.4s\n"
+    "fmla v18.4s, v29.4s, v4.4s\n"
+    "fmla v25.4s, v29.4s, v2.4s\n"
+    "fmla v22.4s, v29.4s, v6.4s\n"
+    "fmla v24.4s, v29.4s, v19.4s\n"
+    "fmla v21.4s, v29.4s, v20.4s\n"
+    "ldr s26, [x24, x11]\n"
+    "fmla v10.4s, v23.4s, v0.4s\n"
+    "ldr s28, [x23, x13]\n"
+    "fmla v17.4s, v23.4s, v3.4s\n"
+    "add x24, x24, #4\n"
+    "fmla v9.4s, v23.4s, v1.4s\n"
+    "fmla v18.4s, v23.4s, v5.4s\n"
+    "fmla v25.4s, v23.4s, v4.4s\n"
+    "fmla v24.4s, v23.4s, v6.4s\n"
+    "fmla v14.4s, v28.4s, v0.4s\n"
+    "ldr s20, [x22, x19]\n"
+    "fmla v9.4s, v26.4s, v3.4s\n"
+    "fmla v13.4s, v28.4s, v1.4s\n"
+    "fmla v25.4s, v26.4s, v5.4s\n"
+    "ldr s26, [x20, x17]\n"
+    "fmla v22.4s, v28.4s, v2.4s\n"
+    "ldr s23, [x9, x11]\n"
+    "fmla v15.4s, v20.4s, v0.4s\n"
+    "add x9, x9, #4\n"
+    "fmla v13.4s, v20.4s, v3.4s\n"
+    "fmla v18.4s, v20.4s, v1.4s\n"
+    "fmla v22.4s, v20.4s, v4.4s\n"
+    "fmla v24.4s, v20.4s, v2.4s\n"
+    "fmla v21.4s, v20.4s, v19.4s\n"
+    "ldr s27, [x23, x19]\n"
+    "fmla v17.4s, v26.4s, v0.4s\n"
+    "ldr s20, [x22, x17]\n"
+    "fmla v18.4s, v26.4s, v3.4s\n"
+    "fmla v25.4s, v26.4s, v1.4s\n"
+    "fmla v22.4s, v26.4s, v5.4s\n"
+    "fmla v24.4s, v26.4s, v4.4s\n"
+    "fmla v21.4s, v26.4s, v6.4s\n"
+    "ldr s19, [x20, x11]\n"
+    "fmla v9.4s, v23.4s, v0.4s\n"
+    "ldr s28, [x23, x17]\n"
+    "fmla v25.4s, v23.4s, v3.4s\n"
+    "add x20, x20, #4\n"
+    "fmla v24.4s, v23.4s, v5.4s\n"
+    "ldr s29, [x22, x11]\n"
+    "fmla v13.4s, v27.4s, v0.4s\n"
+    "add x22, x22, #4\n"
+    "fmla v22.4s, v27.4s, v1.4s\n"
+    "fmla v21.4s, v27.4s, v2.4s\n"
+    "fmla v18.4s, v20.4s, v0.4s\n"
+    "ldr s30, [x23, x11]\n"
+    "fmla v24.4s, v20.4s, v1.4s\n"
+    "add x23, x23, #4\n"
+    "fmla v22.4s, v20.4s, v3.4s\n"
+    "fmla v21.4s, v20.4s, v4.4s\n"
+    "fmla v25.4s, v19.4s, v0.4s\n"
+    "movi v26.16b, #0\n"
+    "fmla v24.4s, v19.4s, v3.4s\n"
+    "fmov v27.4s, #6.0\n"
+    "fmla v21.4s, v19.4s, v5.4s\n"
+    "fmla v22.4s, v28.4s, v0.4s\n"
+    "fmax v12.4s, v12.4s, v26.4s\n"
+    "fmax v11.4s, v11.4s, v26.4s\n"
+    "fmla v24.4s, v29.4s, v0.4s\n"
+    "fmax v10.4s, v10.4s, v26.4s\n"
+    "fmla v21.4s, v28.4s, v1.4s\n"
+    "fmin v12.4s, v12.4s, v27.4s\n"
+    "fmin v11.4s, v11.4s, v27.4s\n"
+    "fmin v10.4s, v10.4s, v27.4s\n"
+    "str s12, [%[outptr0]]\n"
+    "fmax v9.4s, v9.4s, v26.4s\n"
+    "str s11, [%[outptr0], %[output_col_stride1]]\n"
+    "fmla v21.4s, v29.4s, v3.4s\n"
+    "str s10, [%[outptr0], x27]\n"
+    "fmin v9.4s, v9.4s, v27.4s\n"
+    "fmax v8.4s, v8.4s, v26.4s\n"
+    "fmax v7.4s, v7.4s, v26.4s\n"
+    "str s9, [%[outptr0], x28]\n"
+    "fmla v21.4s, v30.4s, v0.4s\n"
+    "fmin v8.4s, v8.4s, v27.4s\n"
+    "add %[outptr0], %[outptr0], #4\n"
+    "fmin v7.4s, v7.4s, v27.4s\n"
+    "fmax v17.4s, v17.4s, v26.4s\n"
+    "str s8, [x8]\n"
+    "fmax v25.4s, v25.4s, v26.4s\n"
+    "str s7, [x8, %[output_col_stride1]]\n"
+    "fmin v17.4s, v17.4s, v27.4s\n"
+    "fmin v25.4s, v25.4s, v27.4s\n"
+    "fmax v16.4s, v16.4s, v26.4s\n"
+    "str s17, [x8, x27]\n"
+    "fmax v15.4s, v15.4s, v26.4s\n"
+    "str s25, [x8, x28]\n"
+    "fmin v16.4s, v16.4s, v27.4s\n"
+    "fmin v15.4s, v15.4s, v27.4s\n"
+    "add x8, x8, #4\n"
+    "str s16, [x25]\n"
+    "fmax v18.4s, v18.4s, v26.4s\n"
+    "str s15, [x25, %[output_col_stride1]]\n"
+    "fmax v24.4s, v24.4s, v26.4s\n"
+    "fmin v18.4s, v18.4s, v27.4s\n"
+    "fmax v14.4s, v14.4s, v26.4s\n"
+    "fmin v24.4s, v24.4s, v27.4s\n"
+    "fmax v13.4s, v13.4s, v26.4s\n"
+    "str s18, [x25, x27]\n"
+    "fmin v14.4s, v14.4s, v27.4s\n"
+    "str s24, [x25, x28]\n"
+    "fmin v13.4s, v13.4s, v27.4s\n"
+    "str s14, [x26]\n"
+    "fmax v22.4s, v22.4s, v26.4s\n"
+    "str s13, [x26, %[output_col_stride1]]\n"
+    "fmax v21.4s, v21.4s, v26.4s\n"
+    "fmin v22.4s, v22.4s, v27.4s\n"
+    "add x25, x25, #4\n"
+    "fmin v21.4s, v21.4s, v27.4s\n"
+    "str s22, [x26, x27]\n"
+    "str s21, [x26, x28]\n"
+    "add x26, x26, #4\n"
+    "7:\n"
+    : [inptr0] "+r" (input), [outptr0] "+r" (output), [wbptr] "+r" (weight_bias_ptr)
+    : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float))
+    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory"
+  );
 }
 
 #endif  // __aarch64__
 
-template <>
-const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
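+// Direct instantiation replaces the hand-written per-padding tile-function tables
+// removed in this hunk; the extra float template argument is presumably the new
+// bias type parameter introduced by this refactor.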
+template class DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>;
 
-template <>
-const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
-  ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
-  ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 0, 3, 0>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 1, 0, 3, 0>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 2, 0, 3, 0>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 3, 0, 3, 0>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 4, 0, 3, 0>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 5, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 5, 0, 1, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 5, 0, 2, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 5, 0, 3, 0>,
-  },
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 3>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 3>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 3>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 3>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 3>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 1>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 2>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 3>,
-  },
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
-
-template class DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float>;
 }  // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_u8_s32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_u8_s32.cpp
deleted file mode 100644
index 8f22a64..0000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_u8_s32.cpp
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "impl_u8_s32.hpp"
-
-namespace depthwise
-{
-using Conv = DepthwiseConvolution<4, 4, 3, 3, 1, 1, uint8_t, int32_t>;
-using ConvImpl = DepthwiseConvolutionImpl<4, 4, 3, 3, 1, 1, uint8_t, int32_t>;
-
-template <>
-const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
-
-template <>
-const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
-        ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
-        ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
-        {
-                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 0, 3, 0>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 1, 0, 3, 0>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 2, 0, 3, 0>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 3, 0, 3, 0>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 4, 0, 3, 0>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 5, 0, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 5, 0, 1, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 5, 0, 2, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 5, 0, 3, 0>,
-        },
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
-        {
-                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 3>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 3>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 3>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 3>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 3>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 1>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 2>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 3>,
-        },
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
-
-template class DepthwiseConvolution<4, 4, 3, 3, 1, 1, uint8_t, int32_t>;
-}  // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp16_fp16.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp16_fp16.cpp
deleted file mode 100644
index 09722d0..0000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp16_fp16.cpp
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "impl_fp16_fp16.hpp"
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-namespace depthwise
-{
-using Conv = DepthwiseConvolution<4, 4, 3, 3, 2, 2, float16_t, float16_t>;
-using ConvImpl = DepthwiseConvolutionImpl<4, 4, 3, 3, 2, 2, float16_t, float16_t>;
-
-template <>
-const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
-
-template <>
-const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
-        ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
-        ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
-        ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
-        ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
-        {
-                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 0, 3, 0>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 1, 0, 3, 0>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 2, 0, 3, 0>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 3, 0, 3, 0>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 4, 0, 3, 0>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 5, 0, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 5, 0, 1, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 5, 0, 2, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 5, 0, 3, 0>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 6, 0, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 6, 0, 1, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 6, 0, 2, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 6, 0, 3, 0>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 7, 0, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 7, 0, 1, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 7, 0, 2, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 7, 0, 3, 0>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 8, 0, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 8, 0, 1, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 8, 0, 2, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 8, 0, 3, 0>,
-        },
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
-        {
-                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 3>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 3>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 3>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 3>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 3>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 1>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 2>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 3>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 1>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 2>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 3>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 1>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 2>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 3>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 1>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 2>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 3>,
-        },
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
-
-template class DepthwiseConvolution<4, 4, 3, 3, 2, 2, float16_t, float16_t>;
-}  // namespace depthwise
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp32_fp32.cpp
index 05315ee..a04609d 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp32_fp32.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,142 +25,5 @@
 
 namespace depthwise
 {
-using Conv = DepthwiseConvolution<4, 4, 3, 3, 2, 2, float, float>;
-using ConvImpl = DepthwiseConvolutionImpl<4, 4, 3, 3, 2, 2, float, float>;
-
-template <>
-const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
-
-template <>
-const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
-  ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
-  ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
-  ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
-  ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 0, 3, 0>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 1, 0, 3, 0>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 2, 0, 3, 0>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 3, 0, 3, 0>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 4, 0, 3, 0>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 5, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 5, 0, 1, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 5, 0, 2, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 5, 0, 3, 0>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 6, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 6, 0, 1, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 6, 0, 2, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 6, 0, 3, 0>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 7, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 7, 0, 1, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 7, 0, 2, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 7, 0, 3, 0>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 8, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 8, 0, 1, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 8, 0, 2, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 8, 0, 3, 0>,
-  },
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 3>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 3>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 3>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 3>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 3>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 1>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 2>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 3>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 1>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 2>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 3>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 1>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 2>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 3>,
-  },
-  {
-    ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 0>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 1>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 2>,
-    ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 3>,
-  },
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
-
-template class DepthwiseConvolution<4, 4, 3, 3, 2, 2, float, float>;
+template class DepthwiseConvolution<4, 4, 3, 3, 2, 2, float, float, float>;
 }  // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_u8_s32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_u8_s32.cpp
deleted file mode 100644
index cf51550..0000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_u8_s32.cpp
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "impl_u8_s32.hpp"
-
-namespace depthwise
-{
-using Conv = DepthwiseConvolution<4, 4, 3, 3, 2, 2, uint8_t, int32_t>;
-using ConvImpl = DepthwiseConvolutionImpl<4, 4, 3, 3, 2, 2, uint8_t, int32_t>;
-
-template <>
-const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
-
-template <>
-const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
-        ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
-        ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
-        ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
-        ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
-        {
-                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 0, 3, 0>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 1, 0, 3, 0>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 2, 0, 3, 0>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 3, 0, 3, 0>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 4, 0, 3, 0>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 5, 0, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 5, 0, 1, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 5, 0, 2, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 5, 0, 3, 0>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 6, 0, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 6, 0, 1, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 6, 0, 2, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 6, 0, 3, 0>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 7, 0, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 7, 0, 1, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 7, 0, 2, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 7, 0, 3, 0>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 8, 0, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 8, 0, 1, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 8, 0, 2, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 8, 0, 3, 0>,
-        },
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
-        {
-                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 3>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 3>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 3>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 3>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 3>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 1>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 2>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 3>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 1>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 2>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 3>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 1>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 2>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 3>,
-        },
-        {
-                ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 0>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 1>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 2>,
-                ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 3>,
-        },
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
-
-template class DepthwiseConvolution<4, 4, 3, 3, 2, 2, uint8_t, int32_t>;
-}  // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp
new file mode 100644
index 0000000..692086c
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"
+
+// TODO Move to common utilities somewhere
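+// DType maps an element size in bytes to the unsigned integer scalar type of
+// that width, letting the packing routine below copy raw weights and biases
+// without knowing their real element type.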
+template <size_t Size> struct DType { };
+template <> struct DType<1> { using scalar_type = uint8_t; };
+template <> struct DType<2> { using scalar_type = uint16_t; };
+template <> struct DType<4> { using scalar_type = uint32_t; };
+
+namespace depthwise
+{
+
+template <unsigned int KernelRows, unsigned int KernelColumns, size_t WeightSize, size_t BiasSize>
+void PackParameters<KernelRows, KernelColumns, WeightSize, BiasSize>::execute(
+  unsigned int n_channels,
+  void *buffer,
+  const void *weights,
+  const unsigned int weight_row_stride,
+  const unsigned int weight_col_stride,
+  const void *biases
+)
+{
+  using TWeight = typename DType<WeightSize>::scalar_type;
+  using TBias = typename DType<BiasSize>::scalar_type;
+
+  auto buffer_ptr = static_cast<uint8_t *>(buffer);
+  auto weights_ptr = static_cast<const TWeight *>(weights);
+  auto biases_ptr = static_cast<const TBias *>(biases);
+
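+  // Group channels by the kernel vector length: 16 bytes per 128-bit NEON
+  // register divided by the element size.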
+  const unsigned int veclen = 16 / WeightSize;
+  for (; n_channels >= veclen; n_channels -= veclen)
+  {
+    // Copy biases
+    for (unsigned int i = 0; i < veclen; i++)
+    {
+      auto ptr = reinterpret_cast<TBias *>(buffer_ptr);
+      *ptr = (biases_ptr == nullptr) ? 0x0 : *(biases_ptr++);
+      buffer_ptr += BiasSize;
+    }
+
+    // Copy weights
+    for (unsigned int i = 0; i < KernelRows; i++)
+    {
+      for (unsigned int j = 0; j < KernelColumns; j++)
+      {
+        for (unsigned int c = 0; c < veclen; c++)
+        {
+          *(reinterpret_cast<TWeight *>(buffer_ptr)) = weights_ptr[i*weight_row_stride + j*weight_col_stride + c];
+          buffer_ptr += WeightSize;
+        }
+      }
+    }
+    weights_ptr += veclen;
+  }
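+  // Pack any remaining channels (fewer than a full vector) one at a time.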
+  for (; n_channels; n_channels--)
+  {
+    // Copy bias
+    auto ptr = reinterpret_cast<TBias *>(buffer_ptr);
+    *ptr = (biases_ptr == nullptr) ? 0x0 : *(biases_ptr++);
+    buffer_ptr += BiasSize;
+
+    // Copy weights
+    for (unsigned int i = 0; i < KernelRows; i++)
+    {
+      for (unsigned int j = 0; j < KernelColumns; j++)
+      {
+        *(reinterpret_cast<TWeight *>(buffer_ptr)) = weights_ptr[i*weight_row_stride + j*weight_col_stride];
+        buffer_ptr += WeightSize;
+      }
+    }
+    weights_ptr++;
+  }
+}
+
+template struct PackParameters<3, 3, 2ul, 2ul>;
+template struct PackParameters<3, 3, 4ul, 4ul>;
+}  // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_qa8_qa8.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_qa8_qa8.cpp
new file mode 100644
index 0000000..1989f87
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_qa8_qa8.cpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "impl_qa8_qa8.hpp"
+
+namespace depthwise
+{
+template class QAsymm8DepthwiseConvolution<2, 2, 3, 3, 1, 1>;
+template class QAsymm8DepthwiseConvolution<2, 2, 3, 3, 2, 2>;
+}  // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
index dacfb24..cbdb19a 100644
--- a/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,254 +35,206 @@
 
 #pragma once
 
+using namespace neon_convolution_kernels;
+
 namespace depthwise
 {
-// Partial specialisation for FP16 to FP16
-template <int OutputTileRows, int OutputTileCols,
-          int KernelRows, int KernelCols,
-          int StrideRows, int StrideCols>
-struct DepthwiseConvolutionImpl<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols, float16_t, float16_t>
-{
-  typedef DepthwiseConvolution<
-    OutputTileRows, OutputTileCols,
-    KernelRows, KernelCols,
-    StrideRows, StrideCols,
-    float16_t, float16_t
-  > DWC;
 
-  template <
-    bool Specialize=false,  // Specialize (or not) the method
-    int InPadTop=0,         // If specialized, top padding
-    int InPadLeft=0,        // If specialized, left padding
-    int InPadBottom=0,      // If specialized, bottom padding
-    int InPadRight=0,       // If specialized, right padding
-    int OutPadBottom=0,     // If specialized, bottom output padding
-    int OutPadRight=0       // If specialized, bottom right padding
-  >
-  static void process_tile(
-    const int n_channels,
-    const float16_t* const weights,
-    const int weight_row_stride,
-    const int weight_col_stride,
-    const float16_t* const inptr,
-    const int in_row_stride,
-    const int in_col_stride,
-    float16_t* const outptr,
-    const int out_row_stride,
-    const int out_col_stride,
-    const int in_pad_top=0,
-    const int in_pad_left=0,
-    const int in_pad_bottom=0,
-    const int in_pad_right=0,
-    const int out_pad_bottom=0,
-    const int out_pad_right=0,
-    const int input_offset=0,
-    const int weights_offset=0
-  );
-};
-
-
-template <int OTR, int OTC, int KR, int KC, int SR, int SC>
 template <
-  bool Specialize,
-  int InPadTop, int InPadLeft, int InPadBottom, int InPadRight,
-  int OutPadBottom, int OutPadRight
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols
 >
-void DepthwiseConvolutionImpl<OTR, OTC, KR, KC, SR, SC, float16_t, float16_t>::process_tile(
-  const int n_channels,
-  const float16_t *__restrict__ const weights,
-  const int weight_row_stride,
-  const int weight_col_stride,
-  const float16_t *__restrict__ const inptr,
-  const int in_row_stride,
-  const int in_col_stride,
-  float16_t *__restrict__ const outptr,
-  const int out_row_stride,
-  const int out_col_stride,
-  const int _in_pad_top,
-  const int _in_pad_left,
-  const int _in_pad_bottom,
-  const int _in_pad_right,
-  const int _out_pad_bottom,
-  const int _out_pad_right,
-  const int _input_offset,
-  const int _weights_offset
+DepthwiseConvolution<
+  OutputTileRows, OutputTileCols,
+  KernelRows, KernelCols, StrideRows, StrideCols,
+  float16_t, float16_t, float16_t
+>::DepthwiseConvolution(
+  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+  ActivationFunction activation,
+  unsigned int padding_top,
+  unsigned int padding_left,
+  unsigned int padding_bottom,
+  unsigned int padding_right
+) : Base(
+      n_batches, n_input_rows, n_input_cols, n_channels, activation,
+      padding_top, padding_left, padding_bottom, padding_right
+    )
+{
+}
+
+template <
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols
+>
+template <ActivationFunction Activation>
+void DepthwiseConvolution<
+  OutputTileRows, OutputTileCols,
+  KernelRows, KernelCols, StrideRows, StrideCols,
+  float16_t, float16_t, float16_t
+>::execute_tile(
+  int n_channels,
+  const void *weights_biases_ptr,
+  const float16_t *input,
+  const unsigned int in_row_stride,
+  const unsigned int in_col_stride,
+  float16_t *output,
+  const unsigned int out_row_stride,
+  const unsigned int out_col_stride
 )
 {
-  constexpr auto inner_tile_rows = DWC::inner_tile_rows;
-  constexpr auto inner_tile_cols = DWC::inner_tile_cols;
-  constexpr auto kernel_rows = DWC::kernel_rows;
-  constexpr auto kernel_cols = DWC::kernel_cols;
-  constexpr auto output_tile_rows = DWC::output_tile_rows;
-  constexpr auto output_tile_cols = DWC::output_tile_cols;
-  constexpr auto stride_rows = DWC::stride_rows;
-  constexpr auto stride_cols = DWC::stride_cols;
-
-  // Extract parameters
-  const int in_pad_top = Specialize ? InPadTop : _in_pad_top;
-  const int in_pad_left = Specialize ? InPadLeft : _in_pad_left;
-  const int in_pad_bottom = Specialize ? InPadBottom : _in_pad_bottom;
-  const int in_pad_right = Specialize ? InPadRight : _in_pad_right;
-  const int out_pad_bottom = Specialize ? OutPadBottom : _out_pad_bottom;
-  const int out_pad_right = Specialize ? OutPadRight : _out_pad_right;
-
-  // Compute valid ranges of the tile
-  const int in_cells_i = inner_tile_rows - in_pad_bottom;
-  const int in_cells_j = inner_tile_cols - in_pad_right;
-  const int out_cells_i = output_tile_rows - out_pad_bottom;
-  const int out_cells_j = output_tile_cols - out_pad_right;
-
   // Instantiate pointers
-  const float16_t* __restrict__ inptr_base = inptr;
-  const float16_t* __restrict__ wptr_base = weights;
-    float16_t* __restrict__ outptr_base = outptr;
+  const float16_t* __restrict__ inptr_base = input;
+  float16_t* __restrict__ outptr_base = output;
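+  // The packed parameter stream holds, per group of channels, one vector of
+  // biases followed by one vector per kernel weight, in row-major order.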
+  const float16_t* __restrict__ params = static_cast<const float16_t*>(weights_biases_ptr);
 
   // Perform the depthwise convolution
   int channels_remaining = n_channels;
-#ifdef __aarch64__
   for (; channels_remaining >= 8; channels_remaining -= 8)
   {
     // Load input tile
-    float16x8_t u[inner_tile_rows][inner_tile_cols];
-    for (int i = 0; i < inner_tile_rows; i++)
+    float16x8_t u[Base::inner_tile_rows][Base::inner_tile_cols];
+    for (int i = 0; i < Base::inner_tile_rows; i++)
     {
-      const float16_t* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
-      for (int j = 0; j < inner_tile_cols; j++)
+      const float16_t* const inptr_row = inptr_base + i*in_row_stride;
+      for (int j = 0; j < Base::inner_tile_cols; j++)
       {
-        if (i < in_pad_top || in_cells_i <= i ||
-            j < in_pad_left || in_cells_j <= j)
-        {
-          u[i][j] = vdupq_n_f16(0.0f);
-        }
-        else
-        {
-          u[i][j] = vld1q_f16(inptr_row + (j - in_pad_left)*in_col_stride);
-        }
+        u[i][j] = vld1q_f16(inptr_row + j*in_col_stride);
       }
     }
     inptr_base += 8;
 
-    // Load weights tile
+    // Load the bias and weights
-    float16x8_t w[kernel_rows][kernel_cols];
-    for (int i = 0; i < kernel_rows; i++)
+    float16x8_t vbias = vld1q_f16(params);
+    params += 8;
+
+    float16x8_t w[KernelRows][KernelCols];
+    for (unsigned int i = 0; i < KernelRows; i++)
     {
-      const float16_t* const wptr_row = wptr_base + i*weight_row_stride;
-      for (int j = 0; j < kernel_cols; j++)
+      for (unsigned int j = 0; j < KernelCols; j++)
       {
-        w[i][j] = vld1q_f16(wptr_row + j*weight_col_stride);
+        w[i][j] = vld1q_f16(params);
+        params += 8;
       }
     }
-    wptr_base += 8;
 
     // Perform the convolution
-    float16x8_t v[output_tile_rows][output_tile_cols];
-    for (int out_i = 0; out_i < out_cells_i; out_i++)
+    float16x8_t v[OutputTileRows][OutputTileCols];
+    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
     {
-      for (int out_j = 0; out_j < out_cells_j; out_j++)
+      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
       {
+        v[out_i][out_j] = vbias;
+
         // Base co-ordinate
-        const int base_i = out_i * stride_rows;
-        const int base_j = out_j * stride_cols;
+        const int base_i = out_i * StrideRows;
+        const int base_j = out_j * StrideCols;
 
         // Fill the accumulator
-        for (int in_i = 0; in_i < kernel_rows; in_i++)
+        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
         {
-          const int i = base_i + in_i;
-          for (int in_j = 0; in_j < kernel_cols; in_j++)
+          const unsigned int i = base_i + in_i;
+          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
           {
-            const int j = base_j + in_j;
-            if (in_i == 0 && in_j == 0)
-            {
-              // v[out_i][out_j] = w[in_i][in_j] * u[i][j];
-              v[out_i][out_j] = vmulq_f16(w[in_i][in_j], u[i][j]);
-            }
-            else
-            {
-              // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
-              v[out_i][out_j] = vaddq_f16(v[out_i][out_j], vmulq_f16(w[in_i][in_j], u[i][j]));
-            }
+            const unsigned int j = base_j + in_j;
+
+            // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+            v[out_i][out_j] = vaddq_f16(v[out_i][out_j], vmulq_f16(w[in_i][in_j], u[i][j]));
           }
         }
+
+        // Apply the activation function
+        if (Activation == ActivationFunction::ReLU ||
+            Activation == ActivationFunction::ReLU6)
+        {
+          v[out_i][out_j] = vmaxq_f16(v[out_i][out_j], vdupq_n_f16(0.0f));
+        }
+        if (Activation == ActivationFunction::ReLU6)
+        {
+          v[out_i][out_j] = vminq_f16(v[out_i][out_j], vdupq_n_f16(6.0f));
+        }
       }
     }
 
     // Store the output tile
-    for (int i = 0; i < out_cells_i; i++)
+    for (unsigned int i = 0; i < OutputTileRows; i++)
     {
       float16_t* const outptr_row = outptr_base + i*out_row_stride;
-      for (int j = 0; j < out_cells_j; j++)
+      for (unsigned int j = 0; j < OutputTileCols; j++)
       {
         vst1q_f16(outptr_row + j*out_col_stride, v[i][j]);
       }
     }
     outptr_base += 8;
   }
-#endif  // __aarch64__
   for (; channels_remaining; channels_remaining--)
   {
     // Load input tile
-    float16_t u[inner_tile_rows][inner_tile_cols];
-    for (int i = 0; i < inner_tile_rows; i++)
+    float16_t u[Base::inner_tile_rows][Base::inner_tile_cols];
+    for (int i = 0; i < Base::inner_tile_rows; i++)
     {
-      const float16_t* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
-      for (int j = 0; j < inner_tile_cols; j++)
+      const float16_t* const inptr_row = inptr_base + i*in_row_stride;
+      for (int j = 0; j < Base::inner_tile_cols; j++)
       {
-        if (i < in_pad_top || in_cells_i <= i ||
-            j < in_pad_left || in_cells_j <= j)
-        {
-          u[i][j] = static_cast<float16_t>(0);
-        }
-        else
-        {
-          u[i][j] = *(inptr_row + (j - in_pad_left)*in_col_stride);
-        }
+        u[i][j] = *(inptr_row + j*in_col_stride);
       }
     }
     inptr_base++;
 
-    // Load weights tile
+    // Load the bias and weights
-    float16_t w[kernel_rows][kernel_cols];
-    for (int i = 0; i < kernel_rows; i++)
+    float16_t bias = *(params++);
+    float16_t w[KernelRows][KernelCols];
+    for (unsigned int i = 0; i < KernelRows; i++)
     {
-      const float16_t* const wptr_row = wptr_base + i*weight_row_stride;
-      for (int j = 0; j < kernel_cols; j++)
+      for (unsigned int j = 0; j < KernelCols; j++)
       {
-        w[i][j] = *(wptr_row + j*weight_col_stride);
+        w[i][j] = *(params++);
       }
     }
-    wptr_base++;
 
     // Perform the convolution
-    float16_t v[output_tile_rows][output_tile_cols];
-    for (int out_i = 0; out_i < out_cells_i; out_i++)
+    float16_t v[OutputTileRows][OutputTileCols];
+    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
     {
-      for (int out_j = 0; out_j < out_cells_j; out_j++)
+      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
       {
-        // Clear the accumulator
+        // Initialise the accumulator with the bias
-        v[out_i][out_j] = static_cast<float16_t>(0);
+        v[out_i][out_j] = bias;
 
         // Base co-ordinate
-        const int base_i = out_i * stride_rows;
-        const int base_j = out_j * stride_cols;
+        const int base_i = out_i * StrideRows;
+        const int base_j = out_j * StrideCols;
 
         // Fill the accumulator
-        for (int in_i = 0; in_i < kernel_rows; in_i++)
+        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
         {
-          const int i = base_i + in_i;
-          for (int in_j = 0; in_j < kernel_cols; in_j++)
+          const unsigned int i = base_i + in_i;
+          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
           {
             const int j = base_j + in_j;
             v[out_i][out_j] += w[in_i][in_j] * u[i][j];
           }
         }
+
+        // Apply the activation function
+        if (Activation == ActivationFunction::ReLU ||
+            Activation == ActivationFunction::ReLU6)
+        {
+          v[out_i][out_j] = std::max<float16_t>(0.0f, v[out_i][out_j]);
+        }
+        if (Activation == ActivationFunction::ReLU6)
+        {
+          v[out_i][out_j] = std::min<float16_t>(6.0f, v[out_i][out_j]);
+        }
       }
     }
 
     // Store the output tile
-    for (int i = 0; i < out_cells_i; i++)
+    for (unsigned int i = 0; i < OutputTileRows; i++)
     {
       float16_t* const outptr_row = outptr_base + i*out_row_stride;
-      for (int j = 0; j < out_cells_j; j++)
+      for (unsigned int j = 0; j < OutputTileCols; j++)
       {
         *(outptr_row + j*out_col_stride) = v[i][j];
       }
@@ -290,5 +242,173 @@
     outptr_base++;
   }
 }
+
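+// Variant of execute_tile that addresses each input/output tile cell through
+// an explicit pointer array instead of row/column strides.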
+template <
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols
+>
+template <ActivationFunction Activation>
+void DepthwiseConvolution<
+  OutputTileRows, OutputTileCols,
+  KernelRows, KernelCols, StrideRows, StrideCols,
+  float16_t, float16_t, float16_t
+>::execute_tile(
+  int n_channels,
+  const void *weights_biases_ptr,
+  const float16_t * inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
+  float16_t *outptrs[Base::output_tile_rows][Base::output_tile_cols]
+)
+{
+  // Instantiate pointers
+  const float16_t* __restrict__ params = static_cast<const float16_t*>(weights_biases_ptr);
+  int n = 0;
+
+  // Perform the depthwise convolution
+  int channels_remaining = n_channels;
+  for (; channels_remaining >= 8; channels_remaining -= 8, n += 8)
+  {
+    // Load input tile
+    float16x8_t u[Base::inner_tile_rows][Base::inner_tile_cols];
+    for (int i = 0; i < Base::inner_tile_rows; i++)
+    {
+      for (int j = 0; j < Base::inner_tile_cols; j++)
+      {
+        u[i][j] = vld1q_f16(inptrs[i][j] + n);
+      }
+    }
+
+    // Load the bias and weights
+    float16x8_t vbias = vld1q_f16(params);
+    params += 8;
+
+    float16x8_t w[KernelRows][KernelCols];
+    for (unsigned int i = 0; i < KernelRows; i++)
+    {
+      for (unsigned int j = 0; j < KernelCols; j++)
+      {
+        w[i][j] = vld1q_f16(params);
+        params += 8;
+      }
+    }
+
+    // Perform the convolution
+    float16x8_t v[OutputTileRows][OutputTileCols];
+    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
+    {
+      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
+      {
+        v[out_i][out_j] = vbias;
+
+        // Base co-ordinate
+        const int base_i = out_i * StrideRows;
+        const int base_j = out_j * StrideCols;
+
+        // Fill the accumulator
+        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
+        {
+          const unsigned int i = base_i + in_i;
+          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
+          {
+            const unsigned int j = base_j + in_j;
+
+            // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+            v[out_i][out_j] = vaddq_f16(v[out_i][out_j], vmulq_f16(w[in_i][in_j], u[i][j]));
+          }
+        }
+
+        // Apply the activation function
+        if (Activation == ActivationFunction::ReLU ||
+            Activation == ActivationFunction::ReLU6)
+        {
+          v[out_i][out_j] = vmaxq_f16(v[out_i][out_j], vdupq_n_f16(0.0f));
+        }
+        if (Activation == ActivationFunction::ReLU6)
+        {
+          v[out_i][out_j] = vminq_f16(v[out_i][out_j], vdupq_n_f16(6.0f));
+        }
+      }
+    }
+
+    // Store the output tile
+    for (unsigned int i = 0; i < OutputTileRows; i++)
+    {
+      for (unsigned int j = 0; j < OutputTileCols; j++)
+      {
+        vst1q_f16(outptrs[i][j] + n, v[i][j]);
+      }
+    }
+  }
+  for (; channels_remaining; channels_remaining--, n++)
+  {
+    // Load input tile
+    float16_t u[Base::inner_tile_rows][Base::inner_tile_cols];
+    for (int i = 0; i < Base::inner_tile_rows; i++)
+    {
+      for (int j = 0; j < Base::inner_tile_cols; j++)
+      {
+        u[i][j] = *(inptrs[i][j] + n);
+      }
+    }
+
+    // Load the bias and weights
+    float16_t bias = *(params++);
+    float16_t w[KernelRows][KernelCols];
+    for (unsigned int i = 0; i < KernelRows; i++)
+    {
+      for (unsigned int j = 0; j < KernelCols; j++)
+      {
+        w[i][j] = *(params++);
+      }
+    }
+
+    // Perform the convolution
+    float16_t v[OutputTileRows][OutputTileCols];
+    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
+    {
+      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
+      {
+        // Initialise the accumulator with the bias
+        v[out_i][out_j] = bias;
+
+        // Base co-ordinate
+        const int base_i = out_i * StrideRows;
+        const int base_j = out_j * StrideCols;
+
+        // Fill the accumulator
+        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
+        {
+          const unsigned int i = base_i + in_i;
+          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
+          {
+            const int j = base_j + in_j;
+            v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+          }
+        }
+
+        // Apply the activation function
+        if (Activation == ActivationFunction::ReLU ||
+            Activation == ActivationFunction::ReLU6)
+        {
+          v[out_i][out_j] = std::max<float16_t>(0.0f, v[out_i][out_j]);
+        }
+        if (Activation == ActivationFunction::ReLU6)
+        {
+          v[out_i][out_j] = std::min<float16_t>(6.0f, v[out_i][out_j]);
+        }
+      }
+    }
+
+    // Store the output tile
+    for (unsigned int i = 0; i < OutputTileRows; i++)
+    {
+      for (unsigned int j = 0; j < OutputTileCols; j++)
+      {
+        *(outptrs[i][j] + n) = v[i][j];
+      }
+    }
+  }
+}
+
 }  // namespace depthwise
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
index 840086f..2645761 100644
--- a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,254 +35,207 @@
 
 #pragma once
 
+using namespace neon_convolution_kernels;
+
 namespace depthwise
 {
-// Partial specialisation for FP32 to FP32
-template <int OutputTileRows, int OutputTileCols,
-          int KernelRows, int KernelCols,
-          int StrideRows, int StrideCols>
-struct DepthwiseConvolutionImpl<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols, float, float>
-{
-  typedef DepthwiseConvolution<
-    OutputTileRows, OutputTileCols,
-    KernelRows, KernelCols,
-    StrideRows, StrideCols,
-    float, float
-  > DWC;
 
-  template <
-    bool Specialize=false,  // Specialize (or not) the method
-    int InPadTop=0,         // If specialized, top padding
-    int InPadLeft=0,        // If specialized, left padding
-    int InPadBottom=0,      // If specialized, bottom padding
-    int InPadRight=0,       // If specialized, right padding
-    int OutPadBottom=0,     // If specialized, bottom output padding
-    int OutPadRight=0       // If specialized, bottom right padding
-  >
-  static void process_tile(
-    const int n_channels,
-    const float* const weights,
-    const int weight_row_stride,
-    const int weight_col_stride,
-    const float* const inptr,
-    const int in_row_stride,
-    const int in_col_stride,
-    float* const outptr,
-    const int out_row_stride,
-    const int out_col_stride,
-    const int in_pad_top=0,
-    const int in_pad_left=0,
-    const int in_pad_bottom=0,
-    const int in_pad_right=0,
-    const int out_pad_bottom=0,
-    const int out_pad_right=0,
-    const int input_offset=0,
-    const int weights_offset=0
-  );
-};
-
-
-template <int OTR, int OTC, int KR, int KC, int SR, int SC>
 template <
-  bool Specialize,
-  int InPadTop, int InPadLeft, int InPadBottom, int InPadRight,
-  int OutPadBottom, int OutPadRight
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols
 >
-void DepthwiseConvolutionImpl<OTR, OTC, KR, KC, SR, SC, float, float>::process_tile(
-  const int n_channels,
-  const float *__restrict__ const weights,
-  const int weight_row_stride,
-  const int weight_col_stride,
-  const float *__restrict__ const inptr,
-  const int in_row_stride,
-  const int in_col_stride,
-  float *__restrict__ const outptr,
-  const int out_row_stride,
-  const int out_col_stride,
-  const int _in_pad_top,
-  const int _in_pad_left,
-  const int _in_pad_bottom,
-  const int _in_pad_right,
-  const int _out_pad_bottom,
-  const int _out_pad_right,
-  const int _input_offset,
-  const int _weights_offset
+DepthwiseConvolution<
+  OutputTileRows, OutputTileCols,
+  KernelRows, KernelCols, StrideRows, StrideCols,
+  float, float, float
+>::DepthwiseConvolution(
+  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+  ActivationFunction activation,
+  unsigned int padding_top,
+  unsigned int padding_left,
+  unsigned int padding_bottom,
+  unsigned int padding_right
+) : Base(
+      n_batches, n_input_rows, n_input_cols, n_channels, activation,
+      padding_top, padding_left, padding_bottom, padding_right
+    )
+{
+}
+
+
+template <
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols
+>
+template <ActivationFunction Activation>
+void DepthwiseConvolution<
+  OutputTileRows, OutputTileCols,
+  KernelRows, KernelCols, StrideRows, StrideCols,
+  float, float, float
+>::execute_tile(
+  int n_channels,
+  const void *weights_biases_ptr,
+  const float *input,
+  const unsigned int in_row_stride,
+  const unsigned int in_col_stride,
+  float *output,
+  const unsigned int out_row_stride,
+  const unsigned int out_col_stride
 )
 {
-  constexpr auto inner_tile_rows = DWC::inner_tile_rows;
-  constexpr auto inner_tile_cols = DWC::inner_tile_cols;
-  constexpr auto kernel_rows = DWC::kernel_rows;
-  constexpr auto kernel_cols = DWC::kernel_cols;
-  constexpr auto output_tile_rows = DWC::output_tile_rows;
-  constexpr auto output_tile_cols = DWC::output_tile_cols;
-  constexpr auto stride_rows = DWC::stride_rows;
-  constexpr auto stride_cols = DWC::stride_cols;
-
-  // Extract parameters
-  const int in_pad_top = Specialize ? InPadTop : _in_pad_top;
-  const int in_pad_left = Specialize ? InPadLeft : _in_pad_left;
-  const int in_pad_bottom = Specialize ? InPadBottom : _in_pad_bottom;
-  const int in_pad_right = Specialize ? InPadRight : _in_pad_right;
-  const int out_pad_bottom = Specialize ? OutPadBottom : _out_pad_bottom;
-  const int out_pad_right = Specialize ? OutPadRight : _out_pad_right;
-
-  // Compute valid ranges of the tile
-  const int in_cells_i = inner_tile_rows - in_pad_bottom;
-  const int in_cells_j = inner_tile_cols - in_pad_right;
-  const int out_cells_i = output_tile_rows - out_pad_bottom;
-  const int out_cells_j = output_tile_cols - out_pad_right;
-
   // Instantiate pointers
-  const float* __restrict__ inptr_base = inptr;
-  const float* __restrict__ wptr_base = weights;
-  float* __restrict__ outptr_base = outptr;
+  const float* __restrict__ inptr_base = input;
+  float* __restrict__ outptr_base = output;
+  const float* __restrict__ params = static_cast<const float*>(weights_biases_ptr);
 
   // Perform the depthwise convolution
   int channels_remaining = n_channels;
-#ifdef __aarch64__
   for (; channels_remaining >= 4; channels_remaining -= 4)
   {
     // Load input tile
-    float32x4_t u[inner_tile_rows][inner_tile_cols];
-    for (int i = 0; i < inner_tile_rows; i++)
+    float32x4_t u[Base::inner_tile_rows][Base::inner_tile_cols];
+    for (int i = 0; i < Base::inner_tile_rows; i++)
     {
-      const float* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
-      for (int j = 0; j < inner_tile_cols; j++)
+      const float* const inptr_row = inptr_base + i*in_row_stride;
+      for (int j = 0; j < Base::inner_tile_cols; j++)
       {
-        if (i < in_pad_top || in_cells_i <= i ||
-            j < in_pad_left || in_cells_j <= j)
-        {
-          u[i][j] = vdupq_n_f32(0.0f);
-        }
-        else
-        {
-          u[i][j] = vld1q_f32(inptr_row + (j - in_pad_left)*in_col_stride);
-        }
+        u[i][j] = vld1q_f32(inptr_row + j*in_col_stride);
       }
     }
     inptr_base += 4;
 
-    // Load weights tile
+    // Load the bias and weights
-    float32x4_t w[kernel_rows][kernel_cols];
-    for (int i = 0; i < kernel_rows; i++)
+    float32x4_t vbias = vld1q_f32(params);
+    params += 4;
+
+    float32x4_t w[KernelRows][KernelCols];
+    for (unsigned int i = 0; i < KernelRows; i++)
     {
-      const float* const wptr_row = wptr_base + i*weight_row_stride;
-      for (int j = 0; j < kernel_cols; j++)
+      for (unsigned int j = 0; j < KernelCols; j++)
       {
-        w[i][j] = vld1q_f32(wptr_row + j*weight_col_stride);
+        w[i][j] = vld1q_f32(params);
+        params += 4;
       }
     }
-    wptr_base += 4;
 
     // Perform the convolution
-    float32x4_t v[output_tile_rows][output_tile_cols];
-    for (int out_i = 0; out_i < out_cells_i; out_i++)
+    float32x4_t v[OutputTileRows][OutputTileCols];
+    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
     {
-      for (int out_j = 0; out_j < out_cells_j; out_j++)
+      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
       {
+        v[out_i][out_j] = vbias;
+
         // Base co-ordinate
-        const int base_i = out_i * stride_rows;
-        const int base_j = out_j * stride_cols;
+        const int base_i = out_i * StrideRows;
+        const int base_j = out_j * StrideCols;
 
         // Fill the accumulator
-        for (int in_i = 0; in_i < kernel_rows; in_i++)
+        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
         {
-          const int i = base_i + in_i;
-          for (int in_j = 0; in_j < kernel_cols; in_j++)
+          const unsigned int i = base_i + in_i;
+          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
           {
-            const int j = base_j + in_j;
-            if (in_i == 0 && in_j == 0)
-            {
-              // v[out_i][out_j] = w[in_i][in_j] * u[i][j];
-              v[out_i][out_j] = vmulq_f32(w[in_i][in_j], u[i][j]);
-            }
-            else
-            {
-              // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
-              v[out_i][out_j] = vmlaq_f32(v[out_i][out_j], w[in_i][in_j], u[i][j]);
-            }
+            const unsigned int j = base_j + in_j;
+
+            // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+            v[out_i][out_j] = vmlaq_f32(v[out_i][out_j], w[in_i][in_j], u[i][j]);
           }
         }
+
+        // Apply the activation function
+        if (Activation == ActivationFunction::ReLU ||
+            Activation == ActivationFunction::ReLU6)
+        {
+          v[out_i][out_j] = vmaxq_f32(v[out_i][out_j], vdupq_n_f32(0.0f));
+        }
+        if (Activation == ActivationFunction::ReLU6)
+        {
+          v[out_i][out_j] = vminq_f32(v[out_i][out_j], vdupq_n_f32(6.0f));
+        }
       }
     }
 
     // Store the output tile
-    for (int i = 0; i < out_cells_i; i++)
+    for (unsigned int i = 0; i < OutputTileRows; i++)
     {
       float* const outptr_row = outptr_base + i*out_row_stride;
-      for (int j = 0; j < out_cells_j; j++)
+      for (unsigned int j = 0; j < OutputTileCols; j++)
       {
         vst1q_f32(outptr_row + j*out_col_stride, v[i][j]);
       }
     }
     outptr_base += 4;
   }
-#endif  // __aarch64__
   for (; channels_remaining; channels_remaining--)
   {
     // Load input tile
-    float u[inner_tile_rows][inner_tile_cols];
-    for (int i = 0; i < inner_tile_rows; i++)
+    float u[Base::inner_tile_rows][Base::inner_tile_cols];
+    for (int i = 0; i < Base::inner_tile_rows; i++)
     {
-      const float* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
-      for (int j = 0; j < inner_tile_cols; j++)
+      const float* const inptr_row = inptr_base + i*in_row_stride;
+      for (int j = 0; j < Base::inner_tile_cols; j++)
       {
-        if (i < in_pad_top || in_cells_i <= i ||
-            j < in_pad_left || in_cells_j <= j)
-        {
-          u[i][j] = static_cast<float>(0);
-        }
-        else
-        {
-          u[i][j] = *(inptr_row + (j - in_pad_left)*in_col_stride);
-        }
+        u[i][j] = *(inptr_row + j*in_col_stride);
       }
     }
     inptr_base++;
 
-    // Load weights tile
+    // Load the bias and weights
-    float w[kernel_rows][kernel_cols];
-    for (int i = 0; i < kernel_rows; i++)
+    float bias = *(params++);
+    float w[KernelRows][KernelCols];
+    for (unsigned int i = 0; i < KernelRows; i++)
     {
-      const float* const wptr_row = wptr_base + i*weight_row_stride;
-      for (int j = 0; j < kernel_cols; j++)
+      for (unsigned int j = 0; j < KernelCols; j++)
       {
-        w[i][j] = *(wptr_row + j*weight_col_stride);
+        w[i][j] = *(params++);
       }
     }
-    wptr_base++;
 
     // Perform the convolution
-    float v[output_tile_rows][output_tile_cols];
-    for (int out_i = 0; out_i < out_cells_i; out_i++)
+    float v[OutputTileRows][OutputTileCols];
+    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
     {
-      for (int out_j = 0; out_j < out_cells_j; out_j++)
+      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
       {
-        // Clear the accumulator
+        // Initialise the accumulator with the bias
-        v[out_i][out_j] = static_cast<float>(0);
+        v[out_i][out_j] = bias;
 
         // Base co-ordinate
-        const int base_i = out_i * stride_rows;
-        const int base_j = out_j * stride_cols;
+        const int base_i = out_i * StrideRows;
+        const int base_j = out_j * StrideCols;
 
         // Fill the accumulator
-        for (int in_i = 0; in_i < kernel_rows; in_i++)
+        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
         {
-          const int i = base_i + in_i;
-          for (int in_j = 0; in_j < kernel_cols; in_j++)
+          const unsigned int i = base_i + in_i;
+          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
           {
             const int j = base_j + in_j;
             v[out_i][out_j] += w[in_i][in_j] * u[i][j];
           }
         }
+
+        // Apply the activation function
+        if (Activation == ActivationFunction::ReLU ||
+            Activation == ActivationFunction::ReLU6)
+        {
+          v[out_i][out_j] = std::max(0.0f, v[out_i][out_j]);
+        }
+        if (Activation == ActivationFunction::ReLU6)
+        {
+          v[out_i][out_j] = std::min(6.0f, v[out_i][out_j]);
+        }
       }
     }
 
     // Store the output tile
-    for (int i = 0; i < out_cells_i; i++)
+    for (unsigned int i = 0; i < OutputTileRows; i++)
     {
       float* const outptr_row = outptr_base + i*out_row_stride;
-      for (int j = 0; j < out_cells_j; j++)
+      for (unsigned int j = 0; j < OutputTileCols; j++)
       {
         *(outptr_row + j*out_col_stride) = v[i][j];
       }
@@ -291,4 +244,171 @@
   }
 }
 
+
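+// Pointer-array variant of execute_tile, mirroring the float16_t implementation.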
+template <
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols
+>
+template <ActivationFunction Activation>
+void DepthwiseConvolution<
+  OutputTileRows, OutputTileCols,
+  KernelRows, KernelCols, StrideRows, StrideCols,
+  float, float, float
+>::execute_tile(
+  int n_channels,
+  const void *weights_biases_ptr,
+  const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
+  float *outptrs[Base::output_tile_rows][Base::output_tile_cols]
+)
+{
+  const float* __restrict__ params = static_cast<const float*>(weights_biases_ptr);
+
+  // Perform the depthwise convolution
+  int channels_remaining = n_channels;
+  int n = 0;
+  for (; channels_remaining >= 4; channels_remaining -= 4, n += 4)
+  {
+    // Load input tile
+    float32x4_t u[Base::inner_tile_rows][Base::inner_tile_cols];
+    for (int i = 0; i < Base::inner_tile_rows; i++)
+    {
+      for (int j = 0; j < Base::inner_tile_cols; j++)
+      {
+        u[i][j] = vld1q_f32(inptrs[i][j] + n);
+      }
+    }
+
+    // Load the bias and weights
+    float32x4_t vbias = vld1q_f32(params);
+    params += 4;
+
+    float32x4_t w[KernelRows][KernelCols];
+    for (unsigned int i = 0; i < KernelRows; i++)
+    {
+      for (unsigned int j = 0; j < KernelCols; j++)
+      {
+        w[i][j] = vld1q_f32(params);
+        params += 4;
+      }
+    }
+
+    // Perform the convolution
+    float32x4_t v[OutputTileRows][OutputTileCols];
+    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
+    {
+      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
+      {
+        v[out_i][out_j] = vbias;
+
+        // Base co-ordinate
+        const int base_i = out_i * StrideRows;
+        const int base_j = out_j * StrideCols;
+
+        // Fill the accumulator
+        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
+        {
+          const unsigned int i = base_i + in_i;
+          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
+          {
+            const unsigned int j = base_j + in_j;
+
+            // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+            v[out_i][out_j] = vmlaq_f32(v[out_i][out_j], w[in_i][in_j], u[i][j]);
+          }
+        }
+
+        // Apply the activation function
+        if (Activation == ActivationFunction::ReLU ||
+            Activation == ActivationFunction::ReLU6)
+        {
+          v[out_i][out_j] = vmaxq_f32(v[out_i][out_j], vdupq_n_f32(0.0f));
+        }
+        if (Activation == ActivationFunction::ReLU6)
+        {
+          v[out_i][out_j] = vminq_f32(v[out_i][out_j], vdupq_n_f32(6.0f));
+        }
+      }
+    }
+
+    // Store the output tile
+    for (unsigned int i = 0; i < OutputTileRows; i++)
+    {
+      for (unsigned int j = 0; j < OutputTileCols; j++)
+      {
+        vst1q_f32(outptrs[i][j] + n, v[i][j]);
+      }
+    }
+  }
+  for (; channels_remaining; channels_remaining--, n++)
+  {
+    // Load input tile
+    float u[Base::inner_tile_rows][Base::inner_tile_cols];
+    for (int i = 0; i < Base::inner_tile_rows; i++)
+    {
+      for (int j = 0; j < Base::inner_tile_cols; j++)
+      {
+        u[i][j] = *(inptrs[i][j] + n);
+      }
+    }
+
+    // Load the bias and weights
+    float bias = *(params++);
+    float w[KernelRows][KernelCols];
+    for (unsigned int i = 0; i < KernelRows; i++)
+    {
+      for (unsigned int j = 0; j < KernelCols; j++)
+      {
+        w[i][j] = *(params++);
+      }
+    }
+
+    // Perform the convolution
+    float v[OutputTileRows][OutputTileCols];
+    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
+    {
+      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
+      {
+        // Initialise the accumulator with the bias
+        v[out_i][out_j] = bias;
+
+        // Base co-ordinate
+        const int base_i = out_i * StrideRows;
+        const int base_j = out_j * StrideCols;
+
+        // Fill the accumulator
+        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
+        {
+          const unsigned int i = base_i + in_i;
+          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
+          {
+            const int j = base_j + in_j;
+            v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+          }
+        }
+
+        // Apply the activation function
+        if (Activation == ActivationFunction::ReLU ||
+            Activation == ActivationFunction::ReLU6)
+        {
+          v[out_i][out_j] = std::max(0.0f, v[out_i][out_j]);
+        }
+        if (Activation == ActivationFunction::ReLU6)
+        {
+          v[out_i][out_j] = std::min(6.0f, v[out_i][out_j]);
+        }
+      }
+    }
+
+    // Store the output tile
+    for (unsigned int i = 0; i < OutputTileRows; i++)
+    {
+      for (unsigned int j = 0; j < OutputTileCols; j++)
+      {
+        *(outptrs[i][j] + n) = v[i][j];
+      }
+    }
+  }
+}
+
 }  // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp
new file mode 100644
index 0000000..5546d37
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp
@@ -0,0 +1,486 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ *
+ *          NOTE: Header to be included by implementation files only.
+ *
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ */
+
+#include <limits>
+
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp"
+
+#pragma once
+
+using namespace neon_convolution_kernels;
+using namespace qasymm8;
+
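+// Fixed-point multiply used for requantisation: returns the high 32 bits of
+// 2*a*b with rounding and saturation (the vqrdmulh semantics).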
+template <typename T>
+inline T saturating_doubling_high_mul(const T&, const int32_t&);
+
+template <>
+inline int32x4_t saturating_doubling_high_mul(const int32x4_t& a, const int32_t& b)
+{
+  return vqrdmulhq_n_s32(a, b);
+}
+
+template <>
+inline int32_t saturating_doubling_high_mul(const int32_t& a, const int32_t& b)
+{
+  return vget_lane_s32(vqrdmulh_n_s32(vdup_n_s32(a), b), 0);
+}
+
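+// Correctly-rounded-to-nearest division by 2^exponent (a rounding arithmetic
+// right shift); the fixup term handles rounding of negative values.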
+template <typename T>
+inline T rounding_divide_by_exp2(const T& x, const int exponent);
+
+template <>
+inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int exponent)
+{
+  const int32x4_t shift = vdupq_n_s32(-exponent);
+  const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31);
+  const int32x4_t fixed = vqaddq_s32(x, fixup);
+  return vrshlq_s32(fixed, shift);
+}
+
+template <>
+inline int32x2_t rounding_divide_by_exp2(const int32x2_t& x, const int exponent)
+{
+  const int32x2_t shift = vdup_n_s32(-exponent);
+  const int32x2_t fixup = vshr_n_s32(vand_s32(x, shift), 31);
+  const int32x2_t fixed = vqadd_s32(x, fixup);
+  return vrshl_s32(fixed, shift);
+}
+
+template <>
+inline int32_t rounding_divide_by_exp2(const int32_t& x, const int exponent)
+{
+  const int32x2_t xs = vdup_n_s32(x);
+  return vget_lane_s32(rounding_divide_by_exp2(xs, exponent), 0);
+}
+
+namespace depthwise
+{
+template <
+        unsigned int OutputTileRows, unsigned int OutputTileCols,
+        unsigned int KernelRows, unsigned int KernelCols,
+        unsigned int StrideRows, unsigned int StrideCols
+>
+QAsymm8DepthwiseConvolution<
+        OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
+>::QAsymm8DepthwiseConvolution(
+        int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+        const ActivationFunction activation,
+        const QAsymm8Params& weight_quantisation,
+        const QAsymm8Params& input_quantisation,
+        const QAsymm8Params& output_quantisation,
+        unsigned int padding_top,
+        unsigned int padding_left,
+        unsigned int padding_bottom,
+        unsigned int padding_right
+) : QAsymm8DepthwiseConvolution(
+        n_batches, n_input_rows, n_input_cols, n_channels,
+        activation, weight_quantisation, input_quantisation, output_quantisation,
+        QAsymm8RescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation),
+        padding_top, padding_left, padding_bottom, padding_right
+)
+{
+}
+
+template <
+        unsigned int OutputTileRows, unsigned int OutputTileCols,
+        unsigned int KernelRows, unsigned int KernelCols,
+        unsigned int StrideRows, unsigned int StrideCols
+>
+QAsymm8DepthwiseConvolution<
+        OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
+>::QAsymm8DepthwiseConvolution(
+        int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+        const ActivationFunction activation,
+        const QAsymm8Params& weight_quantisation,
+        const QAsymm8Params& input_quantisation,
+        const QAsymm8Params& output_quantisation,
+        const QAsymm8RescaleParams& rescale_params,
+        unsigned int padding_top,
+        unsigned int padding_left,
+        unsigned int padding_bottom,
+        unsigned int padding_right
+) : Base(
+        n_batches, n_input_rows, n_input_cols, n_channels, activation,
+        padding_top, padding_left, padding_bottom, padding_right
+),
+    _weights_quant(weight_quantisation),
+    _inputs_quant(input_quantisation),
+    _output_quant(output_quantisation),
+    rescale_parameters(rescale_params)
+{
+}
+
+template <
+        unsigned int OutputTileRows, unsigned int OutputTileCols,
+        unsigned int KernelRows, unsigned int KernelCols,
+        unsigned int StrideRows, unsigned int StrideCols
+>
+uint8_t QAsymm8DepthwiseConvolution<
+        OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
+>::_input_padding_value(void) const
+{
+  return _inputs_quant.offset;
+}
+
+template <
+        unsigned int OutputTileRows, unsigned int OutputTileCols,
+        unsigned int KernelRows, unsigned int KernelCols,
+        unsigned int StrideRows, unsigned int StrideCols
+>
+void QAsymm8DepthwiseConvolution<
+        OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
+>::_pack_params(
+        void * const buffer,
+        const void * const weights,
+        const unsigned int weight_row_stride,
+        const unsigned int weight_col_stride,
+        const void * const biases
+) const
+{
+  const uint8_t *wptr = static_cast<const uint8_t *>(weights);
+  const int32_t *bptr = static_cast<const int32_t *>(biases);
+  uint8_t *outptr = static_cast<uint8_t *>(buffer);
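+  // Packed layout: each group of veclen channels stores veclen int32 biases,
+  // then the kernel weights row-major with the channel index innermost.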
+
+  // Use a vector length of 8 (one 64-bit D register of uint8s) so the same
+  // packing works on both AArch64 and AArch32. NOTE: for SVE set this to half
+  // the vector length.
+  unsigned int veclen = 8;
+
+  // While there are channels left to process, pack a vector's worth of them at
+  // a time, reducing the vector length as the number of remaining channels
+  // shrinks.
+  for (unsigned int n_channels = this->n_channels(); n_channels;
+       n_channels -= veclen,
+       outptr += veclen*(sizeof(int32_t) + this->kernel_rows*this->kernel_cols))
+  {
+    // NOTE: Ignore this section if using SVE; the vector length remains the
+    // same and we simply don't fill a full register for the tail.
+    while (n_channels < veclen)
+    {
+      // Reduce the vector length to either 8 or 1 (scalar)
+      // TODO Support more vector lengths in `execute_tile`.
+      veclen = (veclen == 16) ? 8 : 1;
+    }
+
+    // Get pointers to bias and weight portions of the output structure.
+    int32_t *out_bptr = reinterpret_cast<int32_t *>(outptr);
+    uint8_t *out_wptr = outptr + veclen*sizeof(int32_t);
+
+    // Copy a vector length of elements
+    for (unsigned int n = 0; n < veclen && n < n_channels; n++)
+    {
+      const int32_t bias = (bptr != nullptr) ? *(bptr++) : 0;
+      out_bptr[n] = bias;
+
+      for (unsigned int i = 0; i < KernelRows; i++)
+      {
+        uint8_t *row_outptr = out_wptr + i*KernelCols*veclen;
+        for (unsigned int j = 0; j < KernelCols; j++)
+        {
+          uint8_t w = *(wptr + i*weight_row_stride + j*weight_col_stride);
+          row_outptr[j*veclen + n] = w;
+        }
+      }
+      wptr++;
+    }
+  }
+}
+
+template <
+        unsigned int OutputTileRows, unsigned int OutputTileCols,
+        unsigned int KernelRows, unsigned int KernelCols,
+        unsigned int StrideRows, unsigned int StrideCols,
+        typename FInput, typename FOutput
+>
+static inline void tilefn(
+        int n_channels,
+        const void* packed_params,
+        FInput &get_input_ptr,
+        FOutput &get_output_ptr,
+        const int32_t clamp_max,
+        const int32_t clamp_min,
+        const uint8_t input_offset,
+        const uint8_t weight_offset,
+        const uint8_t output_offset,
+        const int32_t requant_multiplier,
+        const int32_t requant_shift
+)
+{
+  constexpr int InnerTileRows = StrideRows * (OutputTileRows - 1) + KernelRows;
+  constexpr int InnerTileCols = StrideCols * (OutputTileCols - 1) + KernelCols;
+
+  // Offset into channels
+  int channel = 0;
+
+  // Byte type pointer to weights and biases
+  const uint8_t *wbptr = static_cast<const uint8_t *>(packed_params);
+
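+  // Vectorised main loop: eight channels at a time. Inputs and weights are
+  // offset-corrected and widened to int16, then accumulated in two int32x4
+  // halves.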
+  for (; n_channels >= 8; n_channels -= 8, channel += 8)
+  {
+    const int32x4_t biases[2] = {
+            vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
+            vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
+    };
+    wbptr += 8*sizeof(int32_t);
+
+    int16x8_t weights[KernelRows][KernelCols];
+    const uint8x8_t woffset = vdup_n_u8(weight_offset);
+    for (unsigned int i = 0; i < KernelRows; i++)
+    {
+      for (unsigned int j = 0; j < KernelCols; j++)
+      {
+        const uint8x8_t w = vld1_u8(wbptr);
+        weights[i][j] = reinterpret_cast<int16x8_t>(vsubl_u8(w, woffset));
+        wbptr += 8;
+      }
+    }
+
+    int16x8_t inputs[InnerTileRows][InnerTileCols];
+    const uint8x8_t ioffset = vdup_n_u8(input_offset);
+    for (unsigned int i = 0; i < InnerTileRows; i++)
+    {
+      for (unsigned int j = 0; j < InnerTileCols; j++)
+      {
+        const auto x = vld1_u8(get_input_ptr(i, j, channel));
+        inputs[i][j] = reinterpret_cast<int16x8_t>(vsubl_u8(x, ioffset));
+      }
+    }
+
+    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
+    {
+      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
+      {
+        int32x4_t accs[2];
+        for (unsigned int i = 0; i < 2; i++)
+        {
+          accs[i] = biases[i];
+        }
+
+        for (unsigned int wi = 0; wi < KernelRows; wi++)
+        {
+          for (unsigned int wj = 0; wj < KernelCols; wj++)
+          {
+            const auto w = weights[wi][wj];
+            const auto x = inputs[oi * StrideRows + wi][oj * StrideCols + wj];
+            accs[0] = vmlal_s16(accs[0], vget_low_s16(w), vget_low_s16(x));
+            accs[1] = vmlal_s16(accs[1], vget_high_s16(w), vget_high_s16(x));
+          }
+        }
+
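+        // Requantise: fixed-point multiply and rounding shift, then re-centre
+        // on the output offset and clamp to the activation-aware range.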
+        int32x4_t final_accs[2];
+        for (unsigned int i = 0; i < 2; i++)
+        {
+          const int32x4_t y = rounding_divide_by_exp2(
+                  saturating_doubling_high_mul(accs[i], requant_multiplier),
+                  requant_shift);
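+          // Fixed-point requantization: this computes roughly
+          // acc * multiplier / 2^(31 + shift), via a saturating doubling
+          // high multiply followed by a rounding right shift.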
+          const int32x4_t offset = vreinterpretq_s32_u32(vdupq_n_u32(output_offset));
+          final_accs[i] = vaddq_s32(y, offset);
+          final_accs[i] = vmaxq_s32(final_accs[i], vdupq_n_s32(clamp_min));
+          final_accs[i] = vminq_s32(final_accs[i], vdupq_n_s32(clamp_max));
+        }
+
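+        // Narrow the two int32x4 accumulators to eight uint8 lanes: unzipping
+        // the s32 lanes as s16 keeps the low halves, unzipping again as s8
+        // keeps the low bytes. The clamps above ensure the values fit.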
+        const auto elems_s16 = vuzpq_s16(vreinterpretq_s16_s32(final_accs[0]),
+                                         vreinterpretq_s16_s32(final_accs[1]));
+        const int8x16_t elems = vreinterpretq_s8_s16(elems_s16.val[0]);
+        const uint8x8_t output =
+                vget_low_u8(vreinterpretq_u8_s8(vuzpq_s8(elems, elems).val[0]));
+        vst1_u8(get_output_ptr(oi, oj, channel), output);
+      }
+    }
+  }
+  for (; n_channels; n_channels--, channel++)
+  {
+    // Load bias
+    const int32_t bias = *reinterpret_cast<const int32_t *>(wbptr);
+    wbptr += sizeof(int32_t);
+
+    // Load weights
+    int16_t weights[KernelRows][KernelCols];
+    for (unsigned int i = 0; i < KernelRows; i++)
+    {
+      for (unsigned int j = 0; j < KernelCols; j++)
+      {
+        weights[i][j] = *(wbptr++) - weight_offset;
+      }
+    }
+
+    // Load the input activations
+    int16_t inputs[InnerTileRows][InnerTileCols];
+    for (unsigned int i = 0; i < InnerTileRows; i++)
+    {
+      for (unsigned int j = 0; j < InnerTileCols; j++)
+      {
+        inputs[i][j] = *(get_input_ptr(i, j, channel)) - input_offset;
+      }
+    }
+
+    // Perform the convolution
+    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
+    {
+      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
+      {
+        int32_t acc = bias;
+
+        for (unsigned int wi = 0; wi < KernelRows; wi++)
+        {
+          for (unsigned int wj = 0; wj < KernelCols; wj++)
+          {
+            const auto w = weights[wi][wj], x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
+            acc += w * x;
+          }
+        }
+
+        // Requantize
+        acc = rounding_divide_by_exp2(
+                saturating_doubling_high_mul(acc, requant_multiplier),
+                requant_shift);
+        acc += output_offset;
+        acc = std::max(acc, clamp_min);
+        acc = std::min(acc, clamp_max);
+        uint8_t output = static_cast<uint8_t>(acc);
+        *(get_output_ptr(oi, oj, channel)) = output;
+      }
+    }
+  }
+}
+
+template <
+        unsigned int OutputTileRows, unsigned int OutputTileCols,
+        unsigned int KernelRows, unsigned int KernelCols,
+        unsigned int StrideRows, unsigned int StrideCols,
+        typename FInput, typename FOutput
+>
+static inline void execute_tilefn(
+        int n_channels,
+        const void* packed_params,
+        const nck::ActivationFunction actfn,
+        FInput &get_input_ptr,
+        FOutput &get_output_ptr,
+        const QAsymm8Params &input_quant,
+        const QAsymm8Params &weight_quant,
+        const QAsymm8Params &output_quant,
+        const QAsymm8RescaleParams &requant
+) {
+  // Compute min/max clamp values
+  int32_t clamp_min = std::numeric_limits<uint8_t>::min();
+  int32_t clamp_max = std::numeric_limits<uint8_t>::max();
+
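+  // With an asymmetric quantized output, real 0 maps to the zero-point
+  // offset, so ReLU clamps the quantized value from below at the offset.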
+  if (actfn == nck::ActivationFunction::ReLU ||
+      actfn == nck::ActivationFunction::ReLU6) {
+    const int32_t bottom_rail = output_quant.offset;
+    clamp_min = std::max(clamp_min, bottom_rail);
+  }
+
+  if (actfn == nck::ActivationFunction::ReLU6) {
+    const int32_t top_rail = output_quant.quantize(6.0f);
+    clamp_max = std::min(clamp_max, top_rail);
+  }
+
+  // Call the tile execution method
+  tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows,
+          StrideCols>(n_channels, packed_params, get_input_ptr, get_output_ptr,
+                      clamp_max, clamp_min, input_quant.offset,
+                      weight_quant.offset, output_quant.offset,
+                      requant.multiplier, requant.shift);
+}
+
+template <
+        unsigned int OutputTileRows, unsigned int OutputTileCols,
+        unsigned int KernelRows, unsigned int KernelCols,
+        unsigned int StrideRows, unsigned int StrideCols
+>
+template <nck::ActivationFunction Activation>
+void QAsymm8DepthwiseConvolution<
+        OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
+>::execute_tile(
+        int n_channels,
+        const void* packed_params,
+        const uint8_t* inptr,
+        unsigned int in_row_stride,
+        unsigned int in_col_stride,
+        uint8_t* outptr,
+        unsigned int out_row_stride,
+        unsigned int out_col_stride
+) {
+  // Construct methods to get pointers
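+  // The trailing `+ channel` below assumes unit stride along the channel
+  // dimension (channels-innermost, NHWC-style layout).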
+  const auto get_input_ptr = [inptr, in_row_stride, in_col_stride](
+          const int i, const int j, const int channel) {
+      return inptr + i * in_row_stride + j * in_col_stride + channel;
+  };
+
+  const auto get_output_ptr = [outptr, out_row_stride, out_col_stride](
+          const int i, const int j, const int channel) {
+      return outptr + i * out_row_stride + j * out_col_stride + channel;
+  };
+
+  execute_tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
+          StrideRows, StrideCols>(
+          n_channels, packed_params, Activation, get_input_ptr, get_output_ptr,
+          _inputs_quant, _weights_quant, _output_quant, rescale_parameters);
+}
+
+template <
+        unsigned int OutputTileRows, unsigned int OutputTileCols,
+        unsigned int KernelRows, unsigned int KernelCols,
+        unsigned int StrideRows, unsigned int StrideCols
+>
+template <nck::ActivationFunction Activation>
+void QAsymm8DepthwiseConvolution<
+        OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
+>::execute_tile(
+        int n_channels,
+        const void* packed_params,
+        const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
+        uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
+) {
+  // Construct methods to get pointers
+  const auto get_input_ptr = [inptrs](const int i, const int j,
+                                      const int channel) {
+      return inptrs[i][j] + channel;
+  };
+
+  const auto get_output_ptr = [outptrs](const int i, const int j,
+                                        const int channel) {
+      return outptrs[i][j] + channel;
+  };
+
+  // Call the tile execution method
+  execute_tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
+          StrideRows, StrideCols>(
+          n_channels, packed_params, Activation, get_input_ptr, get_output_ptr,
+          _inputs_quant, _weights_quant, _output_quant, rescale_parameters);
+}
+
+}  // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_u8_s32.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_u8_s32.hpp
deleted file mode 100644
index d0d8de5..0000000
--- a/src/core/NEON/kernels/convolution/depthwise/impl_u8_s32.hpp
+++ /dev/null
@@ -1,315 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- *
- *          NOTE: Header to be included by implementation files only.
- *
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- */
-
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"
-
-#pragma once
-
-namespace depthwise
-{
-// Partial specialisation for U8 to S32
-template <int OutputTileRows, int OutputTileCols,
-        int KernelRows, int KernelCols,
-        int StrideRows, int StrideCols>
-struct DepthwiseConvolutionImpl<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols, uint8_t, int32_t>
-{
-    typedef DepthwiseConvolution<
-            OutputTileRows, OutputTileCols,
-            KernelRows, KernelCols,
-            StrideRows, StrideCols,
-            uint8_t, int32_t
-    > DWC;
-
-    template <
-            bool Specialize=false,  // Specialize (or not) the method
-            int InPadTop=0,         // If specialized, top padding
-            int InPadLeft=0,        // If specialized, left padding
-            int InPadBottom=0,      // If specialized, bottom padding
-            int InPadRight=0,       // If specialized, right padding
-            int OutPadBottom=0,     // If specialized, bottom output padding
-            int OutPadRight=0       // If specialized, bottom right padding
-    >
-    static void process_tile(
-            const int n_channels,
-            const uint8_t* const weights,
-            const int weight_row_stride,
-            const int weight_col_stride,
-            const uint8_t* const inptr,
-            const int in_row_stride,
-            const int in_col_stride,
-            int32_t* const outptr,
-            const int out_row_stride,
-            const int out_col_stride,
-            const int in_pad_top=0,
-            const int in_pad_left=0,
-            const int in_pad_bottom=0,
-            const int in_pad_right=0,
-            const int out_pad_bottom=0,
-            const int out_pad_right=0,
-            const int input_offset=0,
-            const int weights_offset=0);
-};
-
-
-template <int OTR, int OTC, int KR, int KC, int SR, int SC>
-template <
-        bool Specialize,
-        int InPadTop, int InPadLeft, int InPadBottom, int InPadRight,
-        int OutPadBottom, int OutPadRight
->
-void DepthwiseConvolutionImpl<OTR, OTC, KR, KC, SR, SC, uint8_t, int32_t>::process_tile(
-        const int n_channels,
-        const uint8_t *__restrict__ const weights,
-        const int weight_row_stride,
-        const int weight_col_stride,
-        const uint8_t *__restrict__ const inptr,
-        const int in_row_stride,
-        const int in_col_stride,
-        int32_t *__restrict__ const outptr,
-        const int out_row_stride,
-        const int out_col_stride,
-        const int _in_pad_top,
-        const int _in_pad_left,
-        const int _in_pad_bottom,
-        const int _in_pad_right,
-        const int _out_pad_bottom,
-        const int _out_pad_right,
-        const int _input_offset,
-        const int _weights_offset
-)
-{
-    constexpr auto inner_tile_rows = DWC::inner_tile_rows;
-    constexpr auto inner_tile_cols = DWC::inner_tile_cols;
-    constexpr auto kernel_rows = DWC::kernel_rows;
-    constexpr auto kernel_cols = DWC::kernel_cols;
-    constexpr auto output_tile_rows = DWC::output_tile_rows;
-    constexpr auto output_tile_cols = DWC::output_tile_cols;
-    constexpr auto stride_rows = DWC::stride_rows;
-    constexpr auto stride_cols = DWC::stride_cols;
-
-    // Extract parameters
-    const int in_pad_top = Specialize ? InPadTop : _in_pad_top;
-    const int in_pad_left = Specialize ? InPadLeft : _in_pad_left;
-    const int in_pad_bottom = Specialize ? InPadBottom : _in_pad_bottom;
-    const int in_pad_right = Specialize ? InPadRight : _in_pad_right;
-    const int out_pad_bottom = Specialize ? OutPadBottom : _out_pad_bottom;
-    const int out_pad_right = Specialize ? OutPadRight : _out_pad_right;
-
-    // Compute valid ranges of the tile
-    const int in_cells_i = inner_tile_rows - in_pad_bottom;
-    const int in_cells_j = inner_tile_cols - in_pad_right;
-    const int out_cells_i = output_tile_rows - out_pad_bottom;
-    const int out_cells_j = output_tile_cols - out_pad_right;
-
-    // Instantiate pointers
-    const uint8_t* __restrict__ inptr_base = inptr;
-    const uint8_t* __restrict__ wptr_base = weights;
-    int32_t* __restrict__ outptr_base = outptr;
-
-    // Perform the depthwise convolution
-    int channels_remaining = n_channels;
-#ifdef __aarch64__
-    const int32x4_t v_input_offset = vdupq_n_s32(_input_offset);
-    const int32x4_t v_weights_offset = vdupq_n_s32(_weights_offset);
-    for (; channels_remaining >= 16; channels_remaining -= 16)
-    {
-        // Load input tile
-        int32x4x4_t u[inner_tile_rows][inner_tile_cols];
-        for (int i = 0; i < inner_tile_rows; i++)
-        {
-            const uint8_t* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
-            for (int j = 0; j < inner_tile_cols; j++)
-            {
-                if (i < in_pad_top || in_cells_i <= i ||
-                    j < in_pad_left || in_cells_j <= j)
-                {
-                    u[i][j].val[0] = vdupq_n_s32(0);
-                    u[i][j].val[1] = vdupq_n_s32(0);
-                    u[i][j].val[2] = vdupq_n_s32(0);
-                    u[i][j].val[3] = vdupq_n_s32(0);
-                }
-                else
-                {
-                    const uint8x16_t uv = vld1q_u8(inptr_row + (j - in_pad_left)*in_col_stride);
-                    u[i][j].val[0] = vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(uv)))));
-                    u[i][j].val[1] = vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_low_u8(uv)))));
-                    u[i][j].val[2] = vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_high_u8(uv)))));
-                    u[i][j].val[3] = vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_high_u8(uv)))));
-                }
-            }
-        }
-        inptr_base += 16;
-
-        // Load weights tile
-        int32x4x4_t w[kernel_rows][kernel_cols];
-        for (int i = 0; i < kernel_rows; i++)
-        {
-            const uint8_t* const wptr_row = wptr_base + i*weight_row_stride;
-            for (int j = 0; j < kernel_cols; j++)
-            {
-                const uint8x16_t wv = vld1q_u8(wptr_row + j*weight_col_stride);
-                w[i][j].val[0] = vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(wv)))));
-                w[i][j].val[1] = vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_low_u8(wv)))));
-                w[i][j].val[2] = vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_high_u8(wv)))));
-                w[i][j].val[3] = vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_high_u8(wv)))));
-            }
-        }
-        wptr_base += 16;
-
-        // Perform the convolution
-        int32x4x4_t v[output_tile_rows][output_tile_cols];
-        for (int out_i = 0; out_i < out_cells_i; out_i++)
-        {
-            for (int out_j = 0; out_j < out_cells_j; out_j++)
-            {
-                // Base co-ordinate
-                const int base_i = out_i * stride_rows;
-                const int base_j = out_j * stride_cols;
-
-                // Fill the accumulator
-                for (int in_i = 0; in_i < kernel_rows; in_i++)
-                {
-                    const int i = base_i + in_i;
-                    for (int in_j = 0; in_j < kernel_cols; in_j++)
-                    {
-                        const int j = base_j + in_j;
-                        if (in_i == 0 && in_j == 0)
-                        {
-                            // v[out_i][out_j] = w[in_i][in_j] * u[i][j];
-                            v[out_i][out_j].val[0] = vmulq_s32(w[in_i][in_j].val[0], u[i][j].val[0]);
-                            v[out_i][out_j].val[1] = vmulq_s32(w[in_i][in_j].val[1], u[i][j].val[1]);
-                            v[out_i][out_j].val[2] = vmulq_s32(w[in_i][in_j].val[2], u[i][j].val[2]);
-                            v[out_i][out_j].val[3] = vmulq_s32(w[in_i][in_j].val[3], u[i][j].val[3]);
-                        }
-                        else
-                        {
-                            // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
-                            v[out_i][out_j].val[0] = vmlaq_s32(v[out_i][out_j].val[0], w[in_i][in_j].val[0], u[i][j].val[0]);
-                            v[out_i][out_j].val[1] = vmlaq_s32(v[out_i][out_j].val[1], w[in_i][in_j].val[1], u[i][j].val[1]);
-                            v[out_i][out_j].val[2] = vmlaq_s32(v[out_i][out_j].val[2], w[in_i][in_j].val[2], u[i][j].val[2]);
-                            v[out_i][out_j].val[3] = vmlaq_s32(v[out_i][out_j].val[3], w[in_i][in_j].val[3], u[i][j].val[3]);
-                        }
-                    }
-                }
-            }
-        }
-
-        // Store the output tile
-        for (int i = 0; i < out_cells_i; i++)
-        {
-            int32_t* const outptr_row = outptr_base + i*out_row_stride;
-            for (int j = 0; j < out_cells_j; j++)
-            {
-                vst1q_s32(outptr_row + j*out_col_stride, v[i][j].val[0]);
-                vst1q_s32(outptr_row + j*out_col_stride + 4, v[i][j].val[1]);
-                vst1q_s32(outptr_row + j*out_col_stride + 8, v[i][j].val[2]);
-                vst1q_s32(outptr_row + j*out_col_stride + 12, v[i][j].val[3]);
-            }
-        }
-        outptr_base += 16;
-    }
-#endif  // __aarch64__
-    for (; channels_remaining; channels_remaining--)
-    {
-        // Load input tile
-        int32_t u[inner_tile_rows][inner_tile_cols];
-        for (int i = 0; i < inner_tile_rows; i++)
-        {
-            const uint8_t* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
-            for (int j = 0; j < inner_tile_cols; j++)
-            {
-                if (i < in_pad_top || in_cells_i <= i ||
-                    j < in_pad_left || in_cells_j <= j)
-                {
-                    u[i][j] = static_cast<int32_t>(0);
-                }
-                else
-                {
-                    u[i][j] = static_cast<int32_t>(*(inptr_row + (j - in_pad_left)*in_col_stride)) + _input_offset;
-                }
-            }
-        }
-        inptr_base++;
-
-        // Load weights tile
-        int32_t w[kernel_rows][kernel_cols];
-        for (int i = 0; i < kernel_rows; i++)
-        {
-            const uint8_t* const wptr_row = wptr_base + i*weight_row_stride;
-            for (int j = 0; j < kernel_cols; j++)
-            {
-                w[i][j] = static_cast<int32_t>(*(wptr_row + j*weight_col_stride)) + _weights_offset;
-            }
-        }
-        wptr_base++;
-
-        // Perform the convolution
-        int32_t v[output_tile_rows][output_tile_cols];
-        for (int out_i = 0; out_i < out_cells_i; out_i++)
-        {
-            for (int out_j = 0; out_j < out_cells_j; out_j++)
-            {
-                // Clear the accumulator
-                v[out_i][out_j] = static_cast<int32_t>(0);
-
-                // Base co-ordinate
-                const int base_i = out_i * stride_rows;
-                const int base_j = out_j * stride_cols;
-
-                // Fill the accumulator
-                for (int in_i = 0; in_i < kernel_rows; in_i++)
-                {
-                    const int i = base_i + in_i;
-                    for (int in_j = 0; in_j < kernel_cols; in_j++)
-                    {
-                        const int j = base_j + in_j;
-                        v[out_i][out_j] += w[in_i][in_j] * u[i][j];
-                    }
-                }
-            }
-        }
-
-        // Store the output tile
-        for (int i = 0; i < out_cells_i; i++)
-        {
-            int32_t* const outptr_row = outptr_base + i*out_row_stride;
-            for (int j = 0; j < out_cells_j; j++)
-            {
-                *(outptr_row + j*out_col_stride) = v[i][j];
-            }
-        }
-        outptr_base++;
-    }
-}
-
-}  // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/winograd/batched_blocked_gemm.cpp b/src/core/NEON/kernels/convolution/winograd/batched_blocked_gemm.cpp
deleted file mode 100644
index ac83bf9..0000000
--- a/src/core/NEON/kernels/convolution/winograd/batched_blocked_gemm.cpp
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/convolution/winograd/batched_blocked_gemm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/gemm.hpp"
-
-using namespace winograd;
-
-template <const int MB, const int NB, typename TIn, typename TOut>
-BatchedBlockedGemm<MB, NB, TIn, TOut>::BatchedBlockedGemm(
-  const unsigned int n_gemms,
-  const int M, const int K, const int N,
-  const int a_matrix_stride,
-  const int a_row_stride,
-  const int b_matrix_stride,
-  const int b_row_stride,
-  const int c_matrix_stride,
-  const int c_row_stride,
-  const TIn* const a_ptr,
-  const TIn* const b_ptr,
-  TOut* const c_ptr
-) : n_gemms(n_gemms), M(M), N(N), K(K),
-    a_matrix_stride(a_matrix_stride),
-    a_row_stride(a_row_stride),
-    b_matrix_stride(b_matrix_stride),
-    b_row_stride(b_row_stride),
-    c_matrix_stride(c_matrix_stride),
-    c_row_stride(c_row_stride),
-    a_ptr(a_ptr), b_ptr(b_ptr), c_ptr(c_ptr)
-{
-}
-
-template <const int MBlock, const int NBlock, typename TIn, typename TOut>
-unsigned int BatchedBlockedGemm<MBlock, NBlock, TIn, TOut>::get_window() const
-{
-  return n_gemms;
-}
-
-template <const int MBlock, const int NBlock, typename TIn, typename TOut>
-void BatchedBlockedGemm<MBlock, NBlock, TIn, TOut>::run(
-  const unsigned int start, const unsigned int stop
-)
-{
-  // Perform the specified GEMMs
-  for (unsigned int i = start; i < stop; i++)
-  {
-    // Get pointers to the relevant matrices
-    const TIn* const mtr_a = a_ptr + i*a_matrix_stride;
-    const TIn* const mtr_b = b_ptr + i*b_matrix_stride;
-    TOut* const mtr_c = c_ptr + i*c_matrix_stride;
-
-    // Perform the GEMM
-    BlockedGemm<MBlock, NBlock, TIn, TOut>(
-      mtr_a, mtr_b, mtr_c, M, K, N,
-      a_row_stride, b_row_stride, c_row_stride
-    );
-  }
-}
-
-template class winograd::BatchedBlockedGemm<4, 16, float, float>;
-
diff --git a/src/core/NEON/kernels/convolution/winograd/padding.cpp b/src/core/NEON/kernels/convolution/winograd/padding.cpp
new file mode 100644
index 0000000..46fe57c
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/padding.cpp
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <cstring>
+#include <cstdint>
+
+#include "padding.hpp"
+
+namespace padding
+{
+
+template <typename T>
+void copy_and_pad_tile(
+  const unsigned int tile_rows,
+  const unsigned int tile_cols,
+  const unsigned int n_channels,
+  const T* const inptr,
+  const unsigned int in_row_stride,
+  const unsigned int in_col_stride,
+  T* const outptr,
+  const unsigned int out_row_stride,
+  const unsigned int out_col_stride,
+  const unsigned int pad_top,
+  const unsigned int pad_left,
+  const unsigned int pad_bottom,
+  const unsigned int pad_right,
+  const T pad_value
+)
+{
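+  // Walk the padded output tile: cells inside the padding margins are filled
+  // with pad_value; interior cells are copied channel-contiguously from the
+  // corresponding input cell.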
+  for (unsigned int out_i = 0; out_i < tile_rows; out_i++)
+  {
+    for (unsigned int out_j = 0; out_j < tile_cols; out_j++)
+    {
+      T* const output = outptr + out_i*out_row_stride + out_j*out_col_stride;
+
+      if (out_i < pad_top || tile_rows - pad_bottom <= out_i ||
+          out_j < pad_left || tile_cols - pad_right <= out_j)
+      {
+        for (unsigned int n = 0; n < n_channels; n++)
+        {
+          output[n] = pad_value;
+        }
+      }
+      else
+      {
+        const auto in_i = out_i - pad_top, in_j = out_j - pad_left;
+        const T* const input = inptr + in_i*in_row_stride + in_j*in_col_stride;
+        std::memcpy(output, input, n_channels * sizeof(T));
+      }
+    }
+  }
+}
+
+template void copy_and_pad_tile(
+  unsigned int, unsigned int, unsigned int,
+  const uint8_t *, unsigned int, unsigned int,
+  uint8_t *, unsigned int, unsigned int,
+  unsigned int, unsigned int, unsigned int, unsigned int, uint8_t
+);
+
+template void copy_and_pad_tile(
+  unsigned int, unsigned int, unsigned int,
+  const float *, unsigned int, unsigned int,
+  float *, unsigned int, unsigned int,
+  unsigned int, unsigned int, unsigned int, unsigned int, float
+);
+
+template <unsigned int TileRows, unsigned int TileCols>
+void CopyCropped<TileRows, TileCols>::execute(
+  const size_t size,
+  const void * const inptr,
+  const size_t in_row_stride,
+  const size_t in_col_stride,
+  void * const outptr,
+  const size_t out_row_stride,
+  const size_t out_col_stride,
+  const unsigned int pad_top,
+  const unsigned int pad_left,
+  const unsigned int pad_bottom,
+  const unsigned int pad_right
+)
+{
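+  // Inverse of padding: copy only the valid interior of the tile, skipping
+  // the pad_top/pad_left rows and columns and stopping short of the
+  // bottom/right padding. `size` is the number of bytes copied per cell.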
+  for (unsigned int out_i = 0, in_i = pad_top; in_i < TileRows - pad_bottom; out_i++, in_i++)
+  {
+    for (unsigned int out_j = 0, in_j = pad_left; in_j < TileCols - pad_right; out_j++, in_j++)
+    {
+      std::memcpy(
+        static_cast<uint8_t *>(outptr) + out_i*out_row_stride + out_j*out_col_stride,
+        static_cast<const uint8_t *>(inptr) + in_i*in_row_stride + in_j*in_col_stride,
+        size
+      );
+    }
+  }
+}
+
+template class CopyCropped<2, 2>;
+template class CopyCropped<3, 3>;
+template class CopyCropped<4, 4>;
+
+template <typename T>
+void crop_and_copy_tile(
+  unsigned int tile_rows,
+  unsigned int tile_cols,
+  unsigned int n_channels,
+  const T *inptr,
+  unsigned int in_row_stride,
+  unsigned int in_col_stride,
+  T *outptr,
+  unsigned int out_row_stride,
+  unsigned int out_col_stride,
+  unsigned int crop_top,
+  unsigned int crop_left,
+  unsigned int crop_bottom,
+  unsigned int crop_right
+)
+{
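+  // Typed variant of the crop above: copies n_channels elements of T per
+  // valid cell from the source tile into a dense output tile.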
+  for (unsigned int out_i = 0, in_i = crop_top; in_i < tile_rows - crop_bottom; out_i++, in_i++)
+  {
+    for (unsigned int out_j = 0, in_j = crop_left; in_j < tile_cols - crop_right; out_j++, in_j++)
+    {
+      std::memcpy(
+        outptr + out_i*out_row_stride + out_j*out_col_stride,
+        inptr + in_i*in_row_stride + in_j*in_col_stride,
+        sizeof(T) * n_channels
+      );
+    }
+  }
+}
+
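+// Illustrative use (hypothetical buffers and strides, not part of this
+// patch): crop one row and one column from the bottom-right of a 4x4,
+// 16-channel fp32 tile laid out [row][col][channel]:
+//
+//   crop_and_copy_tile(4u, 4u, 16u,
+//                      src, 4u * 16u, 16u,  // input row/col strides (elements)
+//                      dst, 3u * 16u, 16u,  // output row/col strides
+//                      0u, 0u, 1u, 1u);     // crop_top/left/bottom/right
+//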
+template void crop_and_copy_tile(
+  unsigned int tile_rows,
+  unsigned int tile_cols,
+  unsigned int n_channels,
+  const float *inptr,
+  unsigned int in_row_stride,
+  unsigned int in_col_stride,
+  float *outptr,
+  unsigned int out_row_stride,
+  unsigned int out_col_stride,
+  unsigned int crop_top,
+  unsigned int crop_left,
+  unsigned int crop_bottom,
+  unsigned int crop_right
+);
+
+}  // namespace padding
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/input_1x8_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/input_1x8_fp32.cpp
deleted file mode 100644
index e66300d..0000000
--- a/src/core/NEON/kernels/convolution/winograd/transforms/input_1x8_fp32.cpp
+++ /dev/null
@@ -1,261 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-
-namespace
-{
-
-template <bool Specialized, int PadTop=0, int PadLeft=0, int PadBottom=0, int PadRight=0>
-void winograd_input_transform_1x8_fp32_process_tile(
-  int n_channels,
-  const float* const input_base,
-  const int input_row_stride,
-  const int input_col_stride,
-  float* const matrix_base,
-  const int matrix_stride,
-  const int _pad_top,
-  const int _pad_left,
-  const int _pad_bottom,
-  const int _pad_right
-)
-{
-  (void) input_row_stride;  // No rows over which to stride
-  (void) _pad_top;  // Never any top padding
-  (void) _pad_bottom;  // Never any bottom padding
-
-  // Extract padding arguments
-  const int pad_left = Specialized ? PadLeft : _pad_left;
-  const int pad_right = Specialized ? PadRight : _pad_right;
-
-  constexpr int inner_tile_cols = 8;
-  const int cells_j = inner_tile_cols - pad_right;
-
-  float *outptr = matrix_base;
-
-  // Get pointers into the input tile
-  const float *x_ptrs[inner_tile_cols];
-  for (int j = pad_left, xj = 0; j < cells_j; j++, xj++)
-  {
-    x_ptrs[j] = input_base + xj*input_col_stride;
-  }
-
-  // Vectors used/computed in this kernel.
-  float x[inner_tile_cols];
-  float U[inner_tile_cols];
-
-  for (int j = 0; j < inner_tile_cols; j++)
-  {
-    x[j] = 0.0f;
-  }
-
-  // Perform the Winograd input transformation for each channel in the input
-  // tensor.
-  int channels_remaining = n_channels;
-#ifdef __arm_any__
-  for (; channels_remaining >= 4; channels_remaining -= 4)
-  {
-    float32x4_t x[inner_tile_cols], U[inner_tile_cols];
-    for (int j = 0; j < inner_tile_cols; j++)
-    {
-      x[j] = vdupq_n_f32(0.0f);
-    }
-
-    // Load x
-    for (int j = pad_left; j < cells_j; j++)
-    {
-      x[j] = vld1q_f32(x_ptrs[j]);
-      x_ptrs[j] += 4;
-    }
-
-    // Compute U = x . X
-    U[0] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[2], 49), x[4], -14), x[0], -36);
-    U[1] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[2], 36), x[3], 13), x[4], -13), x[1], -36), x[5], -1);
-    U[2] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[5], 1), x[2], 36), x[1], 36), x[4], -13), x[3], -13);
-    U[3] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[3], 20), x[2], 9), x[5], -2), x[4], -10), x[1], -18);
-    U[4] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[1], 18), x[2], 9), x[5], 2), x[4], -10), x[3], -20);
-    U[5] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[3], 15), x[2], 4), x[5], -3), x[4], -5), x[1], -12);
-    U[6] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[1], 12), x[2], 4), x[5], 3), x[4], -5), x[3], -15);
-    U[7] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[7], 1), x[3], 49), x[5], -14), x[1], -36);
-
-    // Store the transformed vector
-    for (int j = 0; j < inner_tile_cols; j++)
-    {
-      vst1q_f32(outptr + j*matrix_stride, U[j]);
-    }
-    outptr += 4;
-  }
-  for (; channels_remaining >= 2; channels_remaining -= 2)
-  {
-    float32x2_t x[inner_tile_cols], U[inner_tile_cols];
-    for (int j = 0; j < inner_tile_cols; j++)
-    {
-      x[j] = vdup_n_f32(0.0f);
-    }
-
-    // Load x
-    for (int j = pad_left; j < cells_j; j++)
-    {
-      x[j] = vld1_f32(x_ptrs[j]);
-      x_ptrs[j] += 2;
-    }
-
-    // Compute U = x . X
-    U[0] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[2], 49), x[4], -14), x[0], -36);
-    U[1] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[2], 36), x[3], 13), x[4], -13), x[1], -36), x[5], -1);
-    U[2] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[5], 1), x[2], 36), x[1], 36), x[4], -13), x[3], -13);
-    U[3] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[3], 20), x[2], 9), x[5], -2), x[4], -10), x[1], -18);
-    U[4] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[1], 18), x[2], 9), x[5], 2), x[4], -10), x[3], -20);
-    U[5] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[3], 15), x[2], 4), x[5], -3), x[4], -5), x[1], -12);
-    U[6] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[1], 12), x[2], 4), x[5], 3), x[4], -5), x[3], -15);
-    U[7] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[7], 1), x[3], 49), x[5], -14), x[1], -36);
-
-    // Store the transformed vector
-    for (int j = 0; j < inner_tile_cols; j++)
-    {
-      vst1_f32(outptr + j*matrix_stride, U[j]);
-    }
-    outptr += 2;
-  }
-#endif  // __arm_any__
-  for (; channels_remaining; channels_remaining--)
-  {
-    // Load x
-    for (int j = pad_left; j < cells_j; j++)
-    {
-      x[j] = *(x_ptrs[j]++);
-    }
-
-    // Compute U = x . X
-    U[0] = x[0]*-36 + x[4]*-14 + x[2]*49 + x[6]*1;
-    U[1] = x[5]*-1 + x[1]*-36 + x[4]*-13 + x[3]*13 + x[2]*36 + x[6]*1;
-    U[2] = x[3]*-13 + x[4]*-13 + x[1]*36 + x[2]*36 + x[5]*1 + x[6]*1;
-    U[3] = x[1]*-18 + x[4]*-10 + x[5]*-2 + x[2]*9 + x[3]*20 + x[6]*1;
-    U[4] = x[3]*-20 + x[4]*-10 + x[5]*2 + x[2]*9 + x[1]*18 + x[6]*1;
-    U[5] = x[1]*-12 + x[4]*-5 + x[5]*-3 + x[2]*4 + x[3]*15 + x[6]*1;
-    U[6] = x[3]*-15 + x[4]*-5 + x[5]*3 + x[2]*4 + x[1]*12 + x[6]*1;
-    U[7] = x[1]*-36 + x[5]*-14 + x[3]*49 + x[7]*1;
-
-    // Store the transformed vector
-    for (int j = 0; j < inner_tile_cols; j++)
-    {
-      *(outptr + j*matrix_stride) = U[j];
-    }
-    outptr++;
-  }
-}
-
-}
-
-namespace winograd
-{
-template <int x>
-using Tiles = InputTransformImplTiles<1, x, 1, 8, float>;
-
-/*****************************************************************************/
-// 1x3 specialisations
-template <>
-const Tiles<3>::TileFn Tiles<3>::tilefn_generic = winograd_input_transform_1x8_fp32_process_tile<false>;
-
-template <>
-const Tiles<3>::TileFn Tiles<3>::tilefn_unpadded = winograd_input_transform_1x8_fp32_process_tile<true>;
-
-template <>
-const Tiles<3>::TileFn Tiles<3>::tilefn_left_padded[n_pad_left] = {
-  winograd_input_transform_1x8_fp32_process_tile<true, 0, 1, 0, 0>,
-};
-
-template <>
-const Tiles<3>::TileFn Tiles<3>::tilefn_right_padded[n_pad_right] = {
-  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 1>,
-  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 2>,
-  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 3>,
-  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 4>,
-  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 5>,
-  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 6>,
-  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 7>,
-};
-/*****************************************************************************/
-
-/*****************************************************************************/
-// 1x5 specialisations
-template <>
-const Tiles<5>::TileFn Tiles<5>::tilefn_generic = winograd_input_transform_1x8_fp32_process_tile<false>;
-
-template <>
-const Tiles<5>::TileFn Tiles<5>::tilefn_unpadded = winograd_input_transform_1x8_fp32_process_tile<true>;
-
-template <>
-const Tiles<5>::TileFn Tiles<5>::tilefn_left_padded[n_pad_left] = {
-  winograd_input_transform_1x8_fp32_process_tile<true, 0, 2, 0, 0>,
-};
-
-template <>
-const Tiles<5>::TileFn Tiles<5>::tilefn_right_padded[n_pad_right] = {
-  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 1>,
-  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 2>,
-  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 3>,
-  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 4>,
-  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 5>,
-  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 6>,
-  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 7>,
-};
-/*****************************************************************************/
-
-/*****************************************************************************/
-// 1x7 specialisations
-template <>
-const Tiles<7>::TileFn Tiles<7>::tilefn_generic = winograd_input_transform_1x8_fp32_process_tile<false>;
-
-template <>
-const Tiles<7>::TileFn Tiles<7>::tilefn_unpadded = winograd_input_transform_1x8_fp32_process_tile<true>;
-
-template <>
-const Tiles<7>::TileFn Tiles<7>::tilefn_left_padded[n_pad_left] = {
-  winograd_input_transform_1x8_fp32_process_tile<true, 0, 1, 0, 0>,
-  winograd_input_transform_1x8_fp32_process_tile<true, 0, 3, 0, 0>,
-};
-
-template <>
-const Tiles<7>::TileFn Tiles<7>::tilefn_right_padded[n_pad_right] = {
-  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 1>,
-  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 2>,
-  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 3>,
-  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 4>,
-  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 5>,
-  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 6>,
-  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 7>,
-};
-/*****************************************************************************/
-
-
-template class InputTransform<1, 3, 1, 8, float>;
-template class InputTransform<3, 1, 8, 1, float>;
-template class InputTransform<1, 5, 1, 8, float>;
-template class InputTransform<5, 1, 8, 1, float>;
-template class InputTransform<1, 7, 1, 8, float>;
-template class InputTransform<7, 1, 8, 1, float>;
-}  // namespace winograd
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_3x3_fp32.cpp
deleted file mode 100644
index 4203945..0000000
--- a/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_3x3_fp32.cpp
+++ /dev/null
@@ -1,311 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-
-namespace winograd
-{
-
-using Tiles = InputTransformImplTiles<3, 3, 4, 4, float>;
-
-namespace
-{
-
-
-template <bool Specialized, int PadTop=0, int PadLeft=0, int PadBottom=0, int PadRight=0>
-void winograd_input_transform_4x4_fp32_process_tile(
-  int n_channels,
-  const float* const input_base,
-  const int input_row_stride,
-  const int input_col_stride,
-  float* const matrix_base,
-  const int matrix_stride,
-  const int _pad_top,
-  const int _pad_left,
-  const int _pad_bottom,
-  const int _pad_right
-)
-{
-  const int pad_top = Specialized ? PadTop : _pad_top;
-  const int pad_left = Specialized ? PadLeft : _pad_left;
-  const int pad_bottom = Specialized ? PadBottom : _pad_bottom;
-  const int pad_right = Specialized ? PadRight : _pad_right;
-
-  constexpr int inner_tile_i = 4, inner_tile_j = 4;
-  const int cells_i = inner_tile_i - pad_bottom;
-  const int cells_j = inner_tile_j - pad_right;
-
-  float *outptr = matrix_base;
-
-  // Get pointers into the input tile
-  const float *x_ptrs[inner_tile_i][inner_tile_j];
-  for (int i = pad_top, xi = 0; i < cells_i; i++, xi++)
-  {
-    // Get a pointer into the row
-    const float* const row_ptr = input_base + xi*input_row_stride;
-
-    for (int j = pad_left, xj = 0; j < cells_j; j++, xj++)
-    {
-      x_ptrs[i][j] = row_ptr + xj*input_col_stride;
-    }
-  }
-
-  // Matrices used/computed in this kernel.
-  float x[inner_tile_i][inner_tile_j];
-  float XTx[inner_tile_i][inner_tile_j];
-  float U[inner_tile_i][inner_tile_j];
-
-  for (int i = 0; i < inner_tile_i; i++)
-  {
-    for (int j = 0; j < inner_tile_j; j++)
-    {
-      x[i][j] = XTx[i][j] = 0.0f;
-    }
-  }
-
-  // Perform the Winograd input transformation for each channel in the input
-  // tensor.
-  int channels_remaining = n_channels;
-#ifdef __aarch64__
-  for (; channels_remaining >= 4; channels_remaining -= 4)
-  {
-    // Matrices used/computed in this kernel.
-    float32x4_t x[inner_tile_i][inner_tile_j];
-    float32x4_t XTx[inner_tile_i][inner_tile_j];
-    float32x4_t U[inner_tile_i][inner_tile_j];
-
-    for (int i = 0; i < inner_tile_i; i++)
-    {
-      for (int j = 0; j < inner_tile_j; j++)
-      {
-        x[i][j] = vdupq_n_f32(0.0f);
-        XTx[i][j] = vdupq_n_f32(0.0f);
-      }
-    }
-
-    // Load x
-    for (int i = pad_top; i < cells_i; i++)
-    {
-      for (int j = pad_left; j < cells_j; j++)
-      {
-        x[i][j] = vld1q_f32(x_ptrs[i][j]);
-        x_ptrs[i][j] += 4;
-      }
-    }
-
-    // Compute XT . x
-    for (int j = pad_left; j < cells_j; j++)
-    {
-      // XTx[0][j] = x[0][j] - x[2][j];
-      XTx[0][j] = vsubq_f32(x[0][j], x[2][j]);
-
-      // XTx[1][j] = x[1][j] + x[2][j];
-      XTx[1][j] = vaddq_f32(x[1][j], x[2][j]);
-
-      // XTx[2][j] = x[2][j] - x[1][j];
-      XTx[2][j] = vsubq_f32(x[2][j], x[1][j]);
-
-      // XTx[3][j] = x[1][j] - x[3][j];
-      XTx[3][j] = vsubq_f32(x[1][j], x[3][j]);
-    }
-
-    // Compute U = XT . x . X
-    for (int i = 0; i < inner_tile_i; i++)
-    {
-      // U[i][0] = XTx[i][0] - XTx[i][2];
-      U[i][0] = vsubq_f32(XTx[i][0], XTx[i][2]);
-
-      // U[i][1] = XTx[i][1] + XTx[i][2];
-      U[i][1] = vaddq_f32(XTx[i][1], XTx[i][2]);
-
-      // U[i][2] = XTx[i][2] - XTx[i][1];
-      U[i][2] = vsubq_f32(XTx[i][2], XTx[i][1]);
-
-      // U[i][3] = XTx[i][1] - XTx[i][3];
-      U[i][3] = vsubq_f32(XTx[i][1], XTx[i][3]);
-    }
-
-    // Store the transformed matrix
-    for (int i = 0, m = 0; i < inner_tile_i; i++)
-    {
-      for (int j = 0; j < inner_tile_j; j++, m++)
-      {
-        vst1q_f32(outptr + m*matrix_stride, U[i][j]);
-      }
-    }
-    outptr += 4;
-  }
-#endif  // __aarch64__
-#ifdef __arm_any__
-  for (; channels_remaining >= 2; channels_remaining -= 2)
-  {
-    // Matrices used/computed in this kernel.
-    float32x2_t x[inner_tile_i][inner_tile_j];
-    float32x2_t XTx[inner_tile_i][inner_tile_j];
-    float32x2_t U[inner_tile_i][inner_tile_j];
-
-    for (int i = 0; i < inner_tile_i; i++)
-    {
-      for (int j = 0; j < inner_tile_j; j++)
-      {
-        x[i][j] = vdup_n_f32(0.0f);
-        XTx[i][j] = vdup_n_f32(0.0f);
-      }
-    }
-
-    // Load x
-    for (int i = pad_top; i < cells_i; i++)
-    {
-      for (int j = pad_left; j < cells_j; j++)
-      {
-        x[i][j] = vld1_f32(x_ptrs[i][j]);
-        x_ptrs[i][j] += 2;
-      }
-    }
-
-    // Compute XT . x
-    for (int j = pad_left; j < cells_j; j++)
-    {
-      // XTx[0][j] = x[0][j] - x[2][j];
-      XTx[0][j] = vsub_f32(x[0][j], x[2][j]);
-
-      // XTx[1][j] = x[1][j] + x[2][j];
-      XTx[1][j] = vadd_f32(x[1][j], x[2][j]);
-
-      // XTx[2][j] = x[2][j] - x[1][j];
-      XTx[2][j] = vsub_f32(x[2][j], x[1][j]);
-
-      // XTx[3][j] = x[1][j] - x[3][j];
-      XTx[3][j] = vsub_f32(x[1][j], x[3][j]);
-    }
-
-    // Compute U = XT . x . X
-    for (int i = 0; i < inner_tile_i; i++)
-    {
-      // U[i][0] = XTx[i][0] - XTx[i][2];
-      U[i][0] = vsub_f32(XTx[i][0], XTx[i][2]);
-
-      // U[i][1] = XTx[i][1] + XTx[i][2];
-      U[i][1] = vadd_f32(XTx[i][1], XTx[i][2]);
-
-      // U[i][2] = XTx[i][2] - XTx[i][1];
-      U[i][2] = vsub_f32(XTx[i][2], XTx[i][1]);
-
-      // U[i][3] = XTx[i][1] - XTx[i][3];
-      U[i][3] = vsub_f32(XTx[i][1], XTx[i][3]);
-    }
-
-    // Store the transformed matrix
-    for (int i = 0, m = 0; i < inner_tile_i; i++)
-    {
-      for (int j = 0; j < inner_tile_j; j++, m++)
-      {
-        vst1_f32(outptr + m*matrix_stride, U[i][j]);
-      }
-    }
-    outptr += 2;
-  }
-#endif  // __arm_any__
-  for (; channels_remaining; channels_remaining--)
-  {
-    // Load x
-    for (int i = pad_top; i < cells_i; i++)
-    {
-      for (int j = pad_left; j < cells_j; j++)
-      {
-        x[i][j] = *(x_ptrs[i][j]++);
-      }
-    }
-
-    // Compute XT . x
-    for (int j = pad_left; j < cells_j; j++)
-    {
-      XTx[0][j] = x[0][j] - x[2][j];
-      XTx[1][j] = x[1][j] + x[2][j];
-      XTx[2][j] = x[2][j] - x[1][j];
-      XTx[3][j] = x[1][j] - x[3][j];
-    }
-
-    // Compute U = XT . x . X
-    for (int i = 0; i < inner_tile_i; i++)
-    {
-      U[i][0] = XTx[i][0] - XTx[i][2];
-      U[i][1] = XTx[i][1] + XTx[i][2];
-      U[i][2] = XTx[i][2] - XTx[i][1];
-      U[i][3] = XTx[i][1] - XTx[i][3];
-    }
-
-    // Store the transformed matrix
-    for (int i = 0, m = 0; i < inner_tile_i; i++)
-    {
-      for (int j = 0; j < inner_tile_j; j++, m++)
-      {
-        *(outptr + m*matrix_stride) = U[i][j];
-      }
-    }
-    outptr++;
-  }
-}
-
-}  // namespace (anonymous)
-
-template <>
-const Tiles::TileFn Tiles::tilefn_generic = winograd_input_transform_4x4_fp32_process_tile<false>;
-
-template <>
-const Tiles::TileFn Tiles::tilefn_unpadded = winograd_input_transform_4x4_fp32_process_tile<true>;
-
-
-template <>
-const Tiles::TileFn Tiles::tilefn_top_padded[n_pad_top] = {
-  winograd_input_transform_4x4_fp32_process_tile<true, 1, 0, 0, 0>,
-};
-
-template <>
-const Tiles::TileFn Tiles::tilefn_left_padded[n_pad_left] = {
-  winograd_input_transform_4x4_fp32_process_tile<true, 0, 1, 0, 0>,
-};
-
-template <>
-const Tiles::TileFn Tiles::tilefn_bottom_padded[n_pad_bottom] = {
-  winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 1, 0>,
-  winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 2, 0>,
-  winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 3, 0>,
-  winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 4, 0>,
-};
-
-template <>
-const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = {
-  winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 0, 1>,
-  winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 0, 2>,
-  winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 0, 3>,
-  winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 0, 4>,
-};
-
-template class InputTransform<3, 3, 4, 4, float>;
-}  // namespace winograd
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/input_6x6_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/input_6x6_fp32.cpp
deleted file mode 100644
index 893122c..0000000
--- a/src/core/NEON/kernels/convolution/winograd/transforms/input_6x6_fp32.cpp
+++ /dev/null
@@ -1,376 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-
-namespace
-{
-
-template <bool Specialized, int PadTop=0, int PadLeft=0, int PadBottom=0, int PadRight=0>
-void winograd_input_transform_6x6_fp32_process_tile(
-  int n_channels,
-  const float* const input_base,
-  const int input_row_stride,
-  const int input_col_stride,
-  float* const matrix_base,
-  const int matrix_stride,
-  const int _pad_top,
-  const int _pad_left,
-  const int _pad_bottom,
-  const int _pad_right
-)
-{
-  const int pad_top = Specialized ? PadTop : _pad_top;
-  const int pad_left = Specialized ? PadLeft : _pad_left;
-  const int pad_bottom = Specialized ? PadBottom : _pad_bottom;
-  const int pad_right = Specialized ? PadRight : _pad_right;
-
-  constexpr int inner_tile_rows = 6;
-  constexpr int inner_tile_cols = 6;
-
-  const int cells_i = inner_tile_rows - pad_bottom;
-  const int cells_j = inner_tile_cols - pad_right;
-
-  float *outptr = matrix_base;
-
-  // Get pointers into the input tile
-  const float *x_ptrs[inner_tile_rows][inner_tile_cols];
-  for (int i = pad_top, xi = 0; i < cells_i; i++, xi++)
-  {
-    // Get a pointer into the row
-    const float* const row_ptr = input_base + xi*input_row_stride;
-
-    for (int j = pad_left, xj = 0; j < cells_j; j++, xj++)
-    {
-      x_ptrs[i][j] = row_ptr + xj*input_col_stride;
-    }
-  }
-
-  // Matrices used/computed in this kernel.
-  float x[inner_tile_rows][inner_tile_cols];
-  float XTx[inner_tile_rows][inner_tile_cols];
-  float U[inner_tile_rows][inner_tile_cols];
-  for (int i = 0; i < inner_tile_rows; i++)
-  {
-    for (int j = 0; j < inner_tile_cols; j++)
-    {
-      x[i][j] = XTx[i][j] = 0.0f;
-    }
-  }
-
-  // Perform the Winograd input transformation for each channel in the input
-  // tensor.
-  int channels_remaining = n_channels;
-#ifdef __aarch64__
-  for (; channels_remaining >= 4; channels_remaining -= 4)
-  {
-    // Matrices used/computed in this kernel
-    float32x4_t x[inner_tile_rows][inner_tile_cols];
-    float32x4_t XTx[inner_tile_rows][inner_tile_cols];
-    float32x4_t U[inner_tile_rows][inner_tile_cols];
-    for (int i = 0; i < inner_tile_rows; i++)
-    {
-      for (int j = 0; j < inner_tile_cols; j++)
-      {
-        x[i][j] = vdupq_n_f32(0.0f);
-        XTx[i][j] = vdupq_n_f32(0.0f);
-      }
-    }
-
-    // Read a 6x6 tile in the Winograd domain
-    for (int i = pad_top; i < cells_i; i++)
-    {
-      for (int j = pad_left; j < cells_j; j++)
-      {
-        x[i][j] = vld1q_f32(x_ptrs[i][j]);
-        x_ptrs[i][j] += 4;
-      }
-    }
-
-    // Compute XT . x
-    for (int j = pad_left; j < cells_j; j++)
-    {
-      // XTx[0][j] =  4*x[0][j] + -5*x[2][j] +  1*x[4][j];
-      XTx[0][j] = vmlsq_n_f32(vmlaq_n_f32(x[4][j], x[0][j], 4.0f), x[2][j], 5.0f);
-
-      // XTx[1][j] = -4*x[1][j] + -4*x[2][j] +  1*x[3][j] +  1*x[4][j];
-      XTx[1][j] = vmlsq_n_f32(vaddq_f32(x[3][j], x[4][j]), vaddq_f32(x[1][j], x[2][j]), 4.0f);
-
-      // XTx[2][j] =  4*x[1][j] + -4*x[2][j] + -1*x[3][j] +  1*x[4][j];
-      XTx[2][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[3][j]), vsubq_f32(x[1][j], x[2][j]), 4.0f);
-
-      // XTx[3][j] = -2*x[1][j] + -1*x[2][j] +  2*x[3][j] +  1*x[4][j];
-      XTx[3][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[2][j]), vsubq_f32(x[3][j], x[1][j]), 2.0f);
-
-      // XTx[4][j] =  2*x[1][j] + -1*x[2][j] + -2*x[3][j] +  1*x[4][j];
-      XTx[4][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[2][j]), vsubq_f32(x[1][j], x[3][j]), 2.0f);
-
-      // XTx[5][j] =  4*x[1][j] + -5*x[3][j] +  1*x[5][j];
-      XTx[5][j] = vmlsq_n_f32(vmlaq_n_f32(x[5][j], x[1][j], 4.0f), x[3][j], 5.0f);
-    }
-
-    // Compute U = XT . x . X
-    for (int i = 0; i < inner_tile_rows; i++)
-    {
-      // U[i][0] =  4*XTx[i][0] + -5*XTx[i][2] +  1*XTx[i][4];
-      U[i][0] = vmlsq_n_f32(vmlaq_n_f32(XTx[i][4], XTx[i][0], 4.0f), XTx[i][2], 5.0f);
-
-      // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] +  1*XTx[i][3] +  1*XTx[i][4];
-      U[i][1] = vmlsq_n_f32(vaddq_f32(XTx[i][3], XTx[i][4]), vaddq_f32(XTx[i][1], XTx[i][2]), 4.0f);
-
-      // U[i][2] =  4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] +  1*XTx[i][4];
-      U[i][2] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][3]), vsubq_f32(XTx[i][1], XTx[i][2]), 4.0f);
-
-      // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] +  2*XTx[i][3] +  1*XTx[i][4];
-      U[i][3] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][2]), vsubq_f32(XTx[i][3], XTx[i][1]), 2.0f);
-
-      // U[i][4] =  2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] +  1*XTx[i][4];
-      U[i][4] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][2]), vsubq_f32(XTx[i][1], XTx[i][3]), 2.0f);
-
-      // U[i][5] =  4*XTx[i][1] + -5*XTx[i][3] +  1*XTx[i][5];
-      U[i][5] = vmlsq_n_f32(vmlaq_n_f32(XTx[i][5], XTx[i][1], 4.0f), XTx[i][3], 5.0f);
-    }
-
-    // Store the transformed matrix
-    for (int i = 0, m = 0; i < inner_tile_rows; i++)
-    {
-      for (int j = 0; j < inner_tile_cols; j++, m++)
-      {
-        vst1q_f32(outptr + m*matrix_stride, U[i][j]);
-      }
-    }
-    outptr += 4;
-  }
-#endif  // __aarch64__
-#ifdef __arm_any__
-  for (; channels_remaining >= 2; channels_remaining -= 2)
-  {
-    // Matrices used/computed in this kernel
-    float32x2_t x[inner_tile_rows][inner_tile_cols];
-    float32x2_t XTx[inner_tile_rows][inner_tile_cols];
-    float32x2_t U[inner_tile_rows][inner_tile_cols];
-    for (int i = 0; i < inner_tile_rows; i++)
-    {
-      for (int j = 0; j < inner_tile_cols; j++)
-      {
-        x[i][j] = vdup_n_f32(0.0f);
-        XTx[i][j] = vdup_n_f32(0.0f);
-      }
-    }
-
-    // Read a 6x6 tile of the input (spatial domain)
-    for (int i = pad_top; i < cells_i; i++)
-    {
-      for (int j = pad_left; j < cells_j; j++)
-      {
-        x[i][j] = vld1_f32(x_ptrs[i][j]);
-        x_ptrs[i][j] += 2;
-      }
-    }
-
-    // Compute XT . x
-    for (int j = pad_left; j < cells_j; j++)
-    {
-      // XTx[0][j] =  4*x[0][j] + -5*x[2][j] +  1*x[4][j];
-      XTx[0][j] = vmls_n_f32(vmla_n_f32(x[4][j], x[0][j], 4.0f), x[2][j], 5.0f);
-
-      // XTx[1][j] = -4*x[1][j] + -4*x[2][j] +  1*x[3][j] +  1*x[4][j];
-      XTx[1][j] = vmls_n_f32(vadd_f32(x[3][j], x[4][j]), vadd_f32(x[1][j], x[2][j]), 4.0f);
-
-      // XTx[2][j] =  4*x[1][j] + -4*x[2][j] + -1*x[3][j] +  1*x[4][j];
-      XTx[2][j] = vmla_n_f32(vsub_f32(x[4][j], x[3][j]), vsub_f32(x[1][j], x[2][j]), 4.0f);
-
-      // XTx[3][j] = -2*x[1][j] + -1*x[2][j] +  2*x[3][j] +  1*x[4][j];
-      XTx[3][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[3][j], x[1][j]), 2.0f);
-
-      // XTx[4][j] =  2*x[1][j] + -1*x[2][j] + -2*x[3][j] +  1*x[4][j];
-      XTx[4][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[1][j], x[3][j]), 2.0f);
-
-      // XTx[5][j] =  4*x[1][j] + -5*x[3][j] +  1*x[5][j];
-      XTx[5][j] = vmls_n_f32(vmla_n_f32(x[5][j], x[1][j], 4.0f), x[3][j], 5.0f);
-    }
-
-    // Compute U = XT . x . X
-    for (int i = 0; i < inner_tile_rows; i++)
-    {
-      // U[i][0] =  4*XTx[i][0] + -5*XTx[i][2] +  1*XTx[i][4];
-      U[i][0] = vmls_n_f32(vmla_n_f32(XTx[i][4], XTx[i][0], 4.0f), XTx[i][2], 5.0f);
-
-      // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] +  1*XTx[i][3] +  1*XTx[i][4];
-      U[i][1] = vmls_n_f32(vadd_f32(XTx[i][3], XTx[i][4]), vadd_f32(XTx[i][1], XTx[i][2]), 4.0f);
-
-      // U[i][2] =  4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] +  1*XTx[i][4];
-      U[i][2] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][3]), vsub_f32(XTx[i][1], XTx[i][2]), 4.0f);
-
-      // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] +  2*XTx[i][3] +  1*XTx[i][4];
-      U[i][3] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][3], XTx[i][1]), 2.0f);
-
-      // U[i][4] =  2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] +  1*XTx[i][4];
-      U[i][4] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][1], XTx[i][3]), 2.0f);
-
-      // U[i][5] =  4*XTx[i][1] + -5*XTx[i][3] +  1*XTx[i][5];
-      U[i][5] = vmls_n_f32(vmla_n_f32(XTx[i][5], XTx[i][1], 4.0f), XTx[i][3], 5.0f);
-    }
-
-    // Store the transformed matrix
-    for (int i = 0, m = 0; i < inner_tile_rows; i++)
-    {
-      for (int j = 0; j < inner_tile_cols; j++, m++)
-      {
-        vst1_f32(outptr + m*matrix_stride, U[i][j]);
-      }
-    }
-    outptr += 2;
-  }
-#endif  // __arm_any__
-  for (; channels_remaining; channels_remaining--)
-  {
-    // Load x
-    for (int i = pad_top; i < cells_i; i++)
-    {
-      for (int j = pad_left; j < cells_j; j++)
-      {
-        x[i][j] = *(x_ptrs[i][j]++);
-      }
-    }
-
-    // Compute XT . x
-    for (int j = pad_left; j < cells_j; j++)
-    {
-      XTx[0][j] =  4*x[0][j] + -5*x[2][j] +  1*x[4][j];
-      XTx[1][j] = -4*x[1][j] + -4*x[2][j] +  1*x[3][j] +  1*x[4][j];
-      XTx[2][j] =  4*x[1][j] + -4*x[2][j] + -1*x[3][j] +  1*x[4][j];
-      XTx[3][j] = -2*x[1][j] + -1*x[2][j] +  2*x[3][j] +  1*x[4][j];
-      XTx[4][j] =  2*x[1][j] + -1*x[2][j] + -2*x[3][j] +  1*x[4][j];
-      XTx[5][j] =  4*x[1][j] + -5*x[3][j] +  1*x[5][j];
-    }
-
-    // Compute U = XT . x . X
-    for (int i = 0; i < inner_tile_rows; i++)
-    {
-      U[i][0] =  4*XTx[i][0] + -5*XTx[i][2] +  1*XTx[i][4];
-      U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] +  1*XTx[i][3] +  1*XTx[i][4];
-      U[i][2] =  4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] +  1*XTx[i][4];
-      U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] +  2*XTx[i][3] +  1*XTx[i][4];
-      U[i][4] =  2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] +  1*XTx[i][4];
-      U[i][5] =  4*XTx[i][1] + -5*XTx[i][3] +  1*XTx[i][5];
-    }
-
-    // Store the transformed matrix
-    for (int i = 0, m = 0; i < inner_tile_rows; i++)
-    {
-      for (int j = 0; j < inner_tile_cols; j++, m++)
-      {
-        *(outptr + m*matrix_stride) = U[i][j];
-      }
-    }
-    outptr++;
-  }
-}
-}  // namespace (anonymous)
-
-namespace winograd
-{
-template <int k>
-using Tiles = InputTransformImplTiles<k, k, 6, 6, float>;
-
-template <>
-const Tiles<3>::TileFn Tiles<3>::tilefn_generic = winograd_input_transform_6x6_fp32_process_tile<false>;
-
-template <>
-const Tiles<3>::TileFn Tiles<3>::tilefn_unpadded = winograd_input_transform_6x6_fp32_process_tile<true>;
-
-template <>
-const Tiles<3>::TileFn Tiles<3>::tilefn_top_padded[n_pad_top] = {
-  winograd_input_transform_6x6_fp32_process_tile<true, 1, 0, 0, 0>,
-};
-
-template <>
-const Tiles<3>::TileFn Tiles<3>::tilefn_left_padded[n_pad_left] = {
-  winograd_input_transform_6x6_fp32_process_tile<true, 0, 1, 0, 0>,
-};
-
-template <>
-const Tiles<3>::TileFn Tiles<3>::tilefn_bottom_padded[n_pad_bottom] = {
-  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 1, 0>,
-  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 2, 0>,
-  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 3, 0>,
-  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 4, 0>,
-  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 5, 0>,
-  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 6, 0>,
-};
-
-template <>
-const Tiles<3>::TileFn Tiles<3>::tilefn_right_padded[n_pad_right] = {
-  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 1>,
-  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 2>,
-  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 3>,
-  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 4>,
-  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 5>,
-  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 6>,
-};
-
-template <>
-const Tiles<5>::TileFn Tiles<5>::tilefn_generic = winograd_input_transform_6x6_fp32_process_tile<false>;
-
-template <>
-const Tiles<5>::TileFn Tiles<5>::tilefn_unpadded = winograd_input_transform_6x6_fp32_process_tile<true>;
-
-template <>
-const Tiles<5>::TileFn Tiles<5>::tilefn_top_padded[n_pad_top] = {
-  winograd_input_transform_6x6_fp32_process_tile<true, 2, 0, 0, 0>,
-};
-
-template <>
-const Tiles<5>::TileFn Tiles<5>::tilefn_left_padded[n_pad_left] = {
-  winograd_input_transform_6x6_fp32_process_tile<true, 0, 2, 0, 0>,
-};
-
-template <>
-const Tiles<5>::TileFn Tiles<5>::tilefn_bottom_padded[n_pad_bottom] = {
-  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 1, 0>,
-  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 2, 0>,
-  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 3, 0>,
-  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 4, 0>,
-  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 5, 0>,
-  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 6, 0>,
-};
-
-template <>
-const Tiles<5>::TileFn Tiles<5>::tilefn_right_padded[n_pad_right] = {
-  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 1>,
-  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 2>,
-  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 3>,
-  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 4>,
-  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 5>,
-  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 6>,
-};
-
-template class InputTransform<3, 3, 6, 6, float>;
-template class InputTransform<5, 5, 6, 6, float>;
-}  // namespace winograd
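
Note: the tile function removed above evaluates the Winograd input transform U = B^T x B on 6x6 tiles (the code names B^T "XT"); the matrix can be read directly off the scalar tail loop, e.g. XTx[0][j] = 4*x[0][j] - 5*x[2][j] + x[4][j]:

  B^T = \begin{bmatrix}
          4 &  0 & -5 &  0 & 1 & 0 \\
          0 & -4 & -4 &  1 & 1 & 0 \\
          0 &  4 & -4 & -1 & 1 & 0 \\
          0 & -2 & -1 &  2 & 1 & 0 \\
          0 &  2 & -1 & -2 & 1 & 0 \\
          0 &  4 &  0 & -5 & 0 & 1
        \end{bmatrix}, \qquad U = B^T x B

Padding is handled on the read side: entries outside [pad_top, cells_i) x [pad_left, cells_j) keep the zero written by the initialisation loops, so padded tiles reuse the same arithmetic. The NEON paths fuse these rows into multiply-accumulate intrinsics; a minimal scalar model of the two intrinsics involved (per-lane ACLE semantics, shown only for readability):

  // vmla(q)_n_f32 accumulates a scaled product, vmls(q)_n_f32 subtracts one:
  inline float vmla_model(float acc, float v, float n) { return acc + v * n; }
  inline float vmls_model(float acc, float v, float n) { return acc - v * n; }
  // e.g. vmlsq_n_f32(vmlaq_n_f32(x4, x0, 4.f), x2, 5.f) == (x4 + 4*x0) - 5*x2 per lane.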
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp
deleted file mode 100644
index 597b074..0000000
--- a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp
+++ /dev/null
@@ -1,375 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-
-namespace
-{
-
-template <bool Specialized, int PadBottom=0, int PadRight=0>
-void winograd_output_transform_2x2_3x3_fp32_process_tile(
-  const int n_channels,
-  const float* const matrix_base,
-  const int matrix_stride,
-  const float* const biases,
-  float* const output,
-  const int output_row_stride,
-  const int output_col_stride,
-  const int _pad_bottom,
-  const int _pad_right
-)
-{
-  constexpr int OutputTileRows = 2, OutputTileCols = 2;
-  const int pad_bottom = Specialized ? PadBottom : _pad_bottom;
-  const int pad_right = Specialized ? PadRight : _pad_right;
-
-  const int cells_i = OutputTileRows - pad_bottom;
-  const int cells_j = OutputTileCols - pad_right;
-
-  // Construct a map to the output cells
-  float *outptrs[OutputTileRows][OutputTileCols];
-  for (int i = 0; i < cells_i; i++)
-  {
-    for (int j = 0; j < cells_j; j++)
-    {
-      outptrs[i][j] = output + i*output_row_stride + j*output_col_stride;
-    }
-  }
-  const float *inptr = matrix_base;
-  const float *bptr = biases;
-
-  if (bptr)
-  {
-    // For each channel of the output
-    int channels_remaining = n_channels;
-#ifdef __aarch64__
-    for (; channels_remaining >= 4; channels_remaining -= 4)
-    {
-      // Matrices used and computed during this transform
-      float32x4_t F[4][4], FZ[4][2], f[2][2], b;
-
-      // Read a 4x4 tile in the Winograd domain
-      for (int i = 0, m = 0; i < 4; i++)
-      {
-        for (int j = 0; j < 4; j++, m++)
-        {
-          F[i][j] = vld1q_f32(inptr + m*matrix_stride);
-        }
-      }
-      inptr += 4;
-
-      // Compute the matrix F Z
-      for (int i = 0; i < 4; i++)
-      {
-        // FZ[i][0] =  F[i][0] + F[i][1] + F[i][2];
-        FZ[i][0] = vaddq_f32(vaddq_f32(F[i][0], F[i][1]), F[i][2]);
-
-        // FZ[i][1] =  F[i][1] - F[i][2] - F[i][3];
-        FZ[i][1] = vsubq_f32(vsubq_f32(F[i][1], F[i][2]), F[i][3]);
-      }
-
-      // Compute the output tile f = ZT F Z
-      for (int j = 0; j < 2; j++)
-      {
-        // f[0][j] =  FZ[0][j] + FZ[1][j] + FZ[2][j];
-        f[0][j] = vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
-
-        // f[1][j] =  FZ[1][j] - FZ[2][j] - FZ[3][j];
-        f[1][j] = vsubq_f32(vsubq_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
-      }
-
-      // Load the bias vector
-      b = vld1q_f32(bptr);
-      bptr += 4;
-
-      // Write out the output tile
-      for (int i = 0; i < cells_i; i++)
-      {
-        for (int j = 0; j < cells_j; j++)
-        {
-          vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
-          outptrs[i][j] += 4;
-        }
-      }
-    }
-#endif  // __aarch64__
-#ifdef __arm_any__
-    for (; channels_remaining >= 2; channels_remaining -= 2)
-    {
-      // Matrices used and computed during this transform
-      float32x2_t F[4][4], FZ[4][2], f[2][2], b;
-
-      // Read a 4x4 tile in the Winograd domain
-      for (int i = 0, m = 0; i < 4; i++)
-      {
-        for (int j = 0; j < 4; j++, m++)
-        {
-          F[i][j] = vld1_f32(inptr + m*matrix_stride);
-        }
-      }
-      inptr += 2;
-
-      // Compute the matrix F Z
-      for (int i = 0; i < 4; i++)
-      {
-        // FZ[i][0] =  F[i][0] + F[i][1] + F[i][2];
-        FZ[i][0] = vadd_f32(vadd_f32(F[i][0], F[i][1]), F[i][2]);
-
-        // FZ[i][1] =  F[i][1] - F[i][2] - F[i][3];
-        FZ[i][1] = vsub_f32(vsub_f32(F[i][1], F[i][2]), F[i][3]);
-      }
-
-      // Compute the output tile f = ZT F Z
-      for (int j = 0; j < 2; j++)
-      {
-        // f[0][j] =  FZ[0][j] + FZ[1][j] + FZ[2][j];
-        f[0][j] = vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
-
-        // f[1][j] =  FZ[1][j] - FZ[2][j] - FZ[3][j];
-        f[1][j] = vsub_f32(vsub_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
-      }
-
-      // Load the bias vector
-      b = vld1_f32(bptr);
-      bptr += 2;
-
-      // Write out the output tile
-      for (int i = 0; i < cells_i; i++)
-      {
-        for (int j = 0; j < cells_j; j++)
-        {
-          vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
-          outptrs[i][j] += 2;
-        }
-      }
-    }
-#endif  // __arm_any__
-    for (; channels_remaining; channels_remaining--)
-    {
-      // Matrices used and computed during this transform
-      float F[4][4], FZ[4][2], f[2][2], b;
-
-      // Read a 4x4 tile in the Winograd domain
-      for (int i = 0, m = 0; i < 4; i++)
-      {
-        for (int j = 0; j < 4; j++, m++)
-        {
-          F[i][j] = *(inptr + m*matrix_stride);
-        }
-      }
-      inptr++;
-
-      // Compute the matrix F Z
-      for (int i = 0; i < 4; i++)
-      {
-        FZ[i][0] =  F[i][0] + F[i][1] + F[i][2];
-        FZ[i][1] =  F[i][1] - F[i][2] - F[i][3];
-      }
-
-      // Compute the output tile f = ZT F Z
-      for (int j = 0; j < 2; j++)
-      {
-        f[0][j] =  FZ[0][j] + FZ[1][j] + FZ[2][j];
-        f[1][j] =  FZ[1][j] - FZ[2][j] - FZ[3][j];
-      }
-
-      // Load the bias
-      b = *(bptr++);
-
-      // Write out the output tile
-      for (int i = 0; i < cells_i; i++)
-      {
-        for (int j = 0; j < cells_j; j++)
-        {
-          *(outptrs[i][j]++) = f[i][j] + b;
-        }
-      }
-    }
-  }
-  else
-  {
-    // For each channel of the output
-    int channels_remaining = n_channels;
-#ifdef __aarch64__
-    for (; channels_remaining >= 4; channels_remaining -= 4)
-    {
-      // Matrices used and computed during this transform
-      float32x4_t F[4][4], FZ[4][2], f[2][2];
-
-      // Read a 4x4 tile in the Winograd domain
-      for (int i = 0, m = 0; i < 4; i++)
-      {
-        for (int j = 0; j < 4; j++, m++)
-        {
-          F[i][j] = vld1q_f32(inptr + m*matrix_stride);
-        }
-      }
-      inptr += 4;
-
-      // Compute the matrix F Z
-      for (int i = 0; i < 4; i++)
-      {
-        // FZ[i][0] =  F[i][0] + F[i][1] + F[i][2];
-        FZ[i][0] = vaddq_f32(vaddq_f32(F[i][0], F[i][1]), F[i][2]);
-
-        // FZ[i][1] =  F[i][1] - F[i][2] - F[i][3];
-        FZ[i][1] = vsubq_f32(vsubq_f32(F[i][1], F[i][2]), F[i][3]);
-      }
-
-      // Compute the output tile f = ZT F Z
-      for (int j = 0; j < 2; j++)
-      {
-        // f[0][j] =  FZ[0][j] + FZ[1][j] + FZ[2][j];
-        f[0][j] = vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
-
-        // f[1][j] =  FZ[1][j] - FZ[2][j] - FZ[3][j];
-        f[1][j] = vsubq_f32(vsubq_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
-      }
-
-      // Write out the output tile
-      for (int i = 0; i < cells_i; i++)
-      {
-        for (int j = 0; j < cells_j; j++)
-        {
-          vst1q_f32(outptrs[i][j], f[i][j]);
-          outptrs[i][j] += 4;
-        }
-      }
-    }
-#endif  // __aarch64__
-#ifdef __arm_any__
-    for (; channels_remaining >= 2; channels_remaining -= 2)
-    {
-      // Matrices used and computed during this transform
-      float32x2_t F[4][4], FZ[4][2], f[2][2];
-
-      // Read a 4x4 tile in the Winograd domain
-      for (int i = 0, m = 0; i < 4; i++)
-      {
-        for (int j = 0; j < 4; j++, m++)
-        {
-          F[i][j] = vld1_f32(inptr + m*matrix_stride);
-        }
-      }
-      inptr += 2;
-
-      // Compute the matrix F Z
-      for (int i = 0; i < 4; i++)
-      {
-        // FZ[i][0] =  F[i][0] + F[i][1] + F[i][2];
-        FZ[i][0] = vadd_f32(vadd_f32(F[i][0], F[i][1]), F[i][2]);
-
-        // FZ[i][1] =  F[i][1] - F[i][2] - F[i][3];
-        FZ[i][1] = vsub_f32(vsub_f32(F[i][1], F[i][2]), F[i][3]);
-      }
-
-      // Compute the output tile f = ZT F Z
-      for (int j = 0; j < 2; j++)
-      {
-        // f[0][j] =  FZ[0][j] + FZ[1][j] + FZ[2][j];
-        f[0][j] = vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
-
-        // f[1][j] =  FZ[1][j] - FZ[2][j] - FZ[3][j];
-        f[1][j] = vsub_f32(vsub_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
-      }
-
-      // Write out the output tile
-      for (int i = 0; i < cells_i; i++)
-      {
-        for (int j = 0; j < cells_j; j++)
-        {
-          vst1_f32(outptrs[i][j], f[i][j]);
-          outptrs[i][j] += 2;
-        }
-      }
-    }
-#endif  // __arm_any__
-    for (; channels_remaining; channels_remaining--)
-    {
-      // Matrices used and computed during this transform
-      float F[4][4], FZ[4][2], f[2][2];
-
-      // Read a 4x4 tile in the Winograd domain
-      for (int i = 0, m = 0; i < 4; i++)
-      {
-        for (int j = 0; j < 4; j++, m++)
-        {
-          F[i][j] = *(inptr + m*matrix_stride);
-        }
-      }
-      inptr++;
-
-      // Compute the matrix F Z
-      for (int i = 0; i < 4; i++)
-      {
-        FZ[i][0] =  F[i][0] + F[i][1] + F[i][2];
-        FZ[i][1] =  F[i][1] - F[i][2] - F[i][3];
-      }
-
-      // Compute the output tile f = ZT F Z
-      for (int j = 0; j < 2; j++)
-      {
-        f[0][j] =  FZ[0][j] + FZ[1][j] + FZ[2][j];
-        f[1][j] =  FZ[1][j] - FZ[2][j] - FZ[3][j];
-      }
-
-      // Write out the output tile
-      for (int i = 0; i < cells_i; i++)
-      {
-        for (int j = 0; j < cells_j; j++)
-        {
-          *(outptrs[i][j]++) = f[i][j];
-        }
-      }
-    }
-  }
-}
-
-}  // namespace (anonymous)
-
-namespace winograd
-{
-using Tiles = OutputTransformImplTiles<3, 3, 4, 4, float>;
-
-template <>
-const Tiles::TileFn Tiles::tilefn_generic = winograd_output_transform_2x2_3x3_fp32_process_tile<false>;
-
-template <>
-const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_2x2_3x3_fp32_process_tile<true>;
-
-template <>
-const Tiles::TileFn Tiles::tilefn_bottom_padded[n_pad_bottom] = {
-  winograd_output_transform_2x2_3x3_fp32_process_tile<true, 1, 0>
-};
-
-template <>
-const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = {
-  winograd_output_transform_2x2_3x3_fp32_process_tile<true, 0, 1>
-};
-
-template class OutputTransform<3, 3, 4, 4, float>;
-}  // namespace winograd
-
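
Note: the F(2x2, 3x3) output transform removed above computes f = A^T F A (the code names A "Z") and then adds the optional per-channel bias before the store; from the scalar tail loop (FZ[i][0] = F[i][0] + F[i][1] + F[i][2], FZ[i][1] = F[i][1] - F[i][2] - F[i][3]):

  A^T = \begin{bmatrix} 1 & 1 & 1 & 0 \\ 0 & 1 & -1 & -1 \end{bmatrix}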
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp
deleted file mode 100644
index 60d7181..0000000
--- a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp
+++ /dev/null
@@ -1,369 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-
-namespace
-{
-
-template <bool Specialized, int PadBottom=0, int PadRight=0>
-void winograd_output_transform_2x2_5x5_fp32_process_tile(
-  const int n_channels,
-  const float* const matrix_base,
-  const int matrix_stride,
-  const float* const biases,
-  float* const output,
-  const int output_row_stride,
-  const int output_col_stride,
-  const int _pad_bottom,
-  const int _pad_right
-)
-{
-  constexpr int OutputTileRows = 2, OutputTileCols = 2;
-  const int pad_bottom = Specialized ? PadBottom : _pad_bottom;
-  const int pad_right = Specialized ? PadRight : _pad_right;
-
-  const int cells_i = OutputTileRows - pad_bottom;
-  const int cells_j = OutputTileCols - pad_right;
-
-  // Construct a map to the output cells
-  float *outptrs[OutputTileRows][OutputTileCols];
-  for (int i = 0; i < cells_i; i++)
-  {
-    for (int j = 0; j < cells_j; j++)
-    {
-      outptrs[i][j] = output + i*output_row_stride + j*output_col_stride;
-    }
-  }
-  const float *inptr = matrix_base;
-  const float *bptr = biases;
-
-  if (bptr)
-  {
-    // For each channel of the output
-    int channels_remaining = n_channels;
-#ifdef __aarch64__
-    for (; channels_remaining >= 4; channels_remaining -= 4)
-    {
-      // Matrices used and computed during this transform
-      float32x4_t F[6][6], FZ[6][2], f[2][2], b;
-
-      // Read a 6x6 tile in the Winograd domain
-      for (int i = 0, m = 0; i < 6; i++)
-      {
-        for (int j = 0; j < 6; j++, m++)
-        {
-          F[i][j] = vld1q_f32(inptr + m*matrix_stride);
-        }
-      }
-      inptr += 4;
-
-      // Compute the matrix F Z
-      for (int i = 0; i < 6; i++)
-      {
-        // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
-        FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
-
-        // FZ[i][1] =               1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4] +  1*F[i][5];
-        FZ[i][1] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
-      }
-
-      // Compute the output tile f = ZT F Z
-      for (int j = 0; j < 2; j++)
-      {
-        // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
-        f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
-
-        // f[1][j] =               1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j] +  1*FZ[5][j];
-        f[1][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
-      }
-
-      // Write out the output tile
-      b = vld1q_f32(bptr);
-      bptr += 4;
-      for (int i = 0; i < cells_i; i++)
-      {
-        for (int j = 0; j < cells_j; j++)
-        {
-          vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
-          outptrs[i][j] += 4;
-        }
-      }
-    }
-#endif  // __aarch64__
-#ifdef __arm_any__
-    for (; channels_remaining >= 2; channels_remaining -= 2)
-    {
-      // Matrices used and computed during this transform
-      float32x2_t F[6][6], FZ[6][2], f[2][2], b;
-
-      // Read a 6x6 tile in the Winograd domain
-      for (int i = 0, m = 0; i < 6; i++)
-      {
-        for (int j = 0; j < 6; j++, m++)
-        {
-          F[i][j] = vld1_f32(inptr + m*matrix_stride);
-        }
-      }
-      inptr += 2;
-
-      // Compute the matrix F Z
-      for (int i = 0; i < 6; i++)
-      {
-        // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
-        FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
-
-        // FZ[i][1] =               1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4] +  1*F[i][5];
-        FZ[i][1] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
-      }
-
-      // Compute the output tile f = ZT F Z
-      for (int j = 0; j < 2; j++)
-      {
-        // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
-        f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
-
-        // f[1][j] =               1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j] +  1*FZ[5][j];
-        f[1][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
-      }
-
-      // Write out the output tile
-      b = vld1_f32(bptr);
-      bptr += 2;
-      for (int i = 0; i < cells_i; i++)
-      {
-        for (int j = 0; j < cells_j; j++)
-        {
-          vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
-          outptrs[i][j] += 2;
-        }
-      }
-    }
-#endif  // __arm_any__
-    for (; channels_remaining; channels_remaining--)
-    {
-      // Matrices used and computed during this transform
-      float F[6][6], FZ[6][2], f[2][2], b;
-
-      // Read a 6x6 tile in the Winograd domain
-      for (int i = 0, m = 0; i < 6; i++)
-      {
-        for (int j = 0; j < 6; j++, m++)
-        {
-          F[i][j] = *(inptr + m*matrix_stride);
-        }
-      }
-      inptr++;
-
-      // Compute the matrix F Z
-      for (int i = 0; i < 6; i++)
-      {
-        FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
-        FZ[i][1] =               1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4] +  1*F[i][5];
-      }
-
-      // Compute the output tile f = ZT F Z
-      for (int j = 0; j < 2; j++)
-      {
-        f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
-        f[1][j] =                1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j] +  1*FZ[5][j];
-      }
-
-      // Write out the output tile
-      b = *(bptr++);
-      for (int i = 0; i < cells_i; i++)
-      {
-        for (int j = 0; j < cells_j; j++)
-        {
-          *(outptrs[i][j]++) = f[i][j] + b;
-        }
-      }
-    }
-  }
-  else
-  {
-    // For each channel of the output
-    int channels_remaining = n_channels;
-#ifdef __aarch64__
-    for (; channels_remaining >= 4; channels_remaining -= 4)
-    {
-      // Matrices used and computed during this transform
-      float32x4_t F[6][6], FZ[6][2], f[2][2];
-
-      // Read a 6x6 tile in the Winograd domain
-      for (int i = 0, m = 0; i < 6; i++)
-      {
-        for (int j = 0; j < 6; j++, m++)
-        {
-          F[i][j] = vld1q_f32(inptr + m*matrix_stride);
-        }
-      }
-      inptr += 4;
-
-      // Compute the matrix F Z
-      for (int i = 0; i < 6; i++)
-      {
-        // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
-        FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
-
-        // FZ[i][1] =               1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4] +  1*F[i][5];
-        FZ[i][1] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
-      }
-
-      // Compute the output tile f = ZT F Z
-      for (int j = 0; j < 2; j++)
-      {
-        // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
-        f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
-
-        // f[1][j] =               1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j] +  1*FZ[5][j];
-        f[1][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
-      }
-
-      // Write out the output tile
-      for (int i = 0; i < cells_i; i++)
-      {
-        for (int j = 0; j < cells_j; j++)
-        {
-          vst1q_f32(outptrs[i][j], f[i][j]);
-          outptrs[i][j] += 4;
-        }
-      }
-    }
-#endif  // __aarch64__
-#ifdef __arm_any__
-    for (; channels_remaining >= 2; channels_remaining -= 2)
-    {
-      // Matrices used and computed during this transform
-      float32x2_t F[6][6], FZ[6][2], f[2][2];
-
-      // Read a 6x6 tile in the Winograd domain
-      for (int i = 0, m = 0; i < 6; i++)
-      {
-        for (int j = 0; j < 6; j++, m++)
-        {
-          F[i][j] = vld1_f32(inptr + m*matrix_stride);
-        }
-      }
-      inptr += 2;
-
-      // Compute the matrix F Z
-      for (int i = 0; i < 6; i++)
-      {
-        // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
-        FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
-
-        // FZ[i][1] =               1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4] +  1*F[i][5];
-        FZ[i][1] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
-      }
-
-      // Compute the output tile f = ZT F Z
-      for (int j = 0; j < 2; j++)
-      {
-        // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
-        f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
-
-        // f[1][j] =               1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j] +  1*FZ[5][j];
-        f[1][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
-      }
-
-      // Write out the output tile
-      for (int i = 0; i < cells_i; i++)
-      {
-        for (int j = 0; j < cells_j; j++)
-        {
-          vst1_f32(outptrs[i][j], f[i][j]);
-          outptrs[i][j] += 2;
-        }
-      }
-    }
-#endif  // __arm_any__
-    for (; channels_remaining; channels_remaining--)
-    {
-      // Matrices used and computed during this transform
-      float F[6][6], FZ[6][2], f[2][2];
-
-      // Read a 6x6 tile in the Winograd domain
-      for (int i = 0, m = 0; i < 6; i++)
-      {
-        for (int j = 0; j < 6; j++, m++)
-        {
-          F[i][j] = *(inptr + m*matrix_stride);
-        }
-      }
-      inptr++;
-
-      // Compute the matrix F Z
-      for (int i = 0; i < 6; i++)
-      {
-        FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
-        FZ[i][1] =               1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4] +  1*F[i][5];
-      }
-
-      // Compute the output tile f = ZT F Z
-      for (int j = 0; j < 2; j++)
-      {
-        f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
-        f[1][j] =                1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j] +  1*FZ[5][j];
-      }
-
-      // Write out the output tile
-      for (int i = 0; i < cells_i; i++)
-      {
-        for (int j = 0; j < cells_j; j++)
-        {
-          *(outptrs[i][j]++) = f[i][j];
-        }
-      }
-    }
-  }
-}
-
-}  // namespace (anonymous)
-
-namespace winograd
-{
-using Tiles = OutputTransformImplTiles<5, 5, 6, 6, float>;
-
-template <>
-const Tiles::TileFn Tiles::tilefn_generic = winograd_output_transform_2x2_5x5_fp32_process_tile<false>;
-
-template <>
-const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_2x2_5x5_fp32_process_tile<true>;
-
-template <>
-const Tiles::TileFn Tiles::tilefn_bottom_padded[n_pad_bottom] = {
-  winograd_output_transform_2x2_5x5_fp32_process_tile<true, 1, 0>
-};
-
-template <>
-const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = {
-  winograd_output_transform_2x2_5x5_fp32_process_tile<true, 0, 1>
-};
-
-template class OutputTransform<5, 5, 6, 6, float>;
-}  // namespace winograd
-
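
Note: the F(2x2, 5x5) output transform removed above has the same f = A^T F A structure on 6x6 Winograd-domain tiles; reading the scalar tail loop gives:

  A^T = \begin{bmatrix} 1 & 1 & 1 & 1 & 1 & 0 \\ 0 & 1 & -1 & 2 & -2 & 1 \end{bmatrix}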
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp
deleted file mode 100644
index 15cc04b..0000000
--- a/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp
+++ /dev/null
@@ -1,428 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-
-namespace
-{
-
-template <bool Specialized, int PadBottom=0, int PadRight=0>
-void winograd_output_transform_4x4_3x3_fp32_process_tile(
-  const int n_channels,
-  const float* const matrix_base,
-  const int matrix_stride,
-  const float* const biases,
-  float* const output,
-  const int output_row_stride,
-  const int output_col_stride,
-  const int _pad_bottom,
-  const int _pad_right
-)
-{
-  const int pad_bottom = Specialized ? PadBottom : _pad_bottom;
-  const int pad_right = Specialized ? PadRight : _pad_right;
-  constexpr int TileRows = 4, TileCols = 4;
-
-  const int cells_i = TileRows - pad_bottom;
-  const int cells_j = TileCols - pad_right;
-
-  // Construct a map to the output cells
-  float *outptrs[TileRows][TileCols];
-  for (int i = 0; i < cells_i; i++)
-  {
-    for (int j = 0; j < cells_j; j++)
-    {
-      outptrs[i][j] = output + i*output_row_stride + j*output_col_stride;
-    }
-  }
-  const float *inptr = matrix_base;
-  const float *bptr = biases;
-
-  if (bptr)
-  {
-    // For each channel of the output
-    int channels_remaining = n_channels;
-#ifdef __aarch64__
-    for (; channels_remaining >= 4; channels_remaining -= 4)
-    {
-      // Matrices used and computed during this transform
-      float32x4_t F[6][6], FZ[6][4], f[4][4], b;
-
-      // Read a 6x6 tile in the Winograd domain
-      for (int i = 0, m = 0; i < 6; i++)
-      {
-        for (int j = 0; j < 6; j++, m++)
-        {
-          F[i][j] = vld1q_f32(inptr + m*matrix_stride);
-        }
-      }
-      inptr += 4;
-
-      // Compute the matrix F Z
-      for (int i = 0; i < 6; i++)
-      {
-        // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
-        FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
-
-        // FZ[i][1] =  1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4];
-        FZ[i][1] = vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f);
-
-        // FZ[i][2] =  1*F[i][1] +  1*F[i][2] +  4*F[i][3] +  4*F[i][4];
-        FZ[i][2] = vmlaq_n_f32(vaddq_f32(F[i][1], F[i][2]), vaddq_f32(F[i][3], F[i][4]), 4.0f);
-
-        // FZ[i][3] =  1*F[i][1] + -1*F[i][2] +  8*F[i][3] + -8*F[i][4] +  1*F[i][5];
-        FZ[i][3] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
-      }
-
-      // Compute the output tile f = ZT F Z
-      for (int j = 0; j < 4; j++)
-      {
-        // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
-        f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
-
-        // f[1][j] =  1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j];
-        f[1][j] = vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f);
-
-        // f[2][j] =  1*FZ[1][j] +  1*FZ[2][j] +  4*FZ[3][j] +  4*FZ[4][j];
-        f[2][j] = vmlaq_n_f32(vaddq_f32(FZ[1][j], FZ[2][j]), vaddq_f32(FZ[3][j], FZ[4][j]), 4.0f);
-
-        // f[3][j] =  1*FZ[1][j] + -1*FZ[2][j] +  8*FZ[3][j] + -8*FZ[4][j] +  1*FZ[5][j];
-        f[3][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
-      }
-
-      // Write out the output tile
-      b = vld1q_f32(bptr);
-      bptr += 4;
-      for (int i = 0; i < cells_i; i++)
-      {
-        for (int j = 0; j < cells_j; j++)
-        {
-          vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
-          outptrs[i][j] += 4;
-        }
-      }
-    }
-#endif  // __aarch64__
-#ifdef __arm_any__
-    for (; channels_remaining >= 2; channels_remaining -= 2)
-    {
-      // Matrices used and computed during this transform
-      float32x2_t F[6][6], FZ[6][4], f[4][4], b;
-
-      // Read a 6x6 tile in the Winograd domain
-      for (int i = 0, m = 0; i < 6; i++)
-      {
-        for (int j = 0; j < 6; j++, m++)
-        {
-          F[i][j] = vld1_f32(inptr + m*matrix_stride);
-        }
-      }
-      inptr += 2;
-
-      // Compute the matrix F Z
-      for (int i = 0; i < 6; i++)
-      {
-        // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
-        FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
-
-        // FZ[i][1] =  1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4];
-        FZ[i][1] = vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f);
-
-        // FZ[i][2] =  1*F[i][1] +  1*F[i][2] +  4*F[i][3] +  4*F[i][4];
-        FZ[i][2] = vmla_n_f32(vadd_f32(F[i][1], F[i][2]), vadd_f32(F[i][3], F[i][4]), 4.0f);
-
-        // FZ[i][3] =  1*F[i][1] + -1*F[i][2] +  8*F[i][3] + -8*F[i][4] +  1*F[i][5];
-        FZ[i][3] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
-      }
-
-      // Compute the output tile f = ZT F Z
-      for (int j = 0; j < 4; j++)
-      {
-        // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
-        f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
-
-        // f[1][j] =  1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j];
-        f[1][j] = vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f);
-
-        // f[2][j] =  1*FZ[1][j] +  1*FZ[2][j] +  4*FZ[3][j] +  4*FZ[4][j];
-        f[2][j] = vmla_n_f32(vadd_f32(FZ[1][j], FZ[2][j]), vadd_f32(FZ[3][j], FZ[4][j]), 4.0f);
-
-        // f[3][j] =  1*FZ[1][j] + -1*FZ[2][j] +  8*FZ[3][j] + -8*FZ[4][j] +  1*FZ[5][j];
-        f[3][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
-      }
-
-      // Write out the output tile
-      b = vld1_f32(bptr);
-      bptr += 2;
-      for (int i = 0; i < cells_i; i++)
-      {
-        for (int j = 0; j < cells_j; j++)
-        {
-          vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
-          outptrs[i][j] += 2;
-        }
-      }
-    }
-#endif  // __arm_any__
-    for (; channels_remaining; channels_remaining--)
-    {
-      // Matrices used and computed during this transform
-      float F[6][6], FZ[6][4], f[4][4], b;
-
-      // Read a 6x6 tile in the Winograd domain
-      for (int i = 0, m = 0; i < 6; i++)
-      {
-        for (int j = 0; j < 6; j++, m++)
-        {
-          F[i][j] = *(inptr + m*matrix_stride);
-        }
-      }
-      inptr++;
-
-      // Compute the matrix F Z
-      for (int i = 0; i < 6; i++)
-      {
-        FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
-        FZ[i][1] =  1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4];
-        FZ[i][2] =  1*F[i][1] +  1*F[i][2] +  4*F[i][3] +  4*F[i][4];
-        FZ[i][3] =  1*F[i][1] + -1*F[i][2] +  8*F[i][3] + -8*F[i][4] +  1*F[i][5];
-      }
-
-      // Compute the output tile f = ZT F Z
-      for (int j = 0; j < 4; j++)
-      {
-        f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
-        f[1][j] =  1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j];
-        f[2][j] =  1*FZ[1][j] +  1*FZ[2][j] +  4*FZ[3][j] +  4*FZ[4][j];
-        f[3][j] =  1*FZ[1][j] + -1*FZ[2][j] +  8*FZ[3][j] + -8*FZ[4][j] +  1*FZ[5][j];
-      }
-
-      // Write out the output tile
-      b = *(bptr++);
-      for (int i = 0; i < cells_i; i++)
-      {
-        for (int j = 0; j < cells_j; j++)
-        {
-          *(outptrs[i][j]++) = f[i][j] + b;
-        }
-      }
-    }
-  }
-  else
-  {
-    // For each channel of the output
-    int channels_remaining = n_channels;
-#ifdef __aarch64__
-    for (; channels_remaining >= 4; channels_remaining -= 4)
-    {
-      // Matrices used and computed during this transform
-      float32x4_t F[6][6], FZ[6][4], f[4][4];
-
-      // Read a 6x6 tile in the Winograd domain
-      for (int i = 0, m = 0; i < 6; i++)
-      {
-        for (int j = 0; j < 6; j++, m++)
-        {
-          F[i][j] = vld1q_f32(inptr + m*matrix_stride);
-        }
-      }
-      inptr += 4;
-
-      // Compute the matrix F Z
-      for (int i = 0; i < 6; i++)
-      {
-        // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
-        FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
-
-        // FZ[i][1] =  1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4];
-        FZ[i][1] = vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f);
-
-        // FZ[i][2] =  1*F[i][1] +  1*F[i][2] +  4*F[i][3] +  4*F[i][4];
-        FZ[i][2] = vmlaq_n_f32(vaddq_f32(F[i][1], F[i][2]), vaddq_f32(F[i][3], F[i][4]), 4.0f);
-
-        // FZ[i][3] =  1*F[i][1] + -1*F[i][2] +  8*F[i][3] + -8*F[i][4] +  1*F[i][5];
-        FZ[i][3] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
-      }
-
-      // Compute the output tile f = ZT F Z
-      for (int j = 0; j < 4; j++)
-      {
-        // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
-        f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
-
-        // f[1][j] =  1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j];
-        f[1][j] = vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f);
-
-        // f[2][j] =  1*FZ[1][j] +  1*FZ[2][j] +  4*FZ[3][j] +  4*FZ[4][j];
-        f[2][j] = vmlaq_n_f32(vaddq_f32(FZ[1][j], FZ[2][j]), vaddq_f32(FZ[3][j], FZ[4][j]), 4.0f);
-
-        // f[3][j] =  1*FZ[1][j] + -1*FZ[2][j] +  8*FZ[3][j] + -8*FZ[4][j] +  1*FZ[5][j];
-        f[3][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
-      }
-
-      // Write out the output tile
-      for (int i = 0; i < cells_i; i++)
-      {
-        for (int j = 0; j < cells_j; j++)
-        {
-          vst1q_f32(outptrs[i][j], f[i][j]);
-          outptrs[i][j] += 4;
-        }
-      }
-    }
-#endif  // __aarch64__
-#ifdef __arm_any__
-    for (; channels_remaining >= 2; channels_remaining -= 2)
-    {
-      // Matrices used and computed during this transform
-      float32x2_t F[6][6], FZ[6][4], f[4][4];
-
-      // Read a 6x6 tile in the Winograd domain
-      for (int i = 0, m = 0; i < 6; i++)
-      {
-        for (int j = 0; j < 6; j++, m++)
-        {
-          F[i][j] = vld1_f32(inptr + m*matrix_stride);
-        }
-      }
-      inptr += 2;
-
-      // Compute the matrix F Z
-      for (int i = 0; i < 6; i++)
-      {
-        // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
-        FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
-
-        // FZ[i][1] =  1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4];
-        FZ[i][1] = vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f);
-
-        // FZ[i][2] =  1*F[i][1] +  1*F[i][2] +  4*F[i][3] +  4*F[i][4];
-        FZ[i][2] = vmla_n_f32(vadd_f32(F[i][1], F[i][2]), vadd_f32(F[i][3], F[i][4]), 4.0f);
-
-        // FZ[i][3] =  1*F[i][1] + -1*F[i][2] +  8*F[i][3] + -8*F[i][4] +  1*F[i][5];
-        FZ[i][3] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
-      }
-
-      // Compute the output tile f = ZT F Z
-      for (int j = 0; j < 4; j++)
-      {
-        // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
-        f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
-
-        // f[1][j] =  1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j];
-        f[1][j] = vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f);
-
-        // f[2][j] =  1*FZ[1][j] +  1*FZ[2][j] +  4*FZ[3][j] +  4*FZ[4][j];
-        f[2][j] = vmla_n_f32(vadd_f32(FZ[1][j], FZ[2][j]), vadd_f32(FZ[3][j], FZ[4][j]), 4.0f);
-
-        // f[3][j] =  1*FZ[1][j] + -1*FZ[2][j] +  8*FZ[3][j] + -8*FZ[4][j] +  1*FZ[5][j];
-        f[3][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
-      }
-
-      // Write out the output tile
-      for (int i = 0; i < cells_i; i++)
-      {
-        for (int j = 0; j < cells_j; j++)
-        {
-          vst1_f32(outptrs[i][j], f[i][j]);
-          outptrs[i][j] += 2;
-        }
-      }
-    }
-#endif  // __arm_any__
-    for (; channels_remaining; channels_remaining--)
-    {
-      // Matrices used and computed during this transform
-      float F[6][6], FZ[6][4], f[4][4];
-
-      // Read a 6x6 tile in the Winograd domain
-      for (int i = 0, m = 0; i < 6; i++)
-      {
-        for (int j = 0; j < 6; j++, m++)
-        {
-          F[i][j] = *(inptr + m*matrix_stride);
-        }
-      }
-      inptr++;
-
-      // Compute the matrix F Z
-      for (int i = 0; i < 6; i++)
-      {
-        FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
-        FZ[i][1] =  1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4];
-        FZ[i][2] =  1*F[i][1] +  1*F[i][2] +  4*F[i][3] +  4*F[i][4];
-        FZ[i][3] =  1*F[i][1] + -1*F[i][2] +  8*F[i][3] + -8*F[i][4] +  1*F[i][5];
-      }
-
-      // Compute the output tile f = ZT F Z
-      for (int j = 0; j < 4; j++)
-      {
-        f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
-        f[1][j] =  1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j];
-        f[2][j] =  1*FZ[1][j] +  1*FZ[2][j] +  4*FZ[3][j] +  4*FZ[4][j];
-        f[3][j] =  1*FZ[1][j] + -1*FZ[2][j] +  8*FZ[3][j] + -8*FZ[4][j] +  1*FZ[5][j];
-      }
-
-      // Write out the output tile
-      for (int i = 0; i < cells_i; i++)
-      {
-        for (int j = 0; j < cells_j; j++)
-        {
-          *(outptrs[i][j]++) = f[i][j];
-        }
-      }
-    }
-  }
-}
-
-}  // namespace (anonymous)
-
-namespace winograd
-{
-using Tiles = OutputTransformImplTiles<3, 3, 6, 6, float>;
-
-template <>
-const Tiles::TileFn Tiles::tilefn_generic = winograd_output_transform_4x4_3x3_fp32_process_tile<false>;
-
-template <>
-const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_4x4_3x3_fp32_process_tile<true>;
-
-template <>
-const Tiles::TileFn Tiles::tilefn_bottom_padded[n_pad_bottom] = {
-  winograd_output_transform_4x4_3x3_fp32_process_tile<true, 1, 0>,
-  winograd_output_transform_4x4_3x3_fp32_process_tile<true, 2, 0>,
-  winograd_output_transform_4x4_3x3_fp32_process_tile<true, 3, 0>,
-};
-
-template <>
-const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = {
-  winograd_output_transform_4x4_3x3_fp32_process_tile<true, 0, 1>,
-  winograd_output_transform_4x4_3x3_fp32_process_tile<true, 0, 2>,
-  winograd_output_transform_4x4_3x3_fp32_process_tile<true, 0, 3>,
-};
-
-template class OutputTransform<3, 3, 6, 6, float>;
-}  // namespace winograd
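
Note: likewise, the F(4x4, 3x3) output transform removed above applies, per the scalar tail loop:

  A^T = \begin{bmatrix}
          1 & 1 &  1 & 1 &  1 & 0 \\
          0 & 1 & -1 & 2 & -2 & 0 \\
          0 & 1 &  1 & 4 &  4 & 0 \\
          0 & 1 & -1 & 8 & -8 & 1
        \end{bmatrix}

In all three output transforms padding only clips the store loops (cells_i = tile rows - pad_bottom, cells_j = tile cols - pad_right); the full Winograd-domain tile is still read and transformed.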
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/weights_2_7_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/weights_2_7_fp32.cpp
deleted file mode 100644
index 85cf418..0000000
--- a/src/core/NEON/kernels/convolution/winograd/transforms/weights_2_7_fp32.cpp
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/kernel.hpp"
-
-namespace winograd
-{
-  template <>
-  template <>
-  void WinogradGEMM<1, 2, 1, 7>::WeightsTransform<float>::execute(
-    const int n_output_channels,
-    const int n_input_channels,
-    const float* const input,  // NOTE: Data in HWIO order
-    float* const output,
-    const int matrix_stride,
-    const int matrix_row_stride
-  )
-  {
-    // Get pointers to each cell of the weight tensor
-    const auto weight_col_stride = n_input_channels * n_output_channels;
-    const float *inptrs[kernel_cols];
-    for (int j = 0; j < kernel_cols; j++)
-    {
-      inptrs[j] = input + j*weight_col_stride;
-    }
-
-    // For each input channel
-    for (int ic = 0; ic < n_input_channels; ic++)
-    {
-      float *outptr = output + ic * matrix_row_stride;
-
-      // For each output channel
-      int channels_remaining = n_output_channels;
-      for (; channels_remaining; channels_remaining--)
-      {
-        // Matrices used and computed in this kernel
-        float w[kernel_cols], V[inner_tile_cols];
-
-        // Read weights
-        for (int j = 0; j < kernel_cols; j++)
-        {
-          w[j] = *(inptrs[j]++);
-        }
-
-        // Compute V = w WT
-        V[0] = (w[0]*-1) / 36.0f;
-        V[1] = (w[1]*-1 + w[3]*-1 + w[5]*-1 + w[0]*1 + w[2]*1 + w[4]*1 + w[6]*1) / 48.0f;
-        V[2] = (w[0]*1 + w[1]*1 + w[2]*1 + w[3]*1 + w[4]*1 + w[5]*1 + w[6]*1) / 48.0f;
-        V[3] = (w[0]*-1 + w[6]*-64 + w[4]*-16 + w[2]*-4 + w[1]*2 + w[3]*8 + w[5]*32) / 120.0f;
-        V[4] = (w[0]*-1 + w[6]*-64 + w[5]*-32 + w[4]*-16 + w[3]*-8 + w[2]*-4 + w[1]*-2) / 120.0f;
-        V[5] = (w[5]*-243 + w[3]*-27 + w[1]*-3 + w[2]*9 + w[4]*81 + w[6]*729 + w[0]*1) / 720.0f;
-        V[6] = (w[1]*3 + w[2]*9 + w[3]*27 + w[4]*81 + w[5]*243 + w[6]*729 + w[0]*1) / 720.0f;
-        V[7] = (w[6]*1) / 1.0f;
-
-        // Store the transformed weights
-        for (int j = 0; j < inner_tile_cols; j++)
-        {
-          *(outptr + j*matrix_stride) = V[j];
-        }
-        outptr++;
-      }
-    }
-  }
-
-  template <>
-  template <>
-  int WinogradGEMM<1, 2, 1, 7>::WeightsTransform<float>::ops_performed(const KernelShape &shape)
-  {
-    (void) shape;
-    return 0;  // TODO
-  }
-
-  template <>
-  template <>
-  void WinogradGEMM<2, 1, 7, 1>::WeightsTransform<float>::execute(
-    const int n_output_channels,
-    const int n_input_channels,
-    const float* const input,  // NOTE: Data in HWIO order
-    float* const output,
-    const int matrix_stride,
-    const int matrix_row_stride
-  )
-  {
-    // Redirect to the 1xN implementation
-    WinogradGEMM<1, 2, 1, 7>::template WeightsTransform<float>::execute(
-      n_output_channels, n_input_channels, input, output, matrix_stride,
-      matrix_row_stride
-    );
-  }
-
-  template <>
-  template <>
-  int WinogradGEMM<2, 1, 7, 1>::WeightsTransform<float>::ops_performed(const KernelShape &shape)
-  {
-    (void) shape;
-    return 0;  // TODO
-  }
-
-  template struct WinogradGEMM<1, 2, 1, 7>::WeightsTransform<float>;
-  template struct WinogradGEMM<2, 1, 7, 1>::WeightsTransform<float>;
-}  // namespace winograd
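
Note: the F(2, 7) weights transform removed above computes the eight transformed values V = G w per (input, output) channel pair, treating the seven kernel taps w as a column vector (the code spells the product "w WT"). Factoring the divisors out as D = diag(1/36, 1/48, 1/48, 1/120, 1/120, 1/720, 1/720, 1), this is V = D \tilde{G} w with integer rows

  \tilde{G} = \begin{bmatrix}
    -1 &  0 &  0 &   0 &   0 &    0 &    0 \\
     1 & -1 &  1 &  -1 &   1 &   -1 &    1 \\
     1 &  1 &  1 &   1 &   1 &    1 &    1 \\
    -1 &  2 & -4 &   8 & -16 &   32 &  -64 \\
    -1 & -2 & -4 &  -8 & -16 &  -32 &  -64 \\
     1 & -3 &  9 & -27 &  81 & -243 &  729 \\
     1 &  3 &  9 &  27 &  81 &  243 &  729 \\
     0 &  0 &  0 &   0 &   0 &    0 &    1
  \end{bmatrix}

The 7x1 kernel simply forwards to this 1x7 implementation, so a single transform serves both orientations.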
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_3x3_fp32.cpp
deleted file mode 100644
index 6c71461..0000000
--- a/src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_3x3_fp32.cpp
+++ /dev/null
@@ -1,228 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/kernel.hpp"
-
-namespace winograd
-{
-  template <>
-  template <>
-  void WinogradGEMM<2, 2, 3, 3>::WeightsTransform<float>::execute(
-    const int n_output_channels,
-    const int n_input_channels,
-    const float* const input,
-    float* const output,
-    const int matrix_stride,
-    const int matrix_row_stride
-  )
-  {
-    constexpr int inner_tile_i = 4;
-    constexpr int inner_tile_j = 4;
-
-    // Get pointers to each cell of the weight tensor
-    const auto weight_col_stride = n_input_channels * n_output_channels;
-    const auto weight_row_stride = 3 * weight_col_stride;
-    const float *inptrs[3][3];
-    for (int i = 0; i < 3; i++)
-    {
-      for (int j = 0; j < 3; j++)
-      {
-        inptrs[i][j] = input + i*weight_row_stride + j*weight_col_stride;
-      }
-    }
-
-    // For each input channel
-    for (int ic = 0; ic < n_input_channels; ic++)
-    {
-      float *outptr = output + ic * matrix_row_stride;
-
-      // For each output channel
-      int channels_remaining = n_output_channels;
-#ifdef __aarch64__
-      for (; channels_remaining >= 4; channels_remaining -= 4)
-      {
-        // Matrices used and computed in this kernel
-        float32x4_t w[3][3], Ww[inner_tile_i][3], V[inner_tile_i][inner_tile_j];
-
-        // Read weights
-        for (int i = 0; i < 3; i++)
-        {
-          for (int j = 0; j < 3; j++)
-          {
-            w[i][j] = vld1q_f32(inptrs[i][j]);
-            inptrs[i][j] += 4;
-          }
-        }
-
-        // Compute the matrix W w
-        for (int j = 0; j < 3; j++)
-        {
-          Ww[0][j] = w[0][j];
-
-          // Ww[1][j] = 0.5*(w[0][j] + w[1][j] + w[2][j]);
-          Ww[1][j] = vmulq_n_f32(vaddq_f32(vaddq_f32(w[0][j], w[1][j]), w[2][j]), 0.5f);
-
-          // Ww[2][j] = 0.5*(w[0][j] - w[1][j] + w[2][j]);
-          Ww[2][j] = vmulq_n_f32(vaddq_f32(vsubq_f32(w[0][j], w[1][j]), w[2][j]), 0.5f);
-
-          Ww[3][j] = w[2][j];
-        }
-
-        // Compute V = W w WT
-        for (int i = 0; i < inner_tile_i; i++)
-        {
-          V[i][0] = Ww[i][0];
-
-          // V[i][1] = 0.5*(Ww[i][0] + Ww[i][1] + Ww[i][2]);
-          V[i][1] = vmulq_n_f32(vaddq_f32(vaddq_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f);
-
-          // V[i][2] = 0.5*(Ww[i][0] - Ww[i][1] + Ww[i][2]);
-          V[i][2] = vmulq_n_f32(vaddq_f32(vsubq_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f);
-
-          V[i][3] = Ww[i][2];
-        }
-
-        // Store the transformed weights
-        for (int i = 0, m = 0; i < inner_tile_i; i++)
-        {
-          for (int j = 0; j < inner_tile_j; j++, m++)
-          {
-            vst1q_f32(outptr + m*matrix_stride, V[i][j]);
-          }
-        }
-        outptr += 4;
-      }
-#endif  // __aarch64__
-#ifdef __arm_any__
-      for (; channels_remaining >= 2; channels_remaining -= 2)
-      {
-        // Matrices used and computed in this kernel
-        float32x2_t w[3][3], Ww[inner_tile_i][3], V[inner_tile_i][inner_tile_j];
-
-        // Read weights
-        for (int i = 0; i < 3; i++)
-        {
-          for (int j = 0; j < 3; j++)
-          {
-            w[i][j] = vld1_f32(inptrs[i][j]);
-            inptrs[i][j] += 2;
-          }
-        }
-
-        // Compute the matrix W w
-        for (int j = 0; j < 3; j++)
-        {
-          Ww[0][j] = w[0][j];
-
-          // Ww[1][j] = 0.5*(w[0][j] + w[1][j] + w[2][j]);
-          Ww[1][j] = vmul_n_f32(vadd_f32(vadd_f32(w[0][j], w[1][j]), w[2][j]), 0.5f);
-
-          // Ww[2][j] = 0.5*(w[0][j] - w[1][j] + w[2][j]);
-          Ww[2][j] = vmul_n_f32(vadd_f32(vsub_f32(w[0][j], w[1][j]), w[2][j]), 0.5f);
-
-          Ww[3][j] = w[2][j];
-        }
-
-        // Compute V = W w WT
-        for (int i = 0; i < inner_tile_i; i++)
-        {
-          V[i][0] = Ww[i][0];
-
-          // V[i][1] = 0.5*(Ww[i][0] + Ww[i][1] + Ww[i][2]);
-          V[i][1] = vmul_n_f32(vadd_f32(vadd_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f);
-
-          // V[i][2] = 0.5*(Ww[i][0] - Ww[i][1] + Ww[i][2]);
-          V[i][2] = vmul_n_f32(vadd_f32(vsub_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f);
-
-          V[i][3] = Ww[i][2];
-        }
-
-        // Store the transformed weights
-        for (int i = 0, m = 0; i < inner_tile_i; i++)
-        {
-          for (int j = 0; j < inner_tile_j; j++, m++)
-          {
-            vst1_f32(outptr + m*matrix_stride, V[i][j]);
-          }
-        }
-        outptr += 2;
-      }
-#endif  // __arm_any__
-      for (; channels_remaining; channels_remaining--)
-      {
-        // Matrices used and computed in this kernel
-        float w[3][3], Ww[inner_tile_i][3], V[inner_tile_i][inner_tile_j];
-
-        // Read weights
-        for (int i = 0; i < 3; i++)
-        {
-          for (int j = 0; j < 3; j++)
-          {
-            w[i][j] = *(inptrs[i][j]++);
-          }
-        }
-
-        // Compute the matrix W w
-        for (int j = 0; j < 3; j++)
-        {
-          Ww[0][j] = w[0][j];
-          Ww[1][j] = 0.5*(w[0][j] + w[1][j] + w[2][j]);
-          Ww[2][j] = 0.5*(w[0][j] - w[1][j] + w[2][j]);
-          Ww[3][j] = w[2][j];
-        }
-
-        // Compute V = W w WT
-        for (int i = 0; i < inner_tile_i; i++)
-        {
-          V[i][0] = Ww[i][0];
-          V[i][1] = 0.5*(Ww[i][0] + Ww[i][1] + Ww[i][2]);
-          V[i][2] = 0.5*(Ww[i][0] - Ww[i][1] + Ww[i][2]);
-          V[i][3] = Ww[i][2];
-        }
-
-        // Store the transformed weights
-        for (int i = 0, m = 0; i < inner_tile_i; i++)
-        {
-          for (int j = 0; j < inner_tile_j; j++, m++)
-          {
-            *(outptr + m*matrix_stride) = V[i][j];
-          }
-        }
-        outptr++;
-      }
-    }
-  }
-
-  template <>
-  template <>
-  int WinogradGEMM<2, 2, 3, 3>::WeightsTransform<float>::ops_performed(const KernelShape &shape)
-  {
-    const int channel_prod = shape.n_input_channels * shape.n_output_channels;
-    return 2 * 18 * channel_prod;
-  }
-
-  template struct WinogradGEMM<2, 2, 3, 3>::WeightsTransform<float>;
-}  // namespace winograd
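
Annotation: for reference, the deleted kernel above computes the standard F(2x2, 3x3) weight transform — each 3x3 kernel g is mapped to a 4x4 Winograd-domain tile U = G g G^T, where the rows of G can be read directly off the Ww[0..3] assignments:

\[
G = \begin{pmatrix} 1 & 0 & 0 \\ \tfrac{1}{2} & \tfrac{1}{2} & \tfrac{1}{2} \\ \tfrac{1}{2} & -\tfrac{1}{2} & \tfrac{1}{2} \\ 0 & 0 & 1 \end{pmatrix}, \qquad U = G\,g\,G^{\mathsf{T}}.
\]

The NEON paths evaluate the same product four (or two) output channels at a time; the scalar tail handles whatever remains one channel at a time.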
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_5x5_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_5x5_fp32.cpp
deleted file mode 100644
index 2f4f6e1..0000000
--- a/src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_5x5_fp32.cpp
+++ /dev/null
@@ -1,408 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/kernel.hpp"
-
-namespace winograd
-{
-  template <>
-  template <>
-  void WinogradGEMM<2, 2, 5, 5>::WeightsTransform<float>::execute(
-    const int n_output_channels,
-    const int n_input_channels,
-    const float* const input,
-    float* const output,
-    const int matrix_stride,
-    const int matrix_row_stride
-  )
-  {
-    // Get pointers to each cell of the weight tensor
-    const auto weight_col_stride = n_input_channels * n_output_channels;
-    const auto weight_row_stride = 5 * weight_col_stride;
-    const float *inptrs[5][5];
-    for (int i = 0; i < 5; i++)
-    {
-      for (int j = 0; j < 5; j++)
-      {
-        inptrs[i][j] = input + i*weight_row_stride + j*weight_col_stride;
-      }
-    }
-
-    // For each input channel
-    for (int ic = 0; ic < n_input_channels; ic++)
-    {
-      float *outptr = output + ic * matrix_row_stride;
-
-      // For each output channel
-      int channels_remaining = n_output_channels;
-#ifdef __aarch64__
-      for (; channels_remaining >= 4; channels_remaining -= 4)
-      {
-        // Matrices used and computed in this kernel
-        float32x4_t w[5][5], Ww[6][5], V[6][6];
-
-        // Read weights
-        for (int i = 0; i < 5; i++)
-        {
-          for (int j = 0; j < 5; j++)
-          {
-            w[i][j] = vld1q_f32(inptrs[i][j]);
-            inptrs[i][j] += 4;
-          }
-        }
-
-        // Compute the matrix W w
-        for (int j = 0; j < 5; j++)
-        {
-          // Ww[0][j] = w[0][j]/4.0f;
-          Ww[0][j] = vmulq_n_f32(w[0][j], 1.0f/4.0f);
-
-          // Ww[1][j] = -( w[0][j] + w[1][j] + w[2][j] + w[3][j] + w[4][j])/6.0f;
-          Ww[1][j] = vmulq_n_f32(
-            vaddq_f32(
-              vaddq_f32(
-                vaddq_f32(w[1][j], w[0][j]),
-                vaddq_f32(w[3][j], w[2][j])
-              ),
-              w[4][j]
-            ),
-            -1.0f/6.0f
-          );
-
-          // Ww[2][j] = +(-w[0][j] + w[1][j] - w[2][j] + w[3][j] - w[4][j])/6.0f;
-          // Ww[2][j] = ((w[1][j] - w[0][j]) + (w[3][j] - w[2][j]) - w[4][j])/6.0f;
-          Ww[2][j] = vmulq_n_f32(
-            vsubq_f32(
-              vaddq_f32(
-                vsubq_f32(w[1][j], w[0][j]),
-                vsubq_f32(w[3][j], w[2][j])
-              ),
-              w[4][j]
-            ),
-            1.0f/6.0f
-          );
-
-          // Ww[3][j] = (w[0][j]/8.0f + w[1][j]/4.0f + w[2][j]/2.0f + w[3][j] + 2*w[4][j])/3.0f;
-          Ww[3][j] = vmulq_n_f32(
-            vmlaq_n_f32(
-              vaddq_f32(
-                vaddq_f32(vmulq_n_f32(w[0][j], 1.0f/8.0f), vmulq_n_f32(w[1][j], 1.0f/4.0f)),
-                vaddq_f32(vmulq_n_f32(w[2][j], 1.0f/2.0f), w[3][j])
-              ),
-              w[4][j], 2.0f
-            ),
-            1.0f/3.0f
-          );
-
-          // Ww[4][j] = (w[0][j]/8.0f - w[1][j]/4.0f + w[2][j]/2.0f - w[3][j] + 2*w[4][j])/3.0f;
-          Ww[4][j] = vmulq_n_f32(
-            vmlaq_n_f32(
-              vaddq_f32(
-                vsubq_f32(vmulq_n_f32(w[0][j], 1.0f/8.0f), vmulq_n_f32(w[1][j], 1.0f/4.0f)),
-                vsubq_f32(vmulq_n_f32(w[2][j], 1.0f/2.0f), w[3][j])
-              ),
-              w[4][j], 2.0f
-            ),
-            1.0f/3.0f
-          );
-
-          // Ww[5][j] = w[4][j];
-          Ww[5][j] = w[4][j];
-        }
-
-        // Compute V = W w WT
-        for (int i = 0; i < 6; i++)
-        {
-          // V[i][0] = Ww[i][0]/4.0f;
-          V[i][0] = vmulq_n_f32(Ww[i][0], 1.0f/4.0f);
-
-          // V[i][1] = -( Ww[i][0] + Ww[i][1] + Ww[i][2] + Ww[i][3] + Ww[i][4])/6.0f;
-          V[i][1] = vmulq_n_f32(
-            vaddq_f32(
-              vaddq_f32(
-                vaddq_f32(Ww[i][1], Ww[i][0]),
-                vaddq_f32(Ww[i][3], Ww[i][2])
-              ),
-              Ww[i][4]
-            ),
-            -1.0f/6.0f
-          );
-
-          // V[i][2] = +(-Ww[i][0] + Ww[i][1] - Ww[i][2] + Ww[i][3] - Ww[i][4])/6.0f;
-          // V[i][2] = ((Ww[i][1] - Ww[i][0]) + (Ww[i][3] - Ww[i][2]) - Ww[i][4])/6.0f;
-          V[i][2] = vmulq_n_f32(
-            vsubq_f32(
-              vaddq_f32(
-                vsubq_f32(Ww[i][1], Ww[i][0]),
-                vsubq_f32(Ww[i][3], Ww[i][2])
-              ),
-              Ww[i][4]
-            ),
-            1.0f/6.0f
-          );
-
-          // V[i][3] = (Ww[i][0]/8.0f + Ww[i][1]/4.0f + Ww[i][2]/2.0f + Ww[i][3] + 2*Ww[i][4])/3.0f;
-          V[i][3] = vmulq_n_f32(
-            vmlaq_n_f32(
-              vaddq_f32(
-                vaddq_f32(vmulq_n_f32(Ww[i][0], 1.0f/8.0f), vmulq_n_f32(Ww[i][1], 1.0f/4.0f)),
-                vaddq_f32(vmulq_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3])
-              ),
-              Ww[i][4], 2.0f
-            ),
-            1.0f/3.0f
-          );
-
-          // V[i][4] = (Ww[i][0]/8.0f - Ww[i][1]/4.0f + Ww[i][2]/2.0f - Ww[i][3] + 2*Ww[i][4])/3.0f;
-          V[i][4] = vmulq_n_f32(
-            vmlaq_n_f32(
-              vaddq_f32(
-                vsubq_f32(vmulq_n_f32(Ww[i][0], 1.0f/8.0f), vmulq_n_f32(Ww[i][1], 1.0f/4.0f)),
-                vsubq_f32(vmulq_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3])
-              ),
-              Ww[i][4], 2.0f
-            ),
-            1.0f/3.0f
-          );
-
-          // V[i][5] = Ww[i][4];
-          V[i][5] = Ww[i][4];
-        }
-
-        // Store the transformed weights
-        for (int i = 0, m = 0; i < 6; i++)
-        {
-          for (int j = 0; j < 6; j++, m++)
-          {
-            vst1q_f32(outptr + m*matrix_stride, V[i][j]);
-          }
-        }
-        outptr += 4;
-      }
-#endif  // __aarch64__
-#ifdef __arm_any__
-      for (; channels_remaining >= 2; channels_remaining -= 2)
-      {
-        // Matrices used and computed in this kernel
-        float32x2_t w[5][5], Ww[6][5], V[6][6];
-
-        // Read weights
-        for (int i = 0; i < 5; i++)
-        {
-          for (int j = 0; j < 5; j++)
-          {
-            w[i][j] = vld1_f32(inptrs[i][j]);
-            inptrs[i][j] += 2;
-          }
-        }
-
-        // Compute the matrix W w
-        for (int j = 0; j < 5; j++)
-        {
-          // Ww[0][j] = w[0][j]/4.0f;
-          Ww[0][j] = vmul_n_f32(w[0][j], 1.0f/4.0f);
-
-          // Ww[1][j] = -( w[0][j] + w[1][j] + w[2][j] + w[3][j] + w[4][j])/6.0f;
-          Ww[1][j] = vmul_n_f32(
-            vadd_f32(
-              vadd_f32(
-                vadd_f32(w[1][j], w[0][j]),
-                vadd_f32(w[3][j], w[2][j])
-              ),
-              w[4][j]
-            ),
-            -1.0f/6.0f
-          );
-
-          // Ww[2][j] = +(-w[0][j] + w[1][j] - w[2][j] + w[3][j] - w[4][j])/6.0f;
-          // Ww[2][j] = ((w[1][j] - w[0][j]) + (w[3][j] - w[2][j]) - w[4][j])/6.0f;
-          Ww[2][j] = vmul_n_f32(
-            vsub_f32(
-              vadd_f32(
-                vsub_f32(w[1][j], w[0][j]),
-                vsub_f32(w[3][j], w[2][j])
-              ),
-              w[4][j]
-            ),
-            1.0f/6.0f
-          );
-
-          // Ww[3][j] = (w[0][j]/8.0f + w[1][j]/4.0f + w[2][j]/2.0f + w[3][j] + 2*w[4][j])/3.0f;
-          Ww[3][j] = vmul_n_f32(
-            vmla_n_f32(
-              vadd_f32(
-                vadd_f32(vmul_n_f32(w[0][j], 1.0f/8.0f), vmul_n_f32(w[1][j], 1.0f/4.0f)),
-                vadd_f32(vmul_n_f32(w[2][j], 1.0f/2.0f), w[3][j])
-              ),
-              w[4][j], 2.0f
-            ),
-            1.0f/3.0f
-          );
-
-          // Ww[4][j] = (w[0][j]/8.0f - w[1][j]/4.0f + w[2][j]/2.0f - w[3][j] + 2*w[4][j])/3.0f;
-          Ww[4][j] = vmul_n_f32(
-            vmla_n_f32(
-              vadd_f32(
-                vsub_f32(vmul_n_f32(w[0][j], 1.0f/8.0f), vmul_n_f32(w[1][j], 1.0f/4.0f)),
-                vsub_f32(vmul_n_f32(w[2][j], 1.0f/2.0f), w[3][j])
-              ),
-              w[4][j], 2.0f
-            ),
-            1.0f/3.0f
-          );
-
-          // Ww[5][j] = w[4][j];
-          Ww[5][j] = w[4][j];
-        }
-
-        // Compute V = W w WT
-        for (int i = 0; i < 6; i++)
-        {
-          // V[i][0] = Ww[i][0]/4.0f;
-          V[i][0] = vmul_n_f32(Ww[i][0], 1.0f/4.0f);
-
-          // V[i][1] = -( Ww[i][0] + Ww[i][1] + Ww[i][2] + Ww[i][3] + Ww[i][4])/6.0f;
-          V[i][1] = vmul_n_f32(
-            vadd_f32(
-              vadd_f32(
-                vadd_f32(Ww[i][1], Ww[i][0]),
-                vadd_f32(Ww[i][3], Ww[i][2])
-              ),
-              Ww[i][4]
-            ),
-            -1.0f/6.0f
-          );
-
-          // V[i][2] = +(-Ww[i][0] + Ww[i][1] - Ww[i][2] + Ww[i][3] - Ww[i][4])/6.0f;
-          // V[i][2] = ((Ww[i][1] - Ww[i][0]) + (Ww[i][3] - Ww[i][2]) - Ww[i][4])/6.0f;
-          V[i][2] = vmul_n_f32(
-            vsub_f32(
-              vadd_f32(
-                vsub_f32(Ww[i][1], Ww[i][0]),
-                vsub_f32(Ww[i][3], Ww[i][2])
-              ),
-              Ww[i][4]
-            ),
-            1.0f/6.0f
-          );
-
-          // V[i][3] = (Ww[i][0]/8.0f + Ww[i][1]/4.0f + Ww[i][2]/2.0f + Ww[i][3] + 2*Ww[i][4])/3.0f;
-          V[i][3] = vmul_n_f32(
-            vmla_n_f32(
-              vadd_f32(
-                vadd_f32(vmul_n_f32(Ww[i][0], 1.0f/8.0f), vmul_n_f32(Ww[i][1], 1.0f/4.0f)),
-                vadd_f32(vmul_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3])
-              ),
-              Ww[i][4], 2.0f
-            ),
-            1.0f/3.0f
-          );
-
-          // V[i][4] = (Ww[i][0]/8.0f - Ww[i][1]/4.0f + Ww[i][2]/2.0f - Ww[i][3] + 2*Ww[i][4])/3.0f;
-          V[i][4] = vmul_n_f32(
-            vmla_n_f32(
-              vadd_f32(
-                vsub_f32(vmul_n_f32(Ww[i][0], 1.0f/8.0f), vmul_n_f32(Ww[i][1], 1.0f/4.0f)),
-                vsub_f32(vmul_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3])
-              ),
-              Ww[i][4], 2.0f
-            ),
-            1.0f/3.0f
-          );
-
-          // V[i][5] = Ww[i][4];
-          V[i][5] = Ww[i][4];
-        }
-
-        // Store the transformed weights
-        for (int i = 0, m = 0; i < 6; i++)
-        {
-          for (int j = 0; j < 6; j++, m++)
-          {
-            vst1_f32(outptr + m*matrix_stride, V[i][j]);
-          }
-        }
-        outptr += 2;
-      }
-#endif  // __arm_any__
-      for (; channels_remaining; channels_remaining--)
-      {
-        // Matrices used and computed in this kernel
-        float w[5][5], Ww[6][5], V[6][6];
-
-        // Read weights
-        for (int i = 0; i < 5; i++)
-        {
-          for (int j = 0; j < 5; j++)
-          {
-            w[i][j] = *(inptrs[i][j]++);
-          }
-        }
-
-        // Compute the matrix W w
-        for (int j = 0; j < 5; j++)
-        {
-          Ww[0][j] = w[0][j]/4.0f;
-          Ww[1][j] = -( w[0][j] + w[1][j] + w[2][j] + w[3][j] + w[4][j])/6.0f;
-          Ww[2][j] = +(-w[0][j] + w[1][j] - w[2][j] + w[3][j] - w[4][j])/6.0f;
-          Ww[3][j] = (w[0][j]/8.0f + w[1][j]/4.0f + w[2][j]/2.0f + w[3][j] + 2*w[4][j])/3.0f;
-          Ww[4][j] = (w[0][j]/8.0f - w[1][j]/4.0f + w[2][j]/2.0f - w[3][j] + 2*w[4][j])/3.0f;
-          Ww[5][j] = w[4][j];
-        }
-
-        // Compute V = W w WT
-        for (int i = 0; i < 6; i++)
-        {
-          V[i][0] = Ww[i][0]/4.0f;
-          V[i][1] = -( Ww[i][0] + Ww[i][1] + Ww[i][2] + Ww[i][3] + Ww[i][4])/6.0f;
-          V[i][2] = +(-Ww[i][0] + Ww[i][1] - Ww[i][2] + Ww[i][3] - Ww[i][4])/6.0f;
-          V[i][3] = (Ww[i][0]/8.0f + Ww[i][1]/4.0f + Ww[i][2]/2.0f + Ww[i][3] + 2*Ww[i][4])/3.0f;
-          V[i][4] = (Ww[i][0]/8.0f - Ww[i][1]/4.0f + Ww[i][2]/2.0f - Ww[i][3] + 2*Ww[i][4])/3.0f;
-          V[i][5] = Ww[i][4];
-        }
-
-        // Store the transformed weights
-        for (int i = 0, m = 0; i < 6; i++)
-        {
-          for (int j = 0; j < 6; j++, m++)
-          {
-            *(outptr + m*matrix_stride) = V[i][j];
-          }
-        }
-        outptr++;
-      }
-    }
-  }
-
-  template <>
-  template <>
-  int WinogradGEMM<2, 2, 5, 5>::WeightsTransform<float>::ops_performed(const KernelShape &shape)
-  {
-    return 0;  // TODO
-  }
-
-  template class WinogradGEMM<2, 2, 5, 5>::WeightsTransform<float>;
-}  // namespace winograd
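
Annotation: the F(2x2, 5x5) case deleted above follows the same U = G g G^T scheme with a 6x5 transform matrix. Collecting the per-row scale factors from the Ww assignments gives

\[
G = \begin{pmatrix}
\tfrac{1}{4} & 0 & 0 & 0 & 0 \\
-\tfrac{1}{6} & -\tfrac{1}{6} & -\tfrac{1}{6} & -\tfrac{1}{6} & -\tfrac{1}{6} \\
-\tfrac{1}{6} & \tfrac{1}{6} & -\tfrac{1}{6} & \tfrac{1}{6} & -\tfrac{1}{6} \\
\tfrac{1}{24} & \tfrac{1}{12} & \tfrac{1}{6} & \tfrac{1}{3} & \tfrac{2}{3} \\
\tfrac{1}{24} & -\tfrac{1}{12} & \tfrac{1}{6} & -\tfrac{1}{3} & \tfrac{2}{3} \\
0 & 0 & 0 & 0 & 1
\end{pmatrix}.
\]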
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/weights_4_5_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/weights_4_5_fp32.cpp
deleted file mode 100644
index 2f14e20..0000000
--- a/src/core/NEON/kernels/convolution/winograd/transforms/weights_4_5_fp32.cpp
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/kernel.hpp"
-
-namespace winograd
-{
-  template <>
-  template <>
-  void WinogradGEMM<1, 4, 1, 5>::WeightsTransform<float>::execute(
-    const int n_output_channels,
-    const int n_input_channels,
-    const float* const input,  // NOTE: Data in HWIO order
-    float* const output,
-    const int matrix_stride,
-    const int matrix_row_stride
-  )
-  {
-    // Get pointers to each cell of the weight tensor
-    const auto weight_col_stride = n_input_channels * n_output_channels;
-    const float *inptrs[kernel_cols];
-    for (int j = 0; j < kernel_cols; j++)
-    {
-      inptrs[j] = input + j*weight_col_stride;
-    }
-
-    // For each input channel
-    for (int ic = 0; ic < n_input_channels; ic++)
-    {
-      float *outptr = output + ic * matrix_row_stride;
-
-      // For each output channel
-      int channels_remaining = n_output_channels;
-      for (; channels_remaining; channels_remaining--)
-      {
-        // Matrices used and computed in this kernel
-        float w[kernel_cols], V[inner_tile_cols];
-
-        // Read weights
-        for (int j = 0; j < kernel_cols; j++)
-        {
-          w[j] = *(inptrs[j]++);
-        }
-
-        // Compute V = w WT
-        V[0] = (w[0]*-1) / 36;
-        V[1] = (w[1]*-1 + w[3]*-1 + w[0]*1 + w[2]*1 + w[4]*1) / 48;
-        V[2] = (w[0]*1 + w[1]*1 + w[2]*1 + w[3]*1 + w[4]*1) / 48;
-        V[3] = (w[0]*-1 + w[4]*-16 + w[2]*-4 + w[1]*2 + w[3]*8) / 120;
-        V[4] = (w[0]*-1 + w[4]*-16 + w[3]*-8 + w[2]*-4 + w[1]*-2) / 120;
-        V[5] = (w[3]*-27 + w[1]*-3 + w[2]*9 + w[4]*81 + w[0]*1) / 720;
-        V[6] = (w[1]*3 + w[2]*9 + w[3]*27 + w[4]*81 + w[0]*1) / 720;
-        V[7] = (w[4]*1) / 1;
-
-        // Store the transformed weights
-        for (int j = 0; j < inner_tile_cols; j++)
-        {
-          *(outptr + j*matrix_stride) = V[j];
-        }
-        outptr++;
-      }
-    }
-  }
-
-  template <>
-  template <>
-  int WinogradGEMM<1, 4, 1, 5>::WeightsTransform<float>::ops_performed(const KernelShape &shape)
-  {
-    (void) shape;
-    return 0;  // TODO
-  }
-
-  template <>
-  template <>
-  void WinogradGEMM<4, 1, 5, 1>::WeightsTransform<float>::execute(
-    const int n_output_channels,
-    const int n_input_channels,
-    const float* const input,  // NOTE: Data in HWIO order
-    float* const output,
-    const int matrix_stride,
-    const int matrix_row_stride
-  )
-  {
-    // Redirect to the 1xN implementation
-    WinogradGEMM<1, 4, 1, 5>::template WeightsTransform<float>::execute(
-      n_output_channels, n_input_channels, input, output, matrix_stride,
-      matrix_row_stride
-    );
-  }
-
-  template <>
-  template <>
-  int WinogradGEMM<4, 1, 5, 1>::WeightsTransform<float>::ops_performed(const KernelShape &shape)
-  {
-    (void) shape;
-    return 0;  // TODO
-  }
-
-  template struct WinogradGEMM<1, 4, 1, 5>::WeightsTransform<float>;
-  template struct WinogradGEMM<4, 1, 5, 1>::WeightsTransform<float>;
-}
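
Annotation: the eight outputs of the 1-D F(4, 5) transform deleted above are easier to audit once rewritten as polynomial evaluations. With w(x) = w_0 + w_1 x + w_2 x^2 + w_3 x^3 + w_4 x^4, each V[k] is a scaled sample of w at one interpolation point, and the final entry takes the leading coefficient (the point at infinity):

\[
\begin{aligned}
V_0 &= -\tfrac{1}{36}\,w(0), & V_1 &= \tfrac{1}{48}\,w(-1), & V_2 &= \tfrac{1}{48}\,w(1), & V_3 &= -\tfrac{1}{120}\,w(-2),\\
V_4 &= -\tfrac{1}{120}\,w(2), & V_5 &= \tfrac{1}{720}\,w(-3), & V_6 &= \tfrac{1}{720}\,w(3), & V_7 &= w_4.
\end{aligned}
\]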
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/weights_4x4_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/weights_4x4_3x3_fp32.cpp
deleted file mode 100644
index a56a475..0000000
--- a/src/core/NEON/kernels/convolution/winograd/transforms/weights_4x4_3x3_fp32.cpp
+++ /dev/null
@@ -1,266 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/kernel.hpp"
-
-namespace winograd
-{
-  /* Float implementation for kernel transform F(4x4, 3x3) */
-  template <>
-  template <>
-  void WinogradGEMM<4, 4, 3, 3>::WeightsTransform<float>::execute(
-    const int n_output_channels,
-    const int n_input_channels,
-    const float* const input,  // NOTE: Data in HWIO order
-    float* const output,
-    const int matrix_stride,
-    const int matrix_row_stride
-  )
-  {
-    // Get pointers to each cell of the weight tensor
-    const auto weight_col_stride = n_input_channels * n_output_channels;
-    const auto weight_row_stride = 3 * weight_col_stride;
-    const float *inptrs[3][3];
-    for (int i = 0; i < 3; i++)
-    {
-      for (int j = 0; j < 3; j++)
-      {
-        inptrs[i][j] = input + i*weight_row_stride + j*weight_col_stride;
-      }
-    }
-
-    // For each input channel
-    for (int ic = 0; ic < n_input_channels; ic++)
-    {
-      float *outptr = output + ic * matrix_row_stride;
-
-      // For each output channel
-      int channels_remaining = n_output_channels;
-#ifdef __aarch64__
-      for (; channels_remaining >= 4; channels_remaining -= 4)
-      {
-        // Matrices used and computed in this kernel
-        float32x4_t w[3][3], Ww[6][3], V[6][6];
-
-        // Read weights
-        for (int i = 0; i < 3; i++)
-        {
-          for (int j = 0; j < 3; j++)
-          {
-            w[i][j] = vld1q_f32(inptrs[i][j]);
-            inptrs[i][j] += 4;
-          }
-        }
-
-        // Compute the matrix W w
-        for (int j = 0; j < 3; j++)
-        {
-          // Ww[0][j] =  6*w[0][j];
-          Ww[0][j] = vmulq_n_f32(w[0][j], 6.0);
-
-          // Ww[1][j] = -4*w[0][j] + -4*w[1][j] + -4*w[2][j];
-          Ww[1][j] = vmulq_n_f32(vaddq_f32(vaddq_f32(w[0][j], w[1][j]), w[2][j]), -4.0);
-
-          // Ww[2][j] = -4*w[0][j] +  4*w[1][j] + -4*w[2][j];
-          Ww[2][j] = vmulq_n_f32(vsubq_f32(vsubq_f32(w[1][j], w[0][j]), w[2][j]), 4.0);
-
-          // Ww[3][j] =  1*w[0][j] +  2*w[1][j] +  4*w[2][j];
-          Ww[3][j] = vmlaq_n_f32(vmlaq_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f);
-
-          // Ww[4][j] =  1*w[0][j] + -2*w[1][j] +  4*w[2][j];
-          Ww[4][j] = vmlaq_n_f32(vmlsq_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f);
-
-          // Ww[5][j] = 24*w[2][j];
-          Ww[5][j] = vmulq_n_f32(w[2][j], 24.0f);
-        }
-
-        // Compute V = W w WT
-        for (int i = 0; i < 6; i++)
-        {
-          const float recip576 = 1.0f / 576.0f;
-
-          // V[i][0] =  6*Ww[i][0];
-          V[i][0] = vmulq_n_f32(vmulq_n_f32(Ww[i][0], 6.0), recip576);
-
-          // V[i][1] = -4*Ww[i][0] + -4*Ww[i][1] + -4*Ww[i][2];
-          V[i][1] = vmulq_n_f32(vmulq_n_f32(vaddq_f32(vaddq_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), -4.0), recip576);
-
-          // V[i][2] = -4*Ww[i][0] +  4*Ww[i][1] + -4*Ww[i][2];
-          V[i][2] = vmulq_n_f32(vmulq_n_f32(vsubq_f32(vsubq_f32(Ww[i][1], Ww[i][0]), Ww[i][2]), 4.0), recip576);
-
-          // V[i][3] =  1*Ww[i][0] +  2*Ww[i][1] +  4*Ww[i][2];
-          V[i][3] = vmulq_n_f32(vmlaq_n_f32(vmlaq_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576);
-
-          // V[i][4] =  1*Ww[i][0] + -2*Ww[i][1] +  4*Ww[i][2];
-          V[i][4] = vmulq_n_f32(vmlaq_n_f32(vmlsq_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576);
-
-          // V[i][5] = 24*Ww[i][2];
-          V[i][5] = vmulq_n_f32(vmulq_n_f32(Ww[i][2], 24.0f), recip576);
-        }
-
-        // Store the transformed weights
-        for (int i = 0, m = 0; i < 6; i++)
-        {
-          for (int j = 0; j < 6; j++, m++)
-          {
-            vst1q_f32(outptr + m*matrix_stride, V[i][j]);
-          }
-        }
-        outptr += 4;
-      }
-#endif  // __aarch64__
-#ifdef __arm_any__
-      for (; channels_remaining >= 2; channels_remaining -= 2)
-      {
-        // Matrices used and computed in this kernel
-        float32x2_t w[3][3], Ww[6][3], V[6][6];
-
-        // Read weights
-        for (int i = 0; i < 3; i++)
-        {
-          for (int j = 0; j < 3; j++)
-          {
-            w[i][j] = vld1_f32(inptrs[i][j]);
-            inptrs[i][j] += 2;
-          }
-        }
-
-        // Compute the matrix W w
-        for (int j = 0; j < 3; j++)
-        {
-          // Ww[0][j] =  6*w[0][j];
-          Ww[0][j] = vmul_n_f32(w[0][j], 6.0);
-
-          // Ww[1][j] = -4*w[0][j] + -4*w[1][j] + -4*w[2][j];
-          Ww[1][j] = vmul_n_f32(vadd_f32(vadd_f32(w[0][j], w[1][j]), w[2][j]), -4.0);
-
-          // Ww[2][j] = -4*w[0][j] +  4*w[1][j] + -4*w[2][j];
-          Ww[2][j] = vmul_n_f32(vsub_f32(vsub_f32(w[1][j], w[0][j]), w[2][j]), 4.0);
-
-          // Ww[3][j] =  1*w[0][j] +  2*w[1][j] +  4*w[2][j];
-          Ww[3][j] = vmla_n_f32(vmla_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f);
-
-          // Ww[4][j] =  1*w[0][j] + -2*w[1][j] +  4*w[2][j];
-          Ww[4][j] = vmla_n_f32(vmls_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f);
-
-          // Ww[5][j] = 24*w[2][j];
-          Ww[5][j] = vmul_n_f32(w[2][j], 24.0f);
-        }
-
-        // Compute V = W w WT
-        for (int i = 0; i < 6; i++)
-        {
-          const float recip576 = 1.0f / 576.0f;
-
-          // V[i][0] =  6*Ww[i][0];
-          V[i][0] = vmul_n_f32(vmul_n_f32(Ww[i][0], 6.0), recip576);
-
-          // V[i][1] = -4*Ww[i][0] + -4*Ww[i][1] + -4*Ww[i][2];
-          V[i][1] = vmul_n_f32(vmul_n_f32(vadd_f32(vadd_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), -4.0), recip576);
-
-          // V[i][2] = -4*Ww[i][0] +  4*Ww[i][1] + -4*Ww[i][2];
-          V[i][2] = vmul_n_f32(vmul_n_f32(vsub_f32(vsub_f32(Ww[i][1], Ww[i][0]), Ww[i][2]), 4.0), recip576);
-
-          // V[i][3] =  1*Ww[i][0] +  2*Ww[i][1] +  4*Ww[i][2];
-          V[i][3] = vmul_n_f32(vmla_n_f32(vmla_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576);
-
-          // V[i][4] =  1*Ww[i][0] + -2*Ww[i][1] +  4*Ww[i][2];
-          V[i][4] = vmul_n_f32(vmla_n_f32(vmls_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576);
-
-          // V[i][5] = 24*Ww[i][2];
-          V[i][5] = vmul_n_f32(vmul_n_f32(Ww[i][2], 24.0f), recip576);
-        }
-
-        // Store the transformed weights
-        for (int i = 0, m = 0; i < 6; i++)
-        {
-          for (int j = 0; j < 6; j++, m++)
-          {
-            vst1_f32(outptr + m*matrix_stride, V[i][j]);
-          }
-        }
-        outptr += 2;
-      }
-#endif  // __arm_any__
-      for (; channels_remaining; channels_remaining--)
-      {
-        // Matrices used and computed in this kernel
-        float w[3][3], Ww[6][3], V[6][6];
-
-        // Read weights
-        for (int i = 0; i < 3; i++)
-        {
-          for (int j = 0; j < 3; j++)
-          {
-            w[i][j] = *(inptrs[i][j]++);
-          }
-        }
-
-        // Compute the matrix W w
-        for (int j = 0; j < 3; j++)
-        {
-          Ww[0][j] =  6*w[0][j];
-          Ww[1][j] = -4*w[0][j] + -4*w[1][j] + -4*w[2][j];
-          Ww[2][j] = -4*w[0][j] +  4*w[1][j] + -4*w[2][j];
-          Ww[3][j] =  1*w[0][j] +  2*w[1][j] +  4*w[2][j];
-          Ww[4][j] =  1*w[0][j] + -2*w[1][j] +  4*w[2][j];
-          Ww[5][j] = 24*w[2][j];
-        }
-
-        // Compute V = W w WT
-        for (int i = 0; i < 6; i++)
-        {
-          V[i][0] = ( 6*Ww[i][0]) / 576.0;
-          V[i][1] = (-4*Ww[i][0] + -4*Ww[i][1] + -4*Ww[i][2]) / 576.0;
-          V[i][2] = (-4*Ww[i][0] +  4*Ww[i][1] + -4*Ww[i][2]) / 576.0;
-          V[i][3] = ( 1*Ww[i][0] +  2*Ww[i][1] +  4*Ww[i][2]) / 576.0;
-          V[i][4] = ( 1*Ww[i][0] + -2*Ww[i][1] +  4*Ww[i][2]) / 576.0;
-          V[i][5] = (24*Ww[i][2]) / 576.0;
-        }
-
-        // Store the transformed weights
-        for (int i = 0, m = 0; i < 6; i++)
-        {
-          for (int j = 0; j < 6; j++, m++)
-          {
-            *(outptr + m*matrix_stride) = V[i][j];
-          }
-        }
-        outptr++;
-      }
-    }
-  }
-
-  template <>
-  template <>
-  int WinogradGEMM<4, 4, 3, 3>::WeightsTransform<float>::ops_performed(const KernelShape &shape)
-  {
-    const int channel_prod = shape.n_input_channels * shape.n_output_channels;
-    return 9 * 16 * channel_prod;
-  }
-
-  template struct WinogradGEMM<4, 4, 3, 3>::WeightsTransform<float>;
-}
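
Annotation: in the F(4x4, 3x3) case deleted above, the kernel keeps the transform coefficients in small integers and applies a single 1/576 scale at the end, i.e. it computes

\[
U = \tfrac{1}{576}\,G\,g\,G^{\mathsf{T}}, \qquad
G = \begin{pmatrix} 6 & 0 & 0 \\ -4 & -4 & -4 \\ -4 & 4 & -4 \\ 1 & 2 & 4 \\ 1 & -2 & 4 \\ 0 & 0 & 24 \end{pmatrix}.
\]

Since 576 = 24^2 and G is 24 times the conventional fractional transform matrix, one deferred division recovers the usual normalisation while each side of the product stays in integer multiples — which is why the NEON paths fold recip576 into every output row instead of scaling twice.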
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/weights_6_3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/weights_6_3_fp32.cpp
deleted file mode 100644
index c560aa8..0000000
--- a/src/core/NEON/kernels/convolution/winograd/transforms/weights_6_3_fp32.cpp
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/kernel.hpp"
-
-
-namespace winograd
-{
-  template <>
-  template <>
-  void WinogradGEMM<1, 6, 1, 3>::WeightsTransform<float>::execute(
-    const int n_output_channels,
-    const int n_input_channels,
-    const float* const input,  // NOTE: Data in HWIO order
-    float* const output,
-    const int matrix_stride,
-    const int matrix_row_stride
-  )
-  {
-    // Get pointers to each cell of the weight tensor
-    const auto weight_col_stride = n_input_channels * n_output_channels;
-    const float *inptrs[3];
-    for (int j = 0; j < 3; j++)
-    {
-      inptrs[j] = input + j*weight_col_stride;
-    }
-
-    // For each input channel
-    for (int ic = 0; ic < n_input_channels; ic++)
-    {
-      float *outptr = output + ic * matrix_row_stride;
-
-      // For each output channel
-      int channels_remaining = n_output_channels;
-      for (; channels_remaining; channels_remaining--)
-      {
-        // Matrices used and computed in this kernel
-        float w[3], V[inner_tile_cols];
-
-        // Read weights
-        for (int j = 0; j < 3; j++)
-        {
-          w[j] = *(inptrs[j]++);
-        }
-
-        // Compute V = w WT
-        V[0] = (w[0]*-1) / 36.0f;
-        V[1] = (w[1]*-1 + w[0]*1 + w[2]*1) / 48.0f;
-        V[2] = (w[0]*1 + w[1]*1 + w[2]*1) / 48.0f;
-        V[3] = (w[0]*-1 + w[2]*-4 + w[1]*2) / 120.0f;
-        V[4] = (w[0]*-1 + w[2]*-4 + w[1]*-2) / 120.0f;
-        V[5] = (w[1]*-3 + w[2]*9 + w[0]*1) / 720.0f;
-        V[6] = (w[1]*3 + w[2]*9 + w[0]*1) / 720.0f;
-        V[7] = (w[2]*1) / 1;
-
-        // Store the transformed weights
-        for (int j = 0; j < inner_tile_cols; j++)
-        {
-          *(outptr + j*matrix_stride) = V[j];
-        }
-        outptr++;
-      }
-    }
-  }
-
-  template <>
-  template <>
-  int WinogradGEMM<1, 6, 1, 3>::WeightsTransform<float>::ops_performed(const KernelShape &shape)
-  {
-    (void) shape;
-    return 0;  // TODO
-  }
-
-  template <>
-  template <>
-  void WinogradGEMM<6, 1, 3, 1>::WeightsTransform<float>::execute(
-    const int n_output_channels,
-    const int n_input_channels,
-    const float* const input,  // NOTE: Data in HWIO order
-    float* const output,
-    const int matrix_stride,
-    const int matrix_row_stride
-  )
-  {
-    // Redirect to the 1xN implementation
-    WinogradGEMM<1, 6, 1, 3>::template WeightsTransform<float>::execute(
-      n_output_channels, n_input_channels, input, output, matrix_stride,
-      matrix_row_stride
-    );
-  }
-
-  template <>
-  template <>
-  int WinogradGEMM<6, 1, 3, 1>::WeightsTransform<float>::ops_performed(const KernelShape &shape)
-  {
-    (void) shape;
-    return 0;  // TODO
-  }
-
-  template struct WinogradGEMM<1, 6, 1, 3>::WeightsTransform<float>;
-  template struct WinogradGEMM<6, 1, 3, 1>::WeightsTransform<float>;
-}
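
Annotation: the F(6, 3) outputs deleted above use the same eight evaluation points as F(4, 5) earlier — both produce an 8-wide inner tile. A compact scalar sketch making that structure explicit (illustrative code, not the library's API):

#include <array>

// Sketch: the eight V outputs are samples of w(x) = w0 + w1*x + w2*x^2 at
// the points {0, -1, 1, -2, 2, -3, 3}, each with its own scale, plus the
// leading coefficient for the point at infinity.
std::array<float, 8> transform_weights_1x3(const float w[3])
{
  constexpr float points[7] = { 0.0f, -1.0f, 1.0f, -2.0f, 2.0f, -3.0f, 3.0f };
  constexpr float scales[7] = {
    -1.0f/36, 1.0f/48, 1.0f/48, -1.0f/120, -1.0f/120, 1.0f/720, 1.0f/720
  };
  std::array<float, 8> V{};
  for (int k = 0; k < 7; k++)
  {
    const float x = points[k];
    V[k] = scales[k] * (w[0] + w[1]*x + w[2]*x*x);
  }
  V[7] = w[2];
  return V;
}

Evaluating, e.g., V[3] = -1/120 * (w0 - 2*w1 + 4*w2) reproduces the deleted line V[3] = (w[0]*-1 + w[2]*-4 + w[1]*2) / 120.0f.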
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_gemm.cpp b/src/core/NEON/kernels/convolution/winograd/winograd.cpp
similarity index 64%
rename from src/core/NEON/kernels/convolution/winograd/winograd_gemm.cpp
rename to src/core/NEON/kernels/convolution/winograd/winograd.cpp
index a7de2fd..226f303 100644
--- a/src/core/NEON/kernels/convolution/winograd/winograd_gemm.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,14 +22,13 @@
  * SOFTWARE.
  */
 #include <cstring>
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/batched_blocked_gemm.hpp"
+#include "winograd.hpp"
 using namespace winograd;
 
 /** Get the output shape of a convolution. */
-template <int kr, int kc, int itr, int itc>
-template <typename TOut, typename TIn>
-Tensor4DShape WinogradGEMM<kr, kc, itr, itc>::Convolution<TOut, TIn>::get_output_shape(
+template <int kr, int kc, int itr, int itc, WinogradRoots R>
+template <typename TOut, typename TIn, typename TInGEMM, typename TOutGEMM>
+Tensor4DShape WinogradGEMM<kr, kc, itr, itc, R>::Convolution<TOut, TIn, TInGEMM, TOutGEMM>::get_output_shape(
   const KernelShape &kernel_shape,
   const Tensor4DShape &in_shape,
   const PaddingType padding
@@ -47,9 +46,9 @@
 /* Get the memory required to transform the kernel.
  */
 template <int kernel_rows, int kernel_cols,
-          int output_tile_rows, int output_tile_cols>
-template <typename TOut, typename TIn>
-size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols>::Convolution<TOut, TIn>::get_kernel_transform_working_size(const KernelShape &shape)
+          int output_tile_rows, int output_tile_cols, WinogradRoots roots>
+template <typename TOut, typename TIn, typename TGIn, typename TGOut>
+size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols, roots>::Convolution<TOut, TIn, TGIn, TGOut>::get_kernel_transform_working_size(const KernelShape &shape)
 {
   if (shape.ordering == HWIO)
   {
@@ -68,17 +67,17 @@
 /** Get the memory required to store the kernel transformed into the
  * Winograd domain.
  */
-template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols>
-template <typename TOut, typename TIn>
-size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols>::Convolution<TOut, TIn>::get_kernel_storage_size(const KernelShape &shape)
+template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols, WinogradRoots roots>
+template <typename TOut, typename TIn, typename TGIn, typename TGOut>
+size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols, roots>::Convolution<TOut, TIn, TGIn, TGOut>::get_kernel_storage_size(const KernelShape &shape)
 {
   return N_GEMMS * get_kernel_matrix_size(shape);
 }
 
 
-template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols>
-template <typename TOut, typename TIn>
-size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols>::Convolution<TOut, TIn>::get_input_storage_size(
+template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols, WinogradRoots roots>
+template <typename TOut, typename TIn, typename TGIn, typename TGOut>
+size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols, roots>::Convolution<TOut, TIn, TGIn, TGOut>::get_input_storage_size(
   const KernelShape &kernel_shape,
   const Tensor4DShape &input_shape,
   const PaddingType padding
@@ -88,9 +87,9 @@
 }
 
 
-template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols>
-template <typename TOut, typename TIn>
-size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols>::Convolution<TOut, TIn>::get_output_storage_size(
+template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols, WinogradRoots roots>
+template <typename TOut, typename TIn, typename TGIn, typename TGOut>
+size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols, roots>::Convolution<TOut, TIn, TGIn, TGOut>::get_output_storage_size(
   const KernelShape &kernel_shape,
   const Tensor4DShape &input_shape,
   const PaddingType padding
@@ -102,9 +101,9 @@
 
 /** Get the memory required to apply a Winograd operator to some input.
  */
-template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols>
-template <typename TOut, typename TIn>
-size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols>::Convolution<TOut, TIn>::get_working_space_size(
+template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols, WinogradRoots roots>
+template <typename TOut, typename TIn, typename TGIn, typename TGOut>
+size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols, roots>::Convolution<TOut, TIn, TGIn, TGOut>::get_working_space_size(
   const KernelShape &kernel_shape,
   const Tensor4DShape &input_shape,
   const PaddingType padding_type
@@ -139,20 +138,20 @@
 
 /* Get the memory required by a single "input" matrix.
  */
-template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols>
-template <typename TOut, typename TIn>
-size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols>::Convolution<TOut, TIn>::get_input_matrix_size(
+template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols, WinogradRoots roots>
+template <typename TOut, typename TIn, typename TGIn, typename TGOut>
+size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols, roots>::Convolution<TOut, TIn, TGIn, TGOut>::get_input_matrix_size(
   const KernelShape &kernel_shape,
   const Tensor4DShape &input_shape,
   const PaddingType padding_type
 )
 {
-  return get_input_matrix_stride(kernel_shape, input_shape, padding_type) * sizeof(TIn);
+  return get_input_matrix_stride(kernel_shape, input_shape, padding_type) * sizeof(TGIn);
 }
 
-template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols>
-template <typename TOut, typename TIn>
-int WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols>::Convolution<TOut, TIn>::get_input_matrix_stride(
+template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols, WinogradRoots roots>
+template <typename TOut, typename TIn, typename TGIn, typename TGOut>
+int WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols, roots>::Convolution<TOut, TIn, TGIn, TGOut>::get_input_matrix_stride(
   const KernelShape &kernel_shape,
   const Tensor4DShape &input_shape,
   const PaddingType padding_type
@@ -171,21 +170,21 @@
 
 /* Get the memory required by a single "output" matrix.
  */
-template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols>
-template <typename TOut, typename TIn>
-size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols>::Convolution<TOut, TIn>::get_output_matrix_size(
+template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols, WinogradRoots roots>
+template <typename TOut, typename TIn, typename TGIn, typename TGOut>
+size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols, roots>::Convolution<TOut, TIn, TGIn, TGOut>::get_output_matrix_size(
     const KernelShape &kernel_shape,
     const Tensor4DShape &input_shape,
     const PaddingType padding_type
 )
 {
-  return get_output_matrix_stride(kernel_shape, input_shape, padding_type) * sizeof(TOut);
+  return get_output_matrix_stride(kernel_shape, input_shape, padding_type) * sizeof(TGOut);
 }
 
 
-template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols>
-template <typename TOut, typename TIn>
-int WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols>::Convolution<TOut, TIn>::get_output_matrix_stride(
+template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols, WinogradRoots roots>
+template <typename TOut, typename TIn, typename TGIn, typename TGOut>
+int WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols, roots>::Convolution<TOut, TIn, TGIn, TGOut>::get_output_matrix_stride(
     const KernelShape &kernel_shape,
     const Tensor4DShape &input_shape,
     const PaddingType padding_type
@@ -204,16 +203,16 @@
 
 /* Get the memory required by a single "kernel" matrix.
  */
-template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols>
-template <typename TOut, typename TIn>
-size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols>::Convolution<TOut, TIn>::get_kernel_matrix_size(const KernelShape &shape)
+template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols, WinogradRoots roots>
+template <typename TOut, typename TIn, typename TGIn, typename TGOut>
+size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols, roots>::Convolution<TOut, TIn, TGIn, TGOut>::get_kernel_matrix_size(const KernelShape &shape)
 {
-  return sizeof(TIn) * get_kernel_matrix_stride(shape);
+  return sizeof(TGIn) * get_kernel_matrix_stride(shape);
 }
 
-template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols>
-template <typename TOut, typename TIn>
-int WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols>::Convolution<TOut, TIn>::get_kernel_matrix_stride(const KernelShape &shape)
+template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols, WinogradRoots roots>
+template <typename TOut, typename TIn, typename TGIn, typename TGOut>
+int WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols, roots>::Convolution<TOut, TIn, TGIn, TGOut>::get_kernel_matrix_stride(const KernelShape &shape)
 {
   const int K = shape.n_input_channels;
   const int N = roundup(shape.n_output_channels, N_BLOCK);
@@ -222,19 +221,16 @@
 
 
 // Instantiate required implementations
-template class WinogradGEMM<2, 2, 3, 3>::Convolution<float, float>;
-template class WinogradGEMM<4, 4, 3, 3>::Convolution<float, float>;
+template class WinogradGEMM<2, 2, 3, 3, WinogradRoots::Integers>::Convolution<float, float, float, float>;
+template class WinogradGEMM<4, 4, 3, 3, WinogradRoots::Integers>::Convolution<float, float, float, float>;
 
-template class WinogradGEMM<1, 6, 1, 3>::Convolution<float, float>;
-template class WinogradGEMM<6, 1, 3, 1>::Convolution<float, float>;
+template class WinogradGEMM<1, 6, 1, 3, WinogradRoots::Integers>::Convolution<float, float, float, float>;
+template class WinogradGEMM<6, 1, 3, 1, WinogradRoots::Integers>::Convolution<float, float, float, float>;
 
-template class WinogradGEMM<2, 2, 5, 5>::Convolution<float, float>;
+template class WinogradGEMM<2, 2, 5, 5, WinogradRoots::Integers>::Convolution<float, float, float, float>;
 
-template class WinogradGEMM<1, 4, 1, 5>::Convolution<float, float>;
-template class WinogradGEMM<4, 1, 5, 1>::Convolution<float, float>;
+template class WinogradGEMM<1, 4, 1, 5, WinogradRoots::Integers>::Convolution<float, float, float, float>;
+template class WinogradGEMM<4, 1, 5, 1, WinogradRoots::Integers>::Convolution<float, float, float, float>;
 
-template class WinogradGEMM<1, 2, 1, 7>::Convolution<float, float>;
-template class WinogradGEMM<2, 1, 7, 1>::Convolution<float, float>;
-
-
-
+template class WinogradGEMM<1, 2, 1, 7, WinogradRoots::Integers>::Convolution<float, float, float, float>;
+template class WinogradGEMM<2, 1, 7, 1, WinogradRoots::Integers>::Convolution<float, float, float, float>;
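
Annotation: the net effect of the signature changes in this file is that WinogradGEMM gains a WinogradRoots non-type parameter naming the transform family, and Convolution gains separate GEMM-domain element types (TGIn/TGOut), so the Winograd-domain matrix sizes are computed from those rather than from the tensor types. A sketch of the resulting shape of the declarations (paraphrased for illustration, not copied from winograd.hpp):

enum class WinogradRoots { Integers };  // as used by the instantiations above

template <int KernelRows, int KernelCols,
          int OutputTileRows, int OutputTileCols,
          WinogradRoots Roots>
class WinogradGEMM
{
public:
  template <typename TOut, typename TIn, typename TGIn, typename TGOut>
  class Convolution
  {
    // input/kernel matrix sizes now use sizeof(TGIn); outputs sizeof(TGOut)
  };
};

// e.g. the float-only instantiation pattern used by this file:
// template class WinogradGEMM<4, 4, 3, 3, WinogradRoots::Integers>
//                  ::Convolution<float, float, float, float>;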
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input.hpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input.hpp
new file mode 100644
index 0000000..fcbd21f
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input.hpp
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "winograd.hpp"
+#include "padding.hpp"
+
+#define MEMBERFN(RTYPE) template <\
+  int InnerTileRows, int InnerTileCols,\
+  typename TIn, typename TOut, WinogradRoots Roots\
+> RTYPE InputTransform<InnerTileRows, InnerTileCols, TIn, TOut, Roots>
+
+
+#define Nx1MEMBERFN(RTYPE) template <\
+  int InnerTileRows, typename TIn, typename TOut, WinogradRoots Roots\
+> RTYPE InputTransform<InnerTileRows, 1, TIn, TOut, Roots>
+
+namespace winograd
+{
+
+MEMBERFN()::InputTransform(
+  const int kernel_rows,
+  const int kernel_cols,
+  const int n_batches,
+  const int n_rows,
+  const int n_cols,
+  const int n_channels,
+  const int padding_top,
+  const int padding_left,
+  const int padding_bottom,
+  const int padding_right
+) : _n_batches(n_batches), _n_rows(n_rows), _n_cols(n_cols), _n_channels(n_channels),
+    _inptr(nullptr), _outptr(nullptr),
+    _overlap_rows(kernel_rows - 1), _overlap_cols(kernel_cols - 1),
+    _padding_top(padding_top), _padding_left(padding_left), _padding_bottom(padding_bottom), _padding_right(padding_right),
+    _tiles_M(iceildiv(padding_top + n_rows + padding_bottom - kernel_rows + 1, InnerTileRows - kernel_rows + 1)),
+    _tiles_N(iceildiv(padding_left + n_cols + padding_right - kernel_cols + 1, InnerTileCols - kernel_cols + 1)),
+    _matrix_stride(0), _matrix_row_stride(0), _matrix_batch_stride(0),
+    _in_col_stride(0), _in_row_stride(0), _in_batch_stride(0),
+    _working_space_col_stride(n_channels),
+    _working_space_row_stride(InnerTileCols * _working_space_col_stride),
+    _working_space(nullptr)
+{
+}
+
+MEMBERFN(void)::set_input_tensor(const void* const inptr)
+{
+  set_input_tensor(inptr, _n_channels);
+}
+
+MEMBERFN(void)::set_input_tensor(const void* const inptr, const int ldcol)
+{
+  set_input_tensor(inptr, _n_cols * ldcol, ldcol);
+}
+
+MEMBERFN(void)::set_input_tensor(const void* const inptr, const int ldrow, const int ldcol)
+{
+  set_input_tensor(inptr, _n_rows * ldrow, ldrow, ldcol);
+}
+
+MEMBERFN(void)::set_input_tensor(const void* const inptr, const int ldbatch, const int ldrow, const int ldcol)
+{
+  _inptr = static_cast<const TIn *>(inptr);
+  _in_batch_stride = ldbatch;
+  _in_row_stride = ldrow;
+  _in_col_stride = ldcol;
+}
+
+MEMBERFN(void)::set_output_matrices(void * const mptr, const int ldmatrix, const int ldrow)
+{
+  _outptr = static_cast<TOut *>(mptr);
+  _matrix_stride = ldmatrix;
+  _matrix_row_stride = ldrow;
+  _matrix_batch_stride = _tiles_M * _tiles_N * ldrow;
+}
+
+Nx1MEMBERFN()::InputTransform(
+  const int kernel_rows,
+  const int kernel_cols,
+  const int n_batches,
+  const int n_rows,
+  const int n_cols,
+  const int n_channels,
+  const int padding_top,
+  const int padding_left,
+  const int padding_bottom,
+  const int padding_right
+) : InputTransform<1, InnerTileRows, TIn, TOut, Roots>::InputTransform(
+    /* Transpose rows and columns */
+    kernel_cols, kernel_rows, n_batches, n_cols, n_rows, n_channels,
+    padding_left, padding_top, padding_right, padding_bottom
+  )
+{
+}
+
+Nx1MEMBERFN(void)::set_input_tensor(const void* const inptr)
+{
+  set_input_tensor(inptr, this->_n_channels);
+}
+
+Nx1MEMBERFN(void)::set_input_tensor(const void* const inptr, const int ldcol)
+{
+  set_input_tensor(inptr, this->_n_cols * ldcol, ldcol);
+}
+
+Nx1MEMBERFN(void)::set_input_tensor(const void* const inptr, const int ldrow, const int ldcol)
+{
+  set_input_tensor(inptr, this->_n_rows * ldrow, ldrow, ldcol);
+}
+
+Nx1MEMBERFN(void)::set_input_tensor(const void* const inptr, const int ldbatch, const int ldrow, const int ldcol)
+{
+  // Transpose row and column strides
+  Base::set_input_tensor(inptr, ldbatch, ldcol, ldrow);
+}
+
+MEMBERFN(size_t)::get_working_space_size(const unsigned int nthreads) const
+{
+  return sizeof(TIn) * InnerTileRows * _working_space_row_stride * nthreads;
+}
+
+MEMBERFN(void)::set_working_space(void * const buffer)
+{
+  _working_space = static_cast<TIn *>(buffer);
+}
+
+MEMBERFN(unsigned int)::get_window(void) const
+{
+  return iceildiv(_n_channels, WINDOW_BLOCK);
+}
+
+MEMBERFN(void)::run(
+  const unsigned int start,
+  const unsigned int stop,
+  const unsigned int threadid
+)
+{
+  // Determine the channels on which to work
+  if (start >= get_window())
+  {
+    return;  // No work to do beyond the end of the window
+  }
+  const unsigned int start_channel = start * WINDOW_BLOCK;
+  const unsigned int stop_channel = std::min<unsigned int>(_n_channels, stop * WINDOW_BLOCK);
+  const unsigned int n_channels = stop_channel - start_channel;
+
+  // Loop over batches
+  for (int batch = 0; batch < _n_batches; batch++)
+  {
+    const TIn* const inptr_batch = _inptr + start_channel + batch*_in_batch_stride;
+    TOut* const outptr_batch = _outptr + start_channel + batch*_matrix_batch_stride;
+
+    // Loop over rows of tiles
+    for (int tile_i = 0; tile_i < _tiles_M; tile_i++)
+    {
+      // Compute the starting and ending row of pixels within the row of tiles,
+      // hence compute the padding to apply to the top and bottom of each tile.
+      const int row_top = tile_i * (InnerTileRows - _overlap_rows) - _padding_top;
+      const int row_bottom = row_top + InnerTileRows;
+      const int row_pad_top = std::max(0, _padding_top - tile_i * (InnerTileRows - _overlap_rows));
+      const int row_pad_bottom = std::max(0, row_bottom - _n_rows);
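+      // For illustration (values assumed, not from this patch): with
+      // InnerTileRows = 6, _overlap_rows = 2, _padding_top = 1 and _n_rows = 10,
+      // tile_i = 0 covers input rows [-1, 5) -> row_pad_top = 1, row_pad_bottom = 0,
+      // while tile_i = 2 covers rows [7, 13) -> row_pad_top = 0, row_pad_bottom = 3.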
+
+      // Get a pointer to the start of the row.
+      const int row_offset = std::min(0, row_pad_top - _padding_top);
+      const TIn* const inptr_row = inptr_batch + _in_row_stride*(row_offset + tile_i*(InnerTileRows - _overlap_rows));
+      TOut* const outptr_row = outptr_batch + tile_i*_tiles_N*_matrix_row_stride;
+
+      // Loop over tiles within the row
+      for (int tile_j = 0; tile_j < _tiles_N; tile_j++)
+      {
+        // Compute the starting and ending column of pixels within the tile,
+        // hence compute the padding to apply to the left and right of the
+        // tile.
+        const int tile_left = tile_j * (InnerTileCols - _overlap_cols) - _padding_left;
+        const int tile_right = tile_left + InnerTileCols;
+        const int tile_pad_left = std::max(0, _padding_left - tile_j * (InnerTileCols - _overlap_cols));
+        const int tile_pad_right = std::max(0, tile_right - _n_cols);
+
+        // Get a pointer to the start of the tile.
+        const int col_offset = std::min(0, tile_pad_left - _padding_left);
+        const TIn* const inptr_tile = inptr_row + _in_col_stride*(col_offset + tile_j*(InnerTileCols - _overlap_cols));
+        TOut* const outptr_tile = outptr_row + tile_j * _matrix_row_stride;
+
+        // Transform the tile, applying padding if necessary.
+        if (row_pad_top || tile_pad_left || row_pad_bottom || tile_pad_right)
+        {
+          transform_padded_tile(
+            threadid, n_channels, outptr_tile, inptr_tile,
+            row_pad_top, tile_pad_left, row_pad_bottom, tile_pad_right
+          );
+        }
+        else
+        {
+          transform_unpadded_tile(threadid, n_channels, outptr_tile, inptr_tile);
+        }
+      }
+    }
+  }
+}
+
+MEMBERFN(void)::transform_unpadded_tile(
+  const unsigned int /* threadid unused */,
+  const int n_channels,
+  TOut * const outptr,
+  const TIn * const inptr
+)
+{
+  transform_tile(
+    n_channels, inptr, _in_row_stride, _in_col_stride, outptr, _matrix_stride
+  );
+}
+
+MEMBERFN(void)::transform_padded_tile(
+  const unsigned int threadid,
+  const int n_channels,
+  TOut * const outptr,
+  const TIn * const inptr,
+  const int padding_top,
+  const int padding_left,
+  const int padding_bottom,
+  const int padding_right
+)
+{
+  padding::copy_and_pad_tile(
+    InnerTileRows, InnerTileCols, n_channels,
+    inptr, _in_row_stride, _in_col_stride,
+    static_cast<TIn *>(get_working_space(threadid)), _working_space_row_stride, _working_space_col_stride,
+    padding_top, padding_left, padding_bottom, padding_right
+  );
+
+  transform_tile(
+    n_channels, static_cast<const TIn *>(get_working_space(threadid)),
+    _working_space_row_stride, _working_space_col_stride,
+    outptr, _matrix_stride
+  );
+}
+
+MEMBERFN(void *)::get_working_space(const unsigned int threadid) const
+{
+  return _working_space + InnerTileRows * _working_space_row_stride * threadid;
+}
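+
+// A rough usage sketch of the interface defined above (tensor sizes, strides
+// and variable names here are assumptions for illustration, not part of the
+// patch):
+//
+//   InputTransform<4, 4, float, float, WinogradRoots::Integers> transform(
+//       3, 3,           // kernel rows/cols
+//       1, 32, 32, 16,  // batches, rows, cols, channels
+//       1, 1, 1, 1      // top/left/bottom/right padding
+//   );
+//   std::vector<float> scratch(transform.get_working_space_size(1) / sizeof(float));
+//   transform.set_working_space(scratch.data());
+//   transform.set_input_tensor(input);  // NHWC, tightly packed
+//   transform.set_output_matrices(matrices, matrix_stride, matrix_row_stride);
+//   transform.run(0, transform.get_window(), 0);  // whole window on thread 0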
+
+}  // namespace winograd
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_1x8_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_1x8_fp32_fp32_integers.cpp
new file mode 100644
index 0000000..5040ec1
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_1x8_fp32_fp32_integers.cpp
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm.hpp"
+#include "input.hpp"
+
+namespace winograd
+{
+
+template <>
+void InputTransform<1, 8, float, float, WinogradRoots::Integers>::transform_tile(
+  const int n_channels,
+  const float* const input_base,
+  const int,  // We don't need to stride over rows
+  const int input_col_stride,
+  float* outptr,
+  const int matrix_stride
+)
+{
+  constexpr int inner_tile_cols = 8;
+
+  // Get pointers into the input tile
+  const float *x_ptrs[inner_tile_cols];
+  for (int j = 0, xj = 0; j < inner_tile_cols; j++, xj++)
+  {
+    x_ptrs[j] = input_base + xj*input_col_stride;
+  }
+
+  // Vectors used/computed in this kernel.
+  float x[inner_tile_cols];
+  float U[inner_tile_cols];
+
+  for (int j = 0; j < inner_tile_cols; j++)
+  {
+    x[j] = 0.0f;
+  }
+
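+  // The per-channel computation below is U = B^T x, where the rows of B^T
+  // are the coefficient vectors spelt out in the U[0..7] expressions:
+  //
+  //        [ -36    0   49    0  -14    0    1    0 ]
+  //        [   0  -36   36   13  -13   -1    1    0 ]
+  //        [   0   36   36  -13  -13    1    1    0 ]
+  //  B^T = [   0  -18    9   20  -10   -2    1    0 ]
+  //        [   0   18    9  -20  -10    2    1    0 ]
+  //        [   0  -12    4   15   -5   -3    1    0 ]
+  //        [   0   12    4  -15   -5    3    1    0 ]
+  //        [   0  -36    0   49    0  -14    0    1 ]
+  //
+  // consistent with the integer interpolation points 0, +/-1, +/-2, +/-3
+  // (WinogradRoots::Integers): for instance, the first row holds the
+  // coefficients of (x^2 - 1)(x^2 - 4)(x^2 - 9) = x^6 - 14x^4 + 49x^2 - 36.
+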
+  // Perform the Winograd input transformation for each channel in the input
+  // tensor.
+  int channels_remaining = n_channels;
+#ifdef __arm_any__
+  for (; channels_remaining >= 4; channels_remaining -= 4)
+  {
+    float32x4_t x[inner_tile_cols], U[inner_tile_cols];
+    for (int j = 0; j < inner_tile_cols; j++)
+    {
+      x[j] = vdupq_n_f32(0.0f);
+    }
+
+    // Load x
+    for (int j = 0; j < inner_tile_cols; j++)
+    {
+      x[j] = vld1q_f32(x_ptrs[j]);
+      x_ptrs[j] += 4;
+    }
+
+    // Compute U = x . X
+    U[0] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[2], 49), x[4], -14), x[0], -36);
+    U[1] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[2], 36), x[3], 13), x[4], -13), x[1], -36), x[5], -1);
+    U[2] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[5], 1), x[2], 36), x[1], 36), x[4], -13), x[3], -13);
+    U[3] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[3], 20), x[2], 9), x[5], -2), x[4], -10), x[1], -18);
+    U[4] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[1], 18), x[2], 9), x[5], 2), x[4], -10), x[3], -20);
+    U[5] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[3], 15), x[2], 4), x[5], -3), x[4], -5), x[1], -12);
+    U[6] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[1], 12), x[2], 4), x[5], 3), x[4], -5), x[3], -15);
+    U[7] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[7], 1), x[3], 49), x[5], -14), x[1], -36);
+
+    // Store the transformed vector
+    for (int j = 0; j < inner_tile_cols; j++)
+    {
+      vst1q_f32(outptr + j*matrix_stride, U[j]);
+    }
+    outptr += 4;
+  }
+  for (; channels_remaining >= 2; channels_remaining -= 2)
+  {
+    float32x2_t x[inner_tile_cols], U[inner_tile_cols];
+    for (int j = 0; j < inner_tile_cols; j++)
+    {
+      x[j] = vdup_n_f32(0.0f);
+    }
+
+    // Load x
+    for (int j = 0; j < inner_tile_cols; j++)
+    {
+      x[j] = vld1_f32(x_ptrs[j]);
+      x_ptrs[j] += 2;
+    }
+
+    // Compute U = x . X
+    U[0] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[2], 49), x[4], -14), x[0], -36);
+    U[1] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[2], 36), x[3], 13), x[4], -13), x[1], -36), x[5], -1);
+    U[2] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[5], 1), x[2], 36), x[1], 36), x[4], -13), x[3], -13);
+    U[3] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[3], 20), x[2], 9), x[5], -2), x[4], -10), x[1], -18);
+    U[4] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[1], 18), x[2], 9), x[5], 2), x[4], -10), x[3], -20);
+    U[5] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[3], 15), x[2], 4), x[5], -3), x[4], -5), x[1], -12);
+    U[6] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[1], 12), x[2], 4), x[5], 3), x[4], -5), x[3], -15);
+    U[7] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[7], 1), x[3], 49), x[5], -14), x[1], -36);
+
+    // Store the transformed vector
+    for (int j = 0; j < inner_tile_cols; j++)
+    {
+      vst1_f32(outptr + j*matrix_stride, U[j]);
+    }
+    outptr += 2;
+  }
+#endif  // __arm_any__
+  for (; channels_remaining; channels_remaining--)
+  {
+    // Load x
+    for (int j = 0; j < inner_tile_cols; j++)
+    {
+      x[j] = *(x_ptrs[j]++);
+    }
+
+    // Compute U = x . X
+    U[0] = x[0]*-36 + x[4]*-14 + x[2]*49 + x[6]*1;
+    U[1] = x[5]*-1 + x[1]*-36 + x[4]*-13 + x[3]*13 + x[2]*36 + x[6]*1;
+    U[2] = x[3]*-13 + x[4]*-13 + x[1]*36 + x[2]*36 + x[5]*1 + x[6]*1;
+    U[3] = x[1]*-18 + x[4]*-10 + x[5]*-2 + x[2]*9 + x[3]*20 + x[6]*1;
+    U[4] = x[3]*-20 + x[4]*-10 + x[5]*2 + x[2]*9 + x[1]*18 + x[6]*1;
+    U[5] = x[1]*-12 + x[4]*-5 + x[5]*-3 + x[2]*4 + x[3]*15 + x[6]*1;
+    U[6] = x[3]*-15 + x[4]*-5 + x[5]*3 + x[2]*4 + x[1]*12 + x[6]*1;
+    U[7] = x[1]*-36 + x[5]*-14 + x[3]*49 + x[7]*1;
+
+    // Store the transformed vector
+    for (int j = 0; j < inner_tile_cols; j++)
+    {
+      *(outptr + j*matrix_stride) = U[j];
+    }
+    outptr++;
+  }
+}
+
+template class InputTransform<1, 8, float, float, WinogradRoots::Integers>;
+template class InputTransform<8, 1, float, float, WinogradRoots::Integers>;
+
+}  // namespace winograd
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp32_fp32_integers.cpp
new file mode 100644
index 0000000..9393785
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp32_fp32_integers.cpp
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "input.hpp"
+#include "arm.hpp"
+
+namespace winograd
+{
+
+template <>
+void InputTransform<4, 4, float, float, WinogradRoots::Integers>::transform_tile(
+  const int n_channels,
+  const float* const input_base,
+  const int input_row_stride,
+  const int input_col_stride,
+  float* outptr,
+  const int matrix_stride
+)
+{
+  constexpr int inner_tile_rows = 4, inner_tile_cols = 4;
+
+  // Get pointers into the input tile
+  const float *x_ptrs[inner_tile_rows][inner_tile_cols];
+  for (int i = 0, xi = 0; i < inner_tile_rows; i++, xi++)
+  {
+    // Get a pointer into the row
+    const float* const row_ptr = input_base + xi*input_row_stride;
+
+    for (int j = 0, xj = 0; j < inner_tile_cols; j++, xj++)
+    {
+      x_ptrs[i][j] = row_ptr + xj*input_col_stride;
+    }
+  }
+
+  // Matrices used/computed in this kernel.
+  float x[inner_tile_rows][inner_tile_cols];
+  float XTx[inner_tile_rows][inner_tile_cols];
+  float U[inner_tile_rows][inner_tile_cols];
+
+  for (int i = 0; i < inner_tile_rows; i++)
+  {
+    for (int j = 0; j < inner_tile_cols; j++)
+    {
+      x[i][j] = XTx[i][j] = 0.0f;
+    }
+  }
+
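+  // Equivalently, the two passes below compute U = B^T x B with
+  //
+  //        [ 1   0  -1   0 ]
+  //  B^T = [ 0   1   1   0 ]
+  //        [ 0  -1   1   0 ]
+  //        [ 0   1   0  -1 ]
+  //
+  // i.e. the standard F(2x2, 3x3) Winograd input transform.
+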
+  // Perform the Winograd input transformation for each channel in the input
+  // tensor.
+  int channels_remaining = n_channels;
+#ifdef __aarch64__
+  for (; channels_remaining >= 4; channels_remaining -= 4)
+  {
+    // Matrices used/computed in this kernel.
+    float32x4_t x[inner_tile_rows][inner_tile_cols];
+    float32x4_t XTx[inner_tile_rows][inner_tile_cols];
+    float32x4_t U[inner_tile_rows][inner_tile_cols];
+
+    for (int i = 0; i < inner_tile_rows; i++)
+    {
+      for (int j = 0; j < inner_tile_cols; j++)
+      {
+        x[i][j] = vdupq_n_f32(0.0f);
+        XTx[i][j] = vdupq_n_f32(0.0f);
+      }
+    }
+
+    // Load x
+    for (int i = 0; i < inner_tile_rows; i++)
+    {
+      for (int j = 0; j < inner_tile_cols; j++)
+      {
+        x[i][j] = vld1q_f32(x_ptrs[i][j]);
+        x_ptrs[i][j] += 4;
+      }
+    }
+
+    // Compute XT . x
+    for (int j = 0; j < inner_tile_cols; j++)
+    {
+      // XTx[0][j] = x[0][j] - x[2][j];
+      XTx[0][j] = vsubq_f32(x[0][j], x[2][j]);
+
+      // XTx[1][j] = x[1][j] + x[2][j];
+      XTx[1][j] = vaddq_f32(x[1][j], x[2][j]);
+
+      // XTx[2][j] = x[2][j] - x[1][j];
+      XTx[2][j] = vsubq_f32(x[2][j], x[1][j]);
+
+      // XTx[3][j] = x[1][j] - x[3][j];
+      XTx[3][j] = vsubq_f32(x[1][j], x[3][j]);
+    }
+
+    // Compute U = XT . x . X
+    for (int i = 0; i < inner_tile_rows; i++)
+    {
+      // U[i][0] = XTx[i][0] - XTx[i][2];
+      U[i][0] = vsubq_f32(XTx[i][0], XTx[i][2]);
+
+      // U[i][1] = XTx[i][1] + XTx[i][2];
+      U[i][1] = vaddq_f32(XTx[i][1], XTx[i][2]);
+
+      // U[i][2] = XTx[i][2] - XTx[i][1];
+      U[i][2] = vsubq_f32(XTx[i][2], XTx[i][1]);
+
+      // U[i][3] = XTx[i][1] - XTx[i][3];
+      U[i][3] = vsubq_f32(XTx[i][1], XTx[i][3]);
+    }
+
+    // Store the transformed matrix
+    for (int i = 0, m = 0; i < inner_tile_rows; i++)
+    {
+      for (int j = 0; j < inner_tile_cols; j++, m++)
+      {
+        vst1q_f32(outptr + m*matrix_stride, U[i][j]);
+      }
+    }
+    outptr += 4;
+  }
+#endif  // __aarch64__
+#ifdef __arm_any__
+  for (; channels_remaining >= 2; channels_remaining -= 2)
+  {
+    // Matrices used/computed in this kernel.
+    float32x2_t x[inner_tile_rows][inner_tile_cols];
+    float32x2_t XTx[inner_tile_rows][inner_tile_cols];
+    float32x2_t U[inner_tile_rows][inner_tile_cols];
+
+    for (int i = 0; i < inner_tile_rows; i++)
+    {
+      for (int j = 0; j < inner_tile_cols; j++)
+      {
+        x[i][j] = vdup_n_f32(0.0f);
+        XTx[i][j] = vdup_n_f32(0.0f);
+      }
+    }
+
+    // Load x
+    for (int i = 0; i < inner_tile_rows; i++)
+    {
+      for (int j = 0; j < inner_tile_cols; j++)
+      {
+        x[i][j] = vld1_f32(x_ptrs[i][j]);
+        x_ptrs[i][j] += 2;
+      }
+    }
+
+    // Compute XT . x
+    for (int j = 0; j < inner_tile_cols; j++)
+    {
+      // XTx[0][j] = x[0][j] - x[2][j];
+      XTx[0][j] = vsub_f32(x[0][j], x[2][j]);
+
+      // XTx[1][j] = x[1][j] + x[2][j];
+      XTx[1][j] = vadd_f32(x[1][j], x[2][j]);
+
+      // XTx[2][j] = x[2][j] - x[1][j];
+      XTx[2][j] = vsub_f32(x[2][j], x[1][j]);
+
+      // XTx[3][j] = x[1][j] - x[3][j];
+      XTx[3][j] = vsub_f32(x[1][j], x[3][j]);
+    }
+
+    // Compute U = XT . x . X
+    for (int i = 0; i < inner_tile_rows; i++)
+    {
+      // U[i][0] = XTx[i][0] - XTx[i][2];
+      U[i][0] = vsub_f32(XTx[i][0], XTx[i][2]);
+
+      // U[i][1] = XTx[i][1] + XTx[i][2];
+      U[i][1] = vadd_f32(XTx[i][1], XTx[i][2]);
+
+      // U[i][2] = XTx[i][2] - XTx[i][1];
+      U[i][2] = vsub_f32(XTx[i][2], XTx[i][1]);
+
+      // U[i][3] = XTx[i][1] - XTx[i][3];
+      U[i][3] = vsub_f32(XTx[i][1], XTx[i][3]);
+    }
+
+    // Store the transformed matrix
+    for (int i = 0, m = 0; i < inner_tile_rows; i++)
+    {
+      for (int j = 0; j < inner_tile_cols; j++, m++)
+      {
+        vst1_f32(outptr + m*matrix_stride, U[i][j]);
+      }
+    }
+    outptr += 2;
+  }
+#endif  // __arm_any__
+  for (; channels_remaining; channels_remaining--)
+  {
+    // Load x
+    for (int i = 0; i < inner_tile_rows; i++)
+    {
+      for (int j = 0; j < inner_tile_cols; j++)
+      {
+        x[i][j] = *(x_ptrs[i][j]++);
+      }
+    }
+
+    // Compute XT . x
+    for (int j = 0; j < inner_tile_cols; j++)
+    {
+      XTx[0][j] = x[0][j] - x[2][j];
+      XTx[1][j] = x[1][j] + x[2][j];
+      XTx[2][j] = x[2][j] - x[1][j];
+      XTx[3][j] = x[1][j] - x[3][j];
+    }
+
+    // Compute U = XT . x . X
+    for (int i = 0; i < inner_tile_rows; i++)
+    {
+      U[i][0] = XTx[i][0] - XTx[i][2];
+      U[i][1] = XTx[i][1] + XTx[i][2];
+      U[i][2] = XTx[i][2] - XTx[i][1];
+      U[i][3] = XTx[i][1] - XTx[i][3];
+    }
+
+    // Store the transformed matrix
+    for (int i = 0, m = 0; i < inner_tile_rows; i++)
+    {
+      for (int j = 0; j < inner_tile_cols; j++, m++)
+      {
+        *(outptr + m*matrix_stride) = U[i][j];
+      }
+    }
+    outptr++;
+  }
+}
+
+template class InputTransform<4, 4, float, float, WinogradRoots::Integers>;
+
+}  // namespace winograd
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp32_fp32_integers.cpp
new file mode 100644
index 0000000..908fc82
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp32_fp32_integers.cpp
@@ -0,0 +1,1308 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm.hpp"
+#include "input.hpp"
+
+namespace winograd
+{
+
+#ifdef __aarch64__
+
+template <>
+void InputTransform<6, 6, float, float, WinogradRoots::Integers>::transform_tile(
+  int n_channels,
+  const float* input_base,
+  const int input_row_stride,
+  const int input_col_stride,
+  float* matrix_base,
+  const int matrix_stride
+)
+{
+  const float pcoeffs[4] = {1.0f, 2.0f, 4.0f, 5.0f};
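+  // The four coefficients below are loaded into q0 by the first instruction
+  // of the assembly; the kernel then forms the F(4x4, 3x3) input-transform
+  // combinations via fmla/fmls against v0.s[1] = 2, v0.s[2] = 4 and
+  // v0.s[3] = 5 (the factor 1 needs only fadd/fsub), matching the B^T matrix
+  // used by the non-AArch64 path further down this file.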
+  __asm__ __volatile__(
+    "ldr q0, [%[pcoeffs]]\n"
+    "add x25, %[inptr0], %[input_row_stride]\n"
+    "add x18, %[input_col_stride1], %[input_col_stride1]\n"
+    "add x16, x25, %[input_row_stride]\n"
+    "add x19, x18, %[input_col_stride1]\n"
+    "add x26, x16, %[input_row_stride]\n"
+    "add x20, x19, %[input_col_stride1]\n"
+    "add x17, x26, %[input_row_stride]\n"
+    "add x21, x20, %[input_col_stride1]\n"
+    "add x27, x17, %[input_row_stride]\n"
+    "add x28, %[outptr0], %[output_row_stride]\n"
+    "add x11, %[output_col_stride1], %[output_col_stride1]\n"
+    "add x22, x28, %[output_row_stride]\n"
+    "add x13, x11, %[output_col_stride1]\n"
+    "add x12, x22, %[output_row_stride]\n"
+    "add x23, x13, %[output_col_stride1]\n"
+    "add x14, x12, %[output_row_stride]\n"
+    "add x15, x23, %[output_col_stride1]\n"
+    "add x24, x14, %[output_row_stride]\n"
+    "cmp %w[n_channels], #4\n"
+    "blt 2f\n"
+    "1:\n"
+    "ldr q8, [%[inptr0], x20]\n"
+    "ldr q2, [%[inptr0], x18]\n"
+    "mov v14.16b, v8.16b\n"
+    "ldr q9, [%[inptr0]]\n"
+    "mov v10.16b, v8.16b\n"
+    "ldr q1, [%[inptr0], x21]\n"
+    "fmla v14.4s, v9.4s, v0.s[2]\n"
+    "ldr q4, [%[inptr0], x19]\n"
+    "mov v9.16b, v8.16b\n"
+    "ldr q12, [%[inptr0], %[input_col_stride1]]\n"
+    "fmls v10.4s, v12.4s, v0.s[2]\n"
+    "ldr q5, [x16, x20]\n"
+    "fmls v14.4s, v2.4s, v0.s[3]\n"
+    "ldr q20, [x16, x18]\n"
+    "fmla v9.4s, v12.4s, v0.s[2]\n"
+    "ldr q3, [x16]\n"
+    "fmls v10.4s, v2.4s, v0.s[2]\n"
+    "ldr q6, [x16, x21]\n"
+    "mov v7.16b, v8.16b\n"
+    "ldr q16, [x16, x19]\n"
+    "fmls v9.4s, v2.4s, v0.s[2]\n"
+    "ldr q22, [x16, %[input_col_stride1]]\n"
+    "fadd v10.4s, v10.4s, v4.4s\n"
+    "ldr q17, [x17, x20]\n"
+    "fmls v7.4s, v12.4s, v0.s[1]\n"
+    "ldr q15, [x17, x18]\n"
+    "fsub v9.4s, v9.4s, v4.4s\n"
+    "ldr q19, [x17]\n"
+    "mov v8.16b, v8.16b\n"
+    "ldr q18, [x17, x21]\n"
+    "fsub v7.4s, v7.4s, v2.4s\n"
+    "ldr q13, [x17, x19]\n"
+    "fmla v7.4s, v4.4s, v0.s[1]\n"
+    "ldr q21, [x17, %[input_col_stride1]]\n"
+    "fmla v8.4s, v12.4s, v0.s[1]\n"
+    "add %[inptr0], %[inptr0], #16\n"
+    "mov v11.16b, v1.16b\n"
+    "add x16, x16, #16\n"
+    "mov v1.16b, v5.16b\n"
+    "add x17, x17, #16\n"
+    "fsub v8.4s, v8.4s, v2.4s\n"
+    "fmla v11.4s, v12.4s, v0.s[2]\n"
+    "fmls v8.4s, v4.4s, v0.s[1]\n"
+    "fmla v1.4s, v3.4s, v0.s[2]\n"
+    "mov v2.16b, v5.16b\n"
+    "mov v3.16b, v5.16b\n"
+    "fmls v11.4s, v4.4s, v0.s[3]\n"
+    "mov v4.16b, v5.16b\n"
+    "fmls v1.4s, v20.4s, v0.s[3]\n"
+    "fmls v2.4s, v22.4s, v0.s[2]\n"
+    "fmla v3.4s, v22.4s, v0.s[2]\n"
+    "fmls v4.4s, v22.4s, v0.s[1]\n"
+    "mov v5.16b, v5.16b\n"
+    "mov v6.16b, v6.16b\n"
+    "fmls v2.4s, v20.4s, v0.s[2]\n"
+    "mov v12.16b, v17.16b\n"
+    "fmls v3.4s, v20.4s, v0.s[2]\n"
+    "fsub v4.4s, v4.4s, v20.4s\n"
+    "fmla v4.4s, v16.4s, v0.s[1]\n"
+    "fmla v5.4s, v22.4s, v0.s[1]\n"
+    "fadd v2.4s, v2.4s, v16.4s\n"
+    "fmla v6.4s, v22.4s, v0.s[2]\n"
+    "fsub v3.4s, v3.4s, v16.4s\n"
+    "fmla v12.4s, v19.4s, v0.s[2]\n"
+    "fsub v5.4s, v5.4s, v20.4s\n"
+    "mov v19.16b, v17.16b\n"
+    "fmls v5.4s, v16.4s, v0.s[1]\n"
+    "fmls v6.4s, v16.4s, v0.s[3]\n"
+    "fmls v12.4s, v15.4s, v0.s[3]\n"
+    "fmls v19.4s, v21.4s, v0.s[2]\n"
+    "mov v20.16b, v17.16b\n"
+    "mov v16.16b, v17.16b\n"
+    "mov v17.16b, v17.16b\n"
+    "mov v18.16b, v18.16b\n"
+    "fmls v19.4s, v15.4s, v0.s[2]\n"
+    "fmla v20.4s, v21.4s, v0.s[2]\n"
+    "fmls v16.4s, v21.4s, v0.s[1]\n"
+    "fmla v17.4s, v21.4s, v0.s[1]\n"
+    "fmla v18.4s, v21.4s, v0.s[2]\n"
+    "mov v23.16b, v12.16b\n"
+    "fadd v19.4s, v19.4s, v13.4s\n"
+    "fmls v20.4s, v15.4s, v0.s[2]\n"
+    "fsub v16.4s, v16.4s, v15.4s\n"
+    "fsub v17.4s, v17.4s, v15.4s\n"
+    "fmla v16.4s, v13.4s, v0.s[1]\n"
+    "fmls v17.4s, v13.4s, v0.s[1]\n"
+    "fsub v20.4s, v20.4s, v13.4s\n"
+    "fmls v18.4s, v13.4s, v0.s[3]\n"
+    "fmla v23.4s, v14.4s, v0.s[2]\n"
+    "mov v15.16b, v19.16b\n"
+    "mov v14.16b, v20.16b\n"
+    "mov v24.16b, v16.16b\n"
+    "fmla v15.4s, v10.4s, v0.s[2]\n"
+    "mov v10.16b, v17.16b\n"
+    "fmls v23.4s, v1.4s, v0.s[3]\n"
+    "fmla v14.4s, v9.4s, v0.s[2]\n"
+    "fmla v24.4s, v7.4s, v0.s[2]\n"
+    "fmla v10.4s, v8.4s, v0.s[2]\n"
+    "fmls v15.4s, v2.4s, v0.s[3]\n"
+    "mov v7.16b, v18.16b\n"
+    "str q23, [%[outptr0]]\n"
+    "fmls v14.4s, v3.4s, v0.s[3]\n"
+    "fmls v24.4s, v4.4s, v0.s[3]\n"
+    "fmls v10.4s, v5.4s, v0.s[3]\n"
+    "str q15, [%[outptr0], %[output_col_stride1]]\n"
+    "fmla v7.4s, v11.4s, v0.s[2]\n"
+    "str q14, [%[outptr0], x11]\n"
+    "str q24, [%[outptr0], x13]\n"
+    "str q10, [%[outptr0], x23]\n"
+    "fmls v7.4s, v6.4s, v0.s[3]\n"
+    "str q7, [%[outptr0], x15]\n"
+    "add %[outptr0], %[outptr0], #16\n"
+    "mov v26.16b, v12.16b\n"
+    "mov v25.16b, v19.16b\n"
+    "ldr q11, [x25, x20]\n"
+    "mov v10.16b, v11.16b\n"
+    "ldr q23, [x25, x18]\n"
+    "mov v9.16b, v11.16b\n"
+    "ldr q7, [x25]\n"
+    "fmla v10.4s, v7.4s, v0.s[2]\n"
+    "ldr q13, [x25, x21]\n"
+    "mov v7.16b, v11.16b\n"
+    "ldr q31, [x25, x19]\n"
+    "mov v8.16b, v11.16b\n"
+    "ldr q21, [x25, %[input_col_stride1]]\n"
+    "fmls v10.4s, v23.4s, v0.s[3]\n"
+    "ldr q30, [x26, x20]\n"
+    "fmls v9.4s, v21.4s, v0.s[2]\n"
+    "ldr q29, [x26, x18]\n"
+    "fmla v7.4s, v21.4s, v0.s[2]\n"
+    "ldr q22, [x26]\n"
+    "fmls v8.4s, v21.4s, v0.s[1]\n"
+    "ldr q24, [x26, x21]\n"
+    "fmls v9.4s, v23.4s, v0.s[2]\n"
+    "ldr q27, [x26, x19]\n"
+    "fmls v7.4s, v23.4s, v0.s[2]\n"
+    "ldr q28, [x26, %[input_col_stride1]]\n"
+    "fsub v8.4s, v8.4s, v23.4s\n"
+    "add x25, x25, #16\n"
+    "fadd v9.4s, v9.4s, v31.4s\n"
+    "add x26, x26, #16\n"
+    "fsub v7.4s, v7.4s, v31.4s\n"
+    "fmla v8.4s, v31.4s, v0.s[1]\n"
+    "mov v11.16b, v11.16b\n"
+    "mov v15.16b, v13.16b\n"
+    "mov v14.16b, v30.16b\n"
+    "mov v13.16b, v30.16b\n"
+    "fmla v11.4s, v21.4s, v0.s[1]\n"
+    "fmla v15.4s, v21.4s, v0.s[2]\n"
+    "fmla v14.4s, v22.4s, v0.s[2]\n"
+    "fmls v13.4s, v28.4s, v0.s[2]\n"
+    "mov v21.16b, v30.16b\n"
+    "mov v22.16b, v30.16b\n"
+    "fsub v11.4s, v11.4s, v23.4s\n"
+    "fmls v15.4s, v31.4s, v0.s[3]\n"
+    "fmls v11.4s, v31.4s, v0.s[1]\n"
+    "fmls v14.4s, v29.4s, v0.s[3]\n"
+    "fmls v13.4s, v29.4s, v0.s[2]\n"
+    "fmla v21.4s, v28.4s, v0.s[2]\n"
+    "fmls v22.4s, v28.4s, v0.s[1]\n"
+    "mov v23.16b, v30.16b\n"
+    "mov v24.16b, v24.16b\n"
+    "fmls v26.4s, v10.4s, v0.s[2]\n"
+    "fadd v13.4s, v13.4s, v27.4s\n"
+    "fmls v21.4s, v29.4s, v0.s[2]\n"
+    "fsub v22.4s, v22.4s, v29.4s\n"
+    "fmla v23.4s, v28.4s, v0.s[1]\n"
+    "fmla v22.4s, v27.4s, v0.s[1]\n"
+    "fmla v24.4s, v28.4s, v0.s[2]\n"
+    "fsub v21.4s, v21.4s, v27.4s\n"
+    "fmls v26.4s, v1.4s, v0.s[2]\n"
+    "fsub v23.4s, v23.4s, v29.4s\n"
+    "fmls v25.4s, v9.4s, v0.s[2]\n"
+    "fmls v23.4s, v27.4s, v0.s[1]\n"
+    "fmls v24.4s, v27.4s, v0.s[3]\n"
+    "fadd v26.4s, v26.4s, v14.4s\n"
+    "mov v27.16b, v20.16b\n"
+    "str q26, [x28]\n"
+    "fmls v25.4s, v2.4s, v0.s[2]\n"
+    "fmls v27.4s, v7.4s, v0.s[2]\n"
+    "mov v31.16b, v16.16b\n"
+    "mov v30.16b, v17.16b\n"
+    "mov v29.16b, v18.16b\n"
+    "fadd v25.4s, v25.4s, v13.4s\n"
+    "fmls v31.4s, v8.4s, v0.s[2]\n"
+    "str q25, [x28, %[output_col_stride1]]\n"
+    "fmls v27.4s, v3.4s, v0.s[2]\n"
+    "fmls v30.4s, v11.4s, v0.s[2]\n"
+    "fmls v29.4s, v15.4s, v0.s[2]\n"
+    "fmls v31.4s, v4.4s, v0.s[2]\n"
+    "mov v26.16b, v12.16b\n"
+    "fadd v27.4s, v27.4s, v21.4s\n"
+    "mov v25.16b, v19.16b\n"
+    "str q27, [x28, x11]\n"
+    "fmls v30.4s, v5.4s, v0.s[2]\n"
+    "fadd v31.4s, v31.4s, v22.4s\n"
+    "fmls v29.4s, v6.4s, v0.s[2]\n"
+    "str q31, [x28, x13]\n"
+    "fmla v26.4s, v10.4s, v0.s[2]\n"
+    "fadd v30.4s, v30.4s, v23.4s\n"
+    "fmla v25.4s, v9.4s, v0.s[2]\n"
+    "str q30, [x28, x23]\n"
+    "fadd v29.4s, v29.4s, v24.4s\n"
+    "str q29, [x28, x15]\n"
+    "fmls v26.4s, v1.4s, v0.s[2]\n"
+    "fmls v25.4s, v2.4s, v0.s[2]\n"
+    "add x28, x28, #16\n"
+    "mov v30.16b, v20.16b\n"
+    "mov v29.16b, v16.16b\n"
+    "fsub v26.4s, v26.4s, v14.4s\n"
+    "mov v28.16b, v17.16b\n"
+    "str q26, [x22]\n"
+    "fsub v25.4s, v25.4s, v13.4s\n"
+    "str q25, [x22, %[output_col_stride1]]\n"
+    "fmla v30.4s, v7.4s, v0.s[2]\n"
+    "fmla v29.4s, v8.4s, v0.s[2]\n"
+    "fmla v28.4s, v11.4s, v0.s[2]\n"
+    "mov v26.16b, v18.16b\n"
+    "mov v25.16b, v12.16b\n"
+    "fmls v30.4s, v3.4s, v0.s[2]\n"
+    "mov v31.16b, v19.16b\n"
+    "fmls v29.4s, v4.4s, v0.s[2]\n"
+    "fmls v28.4s, v5.4s, v0.s[2]\n"
+    "fmla v26.4s, v15.4s, v0.s[2]\n"
+    "fmls v25.4s, v10.4s, v0.s[1]\n"
+    "fsub v30.4s, v30.4s, v21.4s\n"
+    "fmls v31.4s, v9.4s, v0.s[1]\n"
+    "str q30, [x22, x11]\n"
+    "fsub v29.4s, v29.4s, v22.4s\n"
+    "str q29, [x22, x13]\n"
+    "fsub v28.4s, v28.4s, v23.4s\n"
+    "str q28, [x22, x23]\n"
+    "fmls v26.4s, v6.4s, v0.s[2]\n"
+    "fsub v25.4s, v25.4s, v1.4s\n"
+    "fsub v31.4s, v31.4s, v2.4s\n"
+    "fmla v25.4s, v14.4s, v0.s[1]\n"
+    "fmla v31.4s, v13.4s, v0.s[1]\n"
+    "fsub v26.4s, v26.4s, v24.4s\n"
+    "mov v27.16b, v20.16b\n"
+    "str q26, [x22, x15]\n"
+    "mov v26.16b, v16.16b\n"
+    "str q25, [x12]\n"
+    "fmls v27.4s, v7.4s, v0.s[1]\n"
+    "str q31, [x12, %[output_col_stride1]]\n"
+    "fmls v26.4s, v8.4s, v0.s[1]\n"
+    "mov v25.16b, v17.16b\n"
+    "add x22, x22, #16\n"
+    "fsub v27.4s, v27.4s, v3.4s\n"
+    "mov v28.16b, v18.16b\n"
+    "fmla v27.4s, v21.4s, v0.s[1]\n"
+    "fsub v26.4s, v26.4s, v4.4s\n"
+    "fmla v26.4s, v22.4s, v0.s[1]\n"
+    "fmls v25.4s, v11.4s, v0.s[1]\n"
+    "fmls v28.4s, v15.4s, v0.s[1]\n"
+    "mov v12.16b, v12.16b\n"
+    "str q27, [x12, x11]\n"
+    "mov v19.16b, v19.16b\n"
+    "str q26, [x12, x13]\n"
+    "fsub v25.4s, v25.4s, v5.4s\n"
+    "fmla v25.4s, v23.4s, v0.s[1]\n"
+    "fsub v28.4s, v28.4s, v6.4s\n"
+    "fmla v28.4s, v24.4s, v0.s[1]\n"
+    "fmla v12.4s, v10.4s, v0.s[1]\n"
+    "fmla v19.4s, v9.4s, v0.s[1]\n"
+    "mov v20.16b, v20.16b\n"
+    "str q25, [x12, x23]\n"
+    "mov v16.16b, v16.16b\n"
+    "str q28, [x12, x15]\n"
+    "fsub v12.4s, v12.4s, v1.4s\n"
+    "fmls v12.4s, v14.4s, v0.s[1]\n"
+    "add x12, x12, #16\n"
+    "fsub v19.4s, v19.4s, v2.4s\n"
+    "fmla v20.4s, v7.4s, v0.s[1]\n"
+    "fmls v19.4s, v13.4s, v0.s[1]\n"
+    "fmla v16.4s, v8.4s, v0.s[1]\n"
+    "str q12, [x14]\n"
+    "mov v1.16b, v17.16b\n"
+    "fsub v20.4s, v20.4s, v3.4s\n"
+    "mov v17.16b, v18.16b\n"
+    "str q19, [x14, %[output_col_stride1]]\n"
+    "fmls v20.4s, v21.4s, v0.s[1]\n"
+    "fsub v16.4s, v16.4s, v4.4s\n"
+    "fmla v1.4s, v11.4s, v0.s[1]\n"
+    "fmls v16.4s, v22.4s, v0.s[1]\n"
+    "fmla v17.4s, v15.4s, v0.s[1]\n"
+    "str q20, [x14, x11]\n"
+    "fsub v1.4s, v1.4s, v5.4s\n"
+    "str q16, [x14, x13]\n"
+    "fmls v1.4s, v23.4s, v0.s[1]\n"
+    "fsub v17.4s, v17.4s, v6.4s\n"
+    "fmls v17.4s, v24.4s, v0.s[1]\n"
+    "str q1, [x14, x23]\n"
+    "str q17, [x14, x15]\n"
+    "add x14, x14, #16\n"
+    "ldr q2, [x27, x20]\n"
+    "mov v4.16b, v2.16b\n"
+    "ldr q17, [x27, x18]\n"
+    "mov v12.16b, v2.16b\n"
+    "ldr q18, [x27]\n"
+    "fmla v4.4s, v18.4s, v0.s[2]\n"
+    "ldr q3, [x27, x21]\n"
+    "mov v6.16b, v2.16b\n"
+    "ldr q5, [x27, x19]\n"
+    "mov v1.16b, v2.16b\n"
+    "ldr q18, [x27, %[input_col_stride1]]\n"
+    "fmls v4.4s, v17.4s, v0.s[3]\n"
+    "add x27, x27, #16\n"
+    "fmls v12.4s, v18.4s, v0.s[2]\n"
+    "sub %w[n_channels], %w[n_channels], #4\n"
+    "fmla v6.4s, v18.4s, v0.s[2]\n"
+    "cmp %w[n_channels], #4\n"
+    "fmls v1.4s, v18.4s, v0.s[1]\n"
+    "mov v2.16b, v2.16b\n"
+    "fmls v12.4s, v17.4s, v0.s[2]\n"
+    "mov v3.16b, v3.16b\n"
+    "fmls v6.4s, v17.4s, v0.s[2]\n"
+    "fmla v2.4s, v18.4s, v0.s[1]\n"
+    "fsub v1.4s, v1.4s, v17.4s\n"
+    "fmla v3.4s, v18.4s, v0.s[2]\n"
+    "fadd v12.4s, v12.4s, v5.4s\n"
+    "fmla v1.4s, v5.4s, v0.s[1]\n"
+    "fsub v6.4s, v6.4s, v5.4s\n"
+    "fsub v2.4s, v2.4s, v17.4s\n"
+    "fmls v2.4s, v5.4s, v0.s[1]\n"
+    "fmls v3.4s, v5.4s, v0.s[3]\n"
+    "mov v4.16b, v4.16b\n"
+    "mov v16.16b, v12.16b\n"
+    "mov v5.16b, v6.16b\n"
+    "mov v6.16b, v1.16b\n"
+    "fmla v4.4s, v10.4s, v0.s[2]\n"
+    "fmla v16.4s, v9.4s, v0.s[2]\n"
+    "fmla v5.4s, v7.4s, v0.s[2]\n"
+    "fmla v6.4s, v8.4s, v0.s[2]\n"
+    "mov v9.16b, v2.16b\n"
+    "mov v10.16b, v3.16b\n"
+    "fmls v4.4s, v14.4s, v0.s[3]\n"
+    "fmls v16.4s, v13.4s, v0.s[3]\n"
+    "fmls v5.4s, v21.4s, v0.s[3]\n"
+    "fmls v6.4s, v22.4s, v0.s[3]\n"
+    "fmla v9.4s, v11.4s, v0.s[2]\n"
+    "fmla v10.4s, v15.4s, v0.s[2]\n"
+    "str q4, [x24]\n"
+    "str q16, [x24, %[output_col_stride1]]\n"
+    "str q5, [x24, x11]\n"
+    "str q6, [x24, x13]\n"
+    "fmls v9.4s, v23.4s, v0.s[3]\n"
+    "fmls v10.4s, v24.4s, v0.s[3]\n"
+    "str q9, [x24, x23]\n"
+    "str q10, [x24, x15]\n"
+    "add x24, x24, #16\n"
+    "bge 1b\n"
+    "2:\n"
+    "cmp %w[n_channels], #2\n"
+    "blt 3f\n"
+    "ldr d8, [%[inptr0], x20]\n"
+    "mov v14.16b, v8.16b\n"
+    "ldr d2, [%[inptr0], x18]\n"
+    "mov v10.16b, v8.16b\n"
+    "ldr d9, [%[inptr0]]\n"
+    "fmla v14.4s, v9.4s, v0.s[2]\n"
+    "ldr d1, [%[inptr0], x21]\n"
+    "mov v9.16b, v8.16b\n"
+    "ldr d4, [%[inptr0], x19]\n"
+    "mov v7.16b, v8.16b\n"
+    "ldr d12, [%[inptr0], %[input_col_stride1]]\n"
+    "fmls v14.4s, v2.4s, v0.s[3]\n"
+    "ldr d5, [x16, x20]\n"
+    "fmls v10.4s, v12.4s, v0.s[2]\n"
+    "ldr d20, [x16, x18]\n"
+    "fmla v9.4s, v12.4s, v0.s[2]\n"
+    "ldr d3, [x16]\n"
+    "fmls v7.4s, v12.4s, v0.s[1]\n"
+    "ldr d6, [x16, x21]\n"
+    "fmls v10.4s, v2.4s, v0.s[2]\n"
+    "ldr d16, [x16, x19]\n"
+    "fmls v9.4s, v2.4s, v0.s[2]\n"
+    "ldr d22, [x16, %[input_col_stride1]]\n"
+    "fsub v7.4s, v7.4s, v2.4s\n"
+    "ldr d17, [x17, x20]\n"
+    "fadd v10.4s, v10.4s, v4.4s\n"
+    "ldr d15, [x17, x18]\n"
+    "fsub v9.4s, v9.4s, v4.4s\n"
+    "ldr d19, [x17]\n"
+    "fmla v7.4s, v4.4s, v0.s[1]\n"
+    "ldr d18, [x17, x21]\n"
+    "mov v8.16b, v8.16b\n"
+    "ldr d13, [x17, x19]\n"
+    "mov v11.16b, v1.16b\n"
+    "ldr d21, [x17, %[input_col_stride1]]\n"
+    "fmla v8.4s, v12.4s, v0.s[1]\n"
+    "add %[inptr0], %[inptr0], #8\n"
+    "fmla v11.4s, v12.4s, v0.s[2]\n"
+    "add x16, x16, #8\n"
+    "mov v1.16b, v5.16b\n"
+    "add x17, x17, #8\n"
+    "fsub v8.4s, v8.4s, v2.4s\n"
+    "mov v2.16b, v5.16b\n"
+    "fmls v8.4s, v4.4s, v0.s[1]\n"
+    "fmls v11.4s, v4.4s, v0.s[3]\n"
+    "fmla v1.4s, v3.4s, v0.s[2]\n"
+    "fmls v2.4s, v22.4s, v0.s[2]\n"
+    "mov v3.16b, v5.16b\n"
+    "mov v4.16b, v5.16b\n"
+    "mov v5.16b, v5.16b\n"
+    "mov v6.16b, v6.16b\n"
+    "fmls v1.4s, v20.4s, v0.s[3]\n"
+    "fmls v2.4s, v20.4s, v0.s[2]\n"
+    "fmla v3.4s, v22.4s, v0.s[2]\n"
+    "fmls v4.4s, v22.4s, v0.s[1]\n"
+    "fmla v5.4s, v22.4s, v0.s[1]\n"
+    "fmla v6.4s, v22.4s, v0.s[2]\n"
+    "fadd v2.4s, v2.4s, v16.4s\n"
+    "mov v12.16b, v17.16b\n"
+    "fmls v3.4s, v20.4s, v0.s[2]\n"
+    "fsub v4.4s, v4.4s, v20.4s\n"
+    "fmla v4.4s, v16.4s, v0.s[1]\n"
+    "fsub v5.4s, v5.4s, v20.4s\n"
+    "fmls v5.4s, v16.4s, v0.s[1]\n"
+    "fmls v6.4s, v16.4s, v0.s[3]\n"
+    "fsub v3.4s, v3.4s, v16.4s\n"
+    "fmla v12.4s, v19.4s, v0.s[2]\n"
+    "mov v19.16b, v17.16b\n"
+    "mov v20.16b, v17.16b\n"
+    "mov v16.16b, v17.16b\n"
+    "mov v17.16b, v17.16b\n"
+    "fmls v12.4s, v15.4s, v0.s[3]\n"
+    "fmls v19.4s, v21.4s, v0.s[2]\n"
+    "fmla v20.4s, v21.4s, v0.s[2]\n"
+    "fmls v16.4s, v21.4s, v0.s[1]\n"
+    "fmla v17.4s, v21.4s, v0.s[1]\n"
+    "mov v18.16b, v18.16b\n"
+    "fmls v19.4s, v15.4s, v0.s[2]\n"
+    "mov v23.16b, v12.16b\n"
+    "fmls v20.4s, v15.4s, v0.s[2]\n"
+    "fsub v16.4s, v16.4s, v15.4s\n"
+    "fmla v16.4s, v13.4s, v0.s[1]\n"
+    "fsub v17.4s, v17.4s, v15.4s\n"
+    "fadd v19.4s, v19.4s, v13.4s\n"
+    "fmls v17.4s, v13.4s, v0.s[1]\n"
+    "fsub v20.4s, v20.4s, v13.4s\n"
+    "fmla v18.4s, v21.4s, v0.s[2]\n"
+    "fmla v23.4s, v14.4s, v0.s[2]\n"
+    "mov v15.16b, v19.16b\n"
+    "mov v14.16b, v20.16b\n"
+    "mov v24.16b, v16.16b\n"
+    "fmls v18.4s, v13.4s, v0.s[3]\n"
+    "fmla v15.4s, v10.4s, v0.s[2]\n"
+    "fmls v23.4s, v1.4s, v0.s[3]\n"
+    "fmla v14.4s, v9.4s, v0.s[2]\n"
+    "fmla v24.4s, v7.4s, v0.s[2]\n"
+    "mov v10.16b, v17.16b\n"
+    "fmls v15.4s, v2.4s, v0.s[3]\n"
+    "mov v7.16b, v18.16b\n"
+    "str d23, [%[outptr0]]\n"
+    "fmls v14.4s, v3.4s, v0.s[3]\n"
+    "fmls v24.4s, v4.4s, v0.s[3]\n"
+    "fmla v10.4s, v8.4s, v0.s[2]\n"
+    "str d15, [%[outptr0], %[output_col_stride1]]\n"
+    "fmla v7.4s, v11.4s, v0.s[2]\n"
+    "str d14, [%[outptr0], x11]\n"
+    "fmls v10.4s, v5.4s, v0.s[3]\n"
+    "str d24, [%[outptr0], x13]\n"
+    "fmls v7.4s, v6.4s, v0.s[3]\n"
+    "str d10, [%[outptr0], x23]\n"
+    "str d7, [%[outptr0], x15]\n"
+    "add %[outptr0], %[outptr0], #8\n"
+    "mov v26.16b, v12.16b\n"
+    "mov v25.16b, v19.16b\n"
+    "ldr d11, [x25, x20]\n"
+    "mov v10.16b, v11.16b\n"
+    "ldr d23, [x25, x18]\n"
+    "mov v9.16b, v11.16b\n"
+    "ldr d7, [x25]\n"
+    "fmla v10.4s, v7.4s, v0.s[2]\n"
+    "ldr d13, [x25, x21]\n"
+    "mov v7.16b, v11.16b\n"
+    "ldr d31, [x25, x19]\n"
+    "mov v8.16b, v11.16b\n"
+    "ldr d21, [x25, %[input_col_stride1]]\n"
+    "fmls v10.4s, v23.4s, v0.s[3]\n"
+    "ldr d30, [x26, x20]\n"
+    "fmls v9.4s, v21.4s, v0.s[2]\n"
+    "ldr d29, [x26, x18]\n"
+    "fmla v7.4s, v21.4s, v0.s[2]\n"
+    "ldr d22, [x26]\n"
+    "fmls v8.4s, v21.4s, v0.s[1]\n"
+    "ldr d24, [x26, x21]\n"
+    "fmls v9.4s, v23.4s, v0.s[2]\n"
+    "ldr d27, [x26, x19]\n"
+    "fmls v7.4s, v23.4s, v0.s[2]\n"
+    "ldr d28, [x26, %[input_col_stride1]]\n"
+    "fsub v8.4s, v8.4s, v23.4s\n"
+    "add x25, x25, #8\n"
+    "fadd v9.4s, v9.4s, v31.4s\n"
+    "add x26, x26, #8\n"
+    "fsub v7.4s, v7.4s, v31.4s\n"
+    "fmla v8.4s, v31.4s, v0.s[1]\n"
+    "mov v11.16b, v11.16b\n"
+    "mov v15.16b, v13.16b\n"
+    "mov v14.16b, v30.16b\n"
+    "mov v13.16b, v30.16b\n"
+    "fmla v11.4s, v21.4s, v0.s[1]\n"
+    "fmla v15.4s, v21.4s, v0.s[2]\n"
+    "fmla v14.4s, v22.4s, v0.s[2]\n"
+    "fmls v13.4s, v28.4s, v0.s[2]\n"
+    "mov v21.16b, v30.16b\n"
+    "mov v22.16b, v30.16b\n"
+    "fsub v11.4s, v11.4s, v23.4s\n"
+    "fmls v15.4s, v31.4s, v0.s[3]\n"
+    "fmls v11.4s, v31.4s, v0.s[1]\n"
+    "fmls v14.4s, v29.4s, v0.s[3]\n"
+    "fmls v13.4s, v29.4s, v0.s[2]\n"
+    "fmla v21.4s, v28.4s, v0.s[2]\n"
+    "fmls v22.4s, v28.4s, v0.s[1]\n"
+    "mov v23.16b, v30.16b\n"
+    "mov v24.16b, v24.16b\n"
+    "fmls v26.4s, v10.4s, v0.s[2]\n"
+    "fadd v13.4s, v13.4s, v27.4s\n"
+    "fmls v21.4s, v29.4s, v0.s[2]\n"
+    "fsub v22.4s, v22.4s, v29.4s\n"
+    "fmla v23.4s, v28.4s, v0.s[1]\n"
+    "fmla v22.4s, v27.4s, v0.s[1]\n"
+    "fmla v24.4s, v28.4s, v0.s[2]\n"
+    "fsub v21.4s, v21.4s, v27.4s\n"
+    "fmls v26.4s, v1.4s, v0.s[2]\n"
+    "fsub v23.4s, v23.4s, v29.4s\n"
+    "fmls v25.4s, v9.4s, v0.s[2]\n"
+    "fmls v23.4s, v27.4s, v0.s[1]\n"
+    "fmls v24.4s, v27.4s, v0.s[3]\n"
+    "fadd v26.4s, v26.4s, v14.4s\n"
+    "mov v27.16b, v20.16b\n"
+    "str d26, [x28]\n"
+    "fmls v25.4s, v2.4s, v0.s[2]\n"
+    "fmls v27.4s, v7.4s, v0.s[2]\n"
+    "mov v31.16b, v16.16b\n"
+    "mov v30.16b, v17.16b\n"
+    "mov v29.16b, v18.16b\n"
+    "fadd v25.4s, v25.4s, v13.4s\n"
+    "fmls v31.4s, v8.4s, v0.s[2]\n"
+    "str d25, [x28, %[output_col_stride1]]\n"
+    "fmls v27.4s, v3.4s, v0.s[2]\n"
+    "fmls v30.4s, v11.4s, v0.s[2]\n"
+    "fmls v29.4s, v15.4s, v0.s[2]\n"
+    "fmls v31.4s, v4.4s, v0.s[2]\n"
+    "mov v26.16b, v12.16b\n"
+    "fadd v27.4s, v27.4s, v21.4s\n"
+    "mov v25.16b, v19.16b\n"
+    "str d27, [x28, x11]\n"
+    "fmls v30.4s, v5.4s, v0.s[2]\n"
+    "fadd v31.4s, v31.4s, v22.4s\n"
+    "fmls v29.4s, v6.4s, v0.s[2]\n"
+    "str d31, [x28, x13]\n"
+    "fmla v26.4s, v10.4s, v0.s[2]\n"
+    "fadd v30.4s, v30.4s, v23.4s\n"
+    "fmla v25.4s, v9.4s, v0.s[2]\n"
+    "str d30, [x28, x23]\n"
+    "fadd v29.4s, v29.4s, v24.4s\n"
+    "str d29, [x28, x15]\n"
+    "fmls v26.4s, v1.4s, v0.s[2]\n"
+    "fmls v25.4s, v2.4s, v0.s[2]\n"
+    "add x28, x28, #8\n"
+    "mov v30.16b, v20.16b\n"
+    "mov v29.16b, v16.16b\n"
+    "fsub v26.4s, v26.4s, v14.4s\n"
+    "mov v28.16b, v17.16b\n"
+    "str d26, [x22]\n"
+    "fsub v25.4s, v25.4s, v13.4s\n"
+    "str d25, [x22, %[output_col_stride1]]\n"
+    "fmla v30.4s, v7.4s, v0.s[2]\n"
+    "fmla v29.4s, v8.4s, v0.s[2]\n"
+    "fmla v28.4s, v11.4s, v0.s[2]\n"
+    "mov v26.16b, v18.16b\n"
+    "mov v25.16b, v12.16b\n"
+    "fmls v30.4s, v3.4s, v0.s[2]\n"
+    "mov v31.16b, v19.16b\n"
+    "fmls v29.4s, v4.4s, v0.s[2]\n"
+    "fmls v28.4s, v5.4s, v0.s[2]\n"
+    "fmla v26.4s, v15.4s, v0.s[2]\n"
+    "fmls v25.4s, v10.4s, v0.s[1]\n"
+    "fsub v30.4s, v30.4s, v21.4s\n"
+    "fmls v31.4s, v9.4s, v0.s[1]\n"
+    "str d30, [x22, x11]\n"
+    "fsub v29.4s, v29.4s, v22.4s\n"
+    "str d29, [x22, x13]\n"
+    "fsub v28.4s, v28.4s, v23.4s\n"
+    "str d28, [x22, x23]\n"
+    "fmls v26.4s, v6.4s, v0.s[2]\n"
+    "fsub v25.4s, v25.4s, v1.4s\n"
+    "fsub v31.4s, v31.4s, v2.4s\n"
+    "fmla v25.4s, v14.4s, v0.s[1]\n"
+    "fmla v31.4s, v13.4s, v0.s[1]\n"
+    "fsub v26.4s, v26.4s, v24.4s\n"
+    "mov v27.16b, v20.16b\n"
+    "str d26, [x22, x15]\n"
+    "mov v26.16b, v16.16b\n"
+    "str d25, [x12]\n"
+    "fmls v27.4s, v7.4s, v0.s[1]\n"
+    "str d31, [x12, %[output_col_stride1]]\n"
+    "fmls v26.4s, v8.4s, v0.s[1]\n"
+    "mov v25.16b, v17.16b\n"
+    "add x22, x22, #8\n"
+    "fsub v27.4s, v27.4s, v3.4s\n"
+    "mov v28.16b, v18.16b\n"
+    "fmla v27.4s, v21.4s, v0.s[1]\n"
+    "fsub v26.4s, v26.4s, v4.4s\n"
+    "fmla v26.4s, v22.4s, v0.s[1]\n"
+    "fmls v25.4s, v11.4s, v0.s[1]\n"
+    "fmls v28.4s, v15.4s, v0.s[1]\n"
+    "mov v12.16b, v12.16b\n"
+    "str d27, [x12, x11]\n"
+    "mov v19.16b, v19.16b\n"
+    "str d26, [x12, x13]\n"
+    "fsub v25.4s, v25.4s, v5.4s\n"
+    "fmla v25.4s, v23.4s, v0.s[1]\n"
+    "fsub v28.4s, v28.4s, v6.4s\n"
+    "fmla v28.4s, v24.4s, v0.s[1]\n"
+    "fmla v12.4s, v10.4s, v0.s[1]\n"
+    "fmla v19.4s, v9.4s, v0.s[1]\n"
+    "mov v20.16b, v20.16b\n"
+    "str d25, [x12, x23]\n"
+    "mov v16.16b, v16.16b\n"
+    "str d28, [x12, x15]\n"
+    "fsub v12.4s, v12.4s, v1.4s\n"
+    "fmls v12.4s, v14.4s, v0.s[1]\n"
+    "add x12, x12, #8\n"
+    "fsub v19.4s, v19.4s, v2.4s\n"
+    "fmla v20.4s, v7.4s, v0.s[1]\n"
+    "fmls v19.4s, v13.4s, v0.s[1]\n"
+    "fmla v16.4s, v8.4s, v0.s[1]\n"
+    "str d12, [x14]\n"
+    "mov v1.16b, v17.16b\n"
+    "fsub v20.4s, v20.4s, v3.4s\n"
+    "mov v17.16b, v18.16b\n"
+    "str d19, [x14, %[output_col_stride1]]\n"
+    "fmls v20.4s, v21.4s, v0.s[1]\n"
+    "fsub v16.4s, v16.4s, v4.4s\n"
+    "fmla v1.4s, v11.4s, v0.s[1]\n"
+    "fmls v16.4s, v22.4s, v0.s[1]\n"
+    "fmla v17.4s, v15.4s, v0.s[1]\n"
+    "str d20, [x14, x11]\n"
+    "fsub v1.4s, v1.4s, v5.4s\n"
+    "str d16, [x14, x13]\n"
+    "fmls v1.4s, v23.4s, v0.s[1]\n"
+    "fsub v17.4s, v17.4s, v6.4s\n"
+    "fmls v17.4s, v24.4s, v0.s[1]\n"
+    "str d1, [x14, x23]\n"
+    "str d17, [x14, x15]\n"
+    "add x14, x14, #8\n"
+    "ldr d2, [x27, x20]\n"
+    "mov v4.16b, v2.16b\n"
+    "ldr d17, [x27, x18]\n"
+    "mov v12.16b, v2.16b\n"
+    "ldr d18, [x27]\n"
+    "fmla v4.4s, v18.4s, v0.s[2]\n"
+    "ldr d3, [x27, x21]\n"
+    "mov v6.16b, v2.16b\n"
+    "ldr d5, [x27, x19]\n"
+    "mov v1.16b, v2.16b\n"
+    "ldr d18, [x27, %[input_col_stride1]]\n"
+    "fmls v4.4s, v17.4s, v0.s[3]\n"
+    "add x27, x27, #8\n"
+    "fmls v12.4s, v18.4s, v0.s[2]\n"
+    "sub %w[n_channels], %w[n_channels], #2\n"
+    "fmla v6.4s, v18.4s, v0.s[2]\n"
+    "fmls v1.4s, v18.4s, v0.s[1]\n"
+    "mov v2.16b, v2.16b\n"
+    "mov v3.16b, v3.16b\n"
+    "fmls v12.4s, v17.4s, v0.s[2]\n"
+    "mov v4.16b, v4.16b\n"
+    "fmls v6.4s, v17.4s, v0.s[2]\n"
+    "fsub v1.4s, v1.4s, v17.4s\n"
+    "fmla v1.4s, v5.4s, v0.s[1]\n"
+    "fmla v2.4s, v18.4s, v0.s[1]\n"
+    "fadd v12.4s, v12.4s, v5.4s\n"
+    "fmla v3.4s, v18.4s, v0.s[2]\n"
+    "fsub v6.4s, v6.4s, v5.4s\n"
+    "fmla v4.4s, v10.4s, v0.s[2]\n"
+    "fsub v2.4s, v2.4s, v17.4s\n"
+    "mov v16.16b, v12.16b\n"
+    "fmls v2.4s, v5.4s, v0.s[1]\n"
+    "fmls v3.4s, v5.4s, v0.s[3]\n"
+    "fmls v4.4s, v14.4s, v0.s[3]\n"
+    "fmla v16.4s, v9.4s, v0.s[2]\n"
+    "mov v5.16b, v6.16b\n"
+    "mov v6.16b, v1.16b\n"
+    "mov v9.16b, v2.16b\n"
+    "mov v10.16b, v3.16b\n"
+    "str d4, [x24]\n"
+    "fmls v16.4s, v13.4s, v0.s[3]\n"
+    "fmla v5.4s, v7.4s, v0.s[2]\n"
+    "fmla v6.4s, v8.4s, v0.s[2]\n"
+    "fmla v9.4s, v11.4s, v0.s[2]\n"
+    "fmla v10.4s, v15.4s, v0.s[2]\n"
+    "str d16, [x24, %[output_col_stride1]]\n"
+    "fmls v5.4s, v21.4s, v0.s[3]\n"
+    "fmls v6.4s, v22.4s, v0.s[3]\n"
+    "fmls v9.4s, v23.4s, v0.s[3]\n"
+    "fmls v10.4s, v24.4s, v0.s[3]\n"
+    "str d5, [x24, x11]\n"
+    "str d6, [x24, x13]\n"
+    "str d9, [x24, x23]\n"
+    "str d10, [x24, x15]\n"
+    "add x24, x24, #8\n"
+    "3:\n"
+    "cbz %w[n_channels], 4f\n"
+    "ldr s8, [%[inptr0], x20]\n"
+    "mov v14.16b, v8.16b\n"
+    "ldr s2, [%[inptr0], x18]\n"
+    "mov v10.16b, v8.16b\n"
+    "ldr s9, [%[inptr0]]\n"
+    "fmla v14.4s, v9.4s, v0.s[2]\n"
+    "ldr s1, [%[inptr0], x21]\n"
+    "mov v9.16b, v8.16b\n"
+    "ldr s4, [%[inptr0], x19]\n"
+    "mov v7.16b, v8.16b\n"
+    "ldr s12, [%[inptr0], %[input_col_stride1]]\n"
+    "fmls v14.4s, v2.4s, v0.s[3]\n"
+    "ldr s5, [x16, x20]\n"
+    "fmls v10.4s, v12.4s, v0.s[2]\n"
+    "ldr s20, [x16, x18]\n"
+    "fmla v9.4s, v12.4s, v0.s[2]\n"
+    "ldr s3, [x16]\n"
+    "fmls v7.4s, v12.4s, v0.s[1]\n"
+    "ldr s6, [x16, x21]\n"
+    "fmls v10.4s, v2.4s, v0.s[2]\n"
+    "ldr s16, [x16, x19]\n"
+    "fmls v9.4s, v2.4s, v0.s[2]\n"
+    "ldr s22, [x16, %[input_col_stride1]]\n"
+    "fsub v7.4s, v7.4s, v2.4s\n"
+    "ldr s17, [x17, x20]\n"
+    "fadd v10.4s, v10.4s, v4.4s\n"
+    "ldr s15, [x17, x18]\n"
+    "fsub v9.4s, v9.4s, v4.4s\n"
+    "ldr s19, [x17]\n"
+    "fmla v7.4s, v4.4s, v0.s[1]\n"
+    "ldr s18, [x17, x21]\n"
+    "mov v8.16b, v8.16b\n"
+    "ldr s13, [x17, x19]\n"
+    "mov v11.16b, v1.16b\n"
+    "ldr s21, [x17, %[input_col_stride1]]\n"
+    "fmla v8.4s, v12.4s, v0.s[1]\n"
+    "add %[inptr0], %[inptr0], #4\n"
+    "fmla v11.4s, v12.4s, v0.s[2]\n"
+    "add x16, x16, #4\n"
+    "mov v1.16b, v5.16b\n"
+    "add x17, x17, #4\n"
+    "fsub v8.4s, v8.4s, v2.4s\n"
+    "mov v2.16b, v5.16b\n"
+    "fmls v8.4s, v4.4s, v0.s[1]\n"
+    "fmls v11.4s, v4.4s, v0.s[3]\n"
+    "fmla v1.4s, v3.4s, v0.s[2]\n"
+    "fmls v2.4s, v22.4s, v0.s[2]\n"
+    "mov v3.16b, v5.16b\n"
+    "mov v4.16b, v5.16b\n"
+    "mov v5.16b, v5.16b\n"
+    "mov v6.16b, v6.16b\n"
+    "fmls v1.4s, v20.4s, v0.s[3]\n"
+    "fmls v2.4s, v20.4s, v0.s[2]\n"
+    "fmla v3.4s, v22.4s, v0.s[2]\n"
+    "fmls v4.4s, v22.4s, v0.s[1]\n"
+    "fmla v5.4s, v22.4s, v0.s[1]\n"
+    "fmla v6.4s, v22.4s, v0.s[2]\n"
+    "fadd v2.4s, v2.4s, v16.4s\n"
+    "mov v12.16b, v17.16b\n"
+    "fmls v3.4s, v20.4s, v0.s[2]\n"
+    "fsub v4.4s, v4.4s, v20.4s\n"
+    "fmla v4.4s, v16.4s, v0.s[1]\n"
+    "fsub v5.4s, v5.4s, v20.4s\n"
+    "fmls v5.4s, v16.4s, v0.s[1]\n"
+    "fmls v6.4s, v16.4s, v0.s[3]\n"
+    "fsub v3.4s, v3.4s, v16.4s\n"
+    "fmla v12.4s, v19.4s, v0.s[2]\n"
+    "mov v19.16b, v17.16b\n"
+    "mov v20.16b, v17.16b\n"
+    "mov v16.16b, v17.16b\n"
+    "mov v17.16b, v17.16b\n"
+    "fmls v12.4s, v15.4s, v0.s[3]\n"
+    "fmls v19.4s, v21.4s, v0.s[2]\n"
+    "fmla v20.4s, v21.4s, v0.s[2]\n"
+    "fmls v16.4s, v21.4s, v0.s[1]\n"
+    "fmla v17.4s, v21.4s, v0.s[1]\n"
+    "mov v18.16b, v18.16b\n"
+    "fmls v19.4s, v15.4s, v0.s[2]\n"
+    "mov v23.16b, v12.16b\n"
+    "fmls v20.4s, v15.4s, v0.s[2]\n"
+    "fsub v16.4s, v16.4s, v15.4s\n"
+    "fmla v16.4s, v13.4s, v0.s[1]\n"
+    "fsub v17.4s, v17.4s, v15.4s\n"
+    "fadd v19.4s, v19.4s, v13.4s\n"
+    "fmls v17.4s, v13.4s, v0.s[1]\n"
+    "fsub v20.4s, v20.4s, v13.4s\n"
+    "fmla v18.4s, v21.4s, v0.s[2]\n"
+    "fmla v23.4s, v14.4s, v0.s[2]\n"
+    "mov v15.16b, v19.16b\n"
+    "mov v14.16b, v20.16b\n"
+    "mov v24.16b, v16.16b\n"
+    "fmls v18.4s, v13.4s, v0.s[3]\n"
+    "fmla v15.4s, v10.4s, v0.s[2]\n"
+    "fmls v23.4s, v1.4s, v0.s[3]\n"
+    "fmla v14.4s, v9.4s, v0.s[2]\n"
+    "fmla v24.4s, v7.4s, v0.s[2]\n"
+    "mov v10.16b, v17.16b\n"
+    "fmls v15.4s, v2.4s, v0.s[3]\n"
+    "mov v7.16b, v18.16b\n"
+    "str s23, [%[outptr0]]\n"
+    "fmls v14.4s, v3.4s, v0.s[3]\n"
+    "fmls v24.4s, v4.4s, v0.s[3]\n"
+    "fmla v10.4s, v8.4s, v0.s[2]\n"
+    "str s15, [%[outptr0], %[output_col_stride1]]\n"
+    "fmla v7.4s, v11.4s, v0.s[2]\n"
+    "str s14, [%[outptr0], x11]\n"
+    "fmls v10.4s, v5.4s, v0.s[3]\n"
+    "str s24, [%[outptr0], x13]\n"
+    "fmls v7.4s, v6.4s, v0.s[3]\n"
+    "str s10, [%[outptr0], x23]\n"
+    "str s7, [%[outptr0], x15]\n"
+    "add %[outptr0], %[outptr0], #4\n"
+    "mov v26.16b, v12.16b\n"
+    "mov v25.16b, v19.16b\n"
+    "ldr s11, [x25, x20]\n"
+    "mov v10.16b, v11.16b\n"
+    "ldr s23, [x25, x18]\n"
+    "mov v9.16b, v11.16b\n"
+    "ldr s7, [x25]\n"
+    "fmla v10.4s, v7.4s, v0.s[2]\n"
+    "ldr s13, [x25, x21]\n"
+    "mov v7.16b, v11.16b\n"
+    "ldr s31, [x25, x19]\n"
+    "mov v8.16b, v11.16b\n"
+    "ldr s21, [x25, %[input_col_stride1]]\n"
+    "fmls v10.4s, v23.4s, v0.s[3]\n"
+    "ldr s30, [x26, x20]\n"
+    "fmls v9.4s, v21.4s, v0.s[2]\n"
+    "ldr s29, [x26, x18]\n"
+    "fmla v7.4s, v21.4s, v0.s[2]\n"
+    "ldr s22, [x26]\n"
+    "fmls v8.4s, v21.4s, v0.s[1]\n"
+    "ldr s24, [x26, x21]\n"
+    "fmls v9.4s, v23.4s, v0.s[2]\n"
+    "ldr s27, [x26, x19]\n"
+    "fmls v7.4s, v23.4s, v0.s[2]\n"
+    "ldr s28, [x26, %[input_col_stride1]]\n"
+    "fsub v8.4s, v8.4s, v23.4s\n"
+    "add x25, x25, #4\n"
+    "fadd v9.4s, v9.4s, v31.4s\n"
+    "add x26, x26, #4\n"
+    "fsub v7.4s, v7.4s, v31.4s\n"
+    "fmla v8.4s, v31.4s, v0.s[1]\n"
+    "mov v11.16b, v11.16b\n"
+    "mov v15.16b, v13.16b\n"
+    "mov v14.16b, v30.16b\n"
+    "mov v13.16b, v30.16b\n"
+    "fmla v11.4s, v21.4s, v0.s[1]\n"
+    "fmla v15.4s, v21.4s, v0.s[2]\n"
+    "fmla v14.4s, v22.4s, v0.s[2]\n"
+    "fmls v13.4s, v28.4s, v0.s[2]\n"
+    "mov v21.16b, v30.16b\n"
+    "mov v22.16b, v30.16b\n"
+    "fsub v11.4s, v11.4s, v23.4s\n"
+    "fmls v15.4s, v31.4s, v0.s[3]\n"
+    "fmls v11.4s, v31.4s, v0.s[1]\n"
+    "fmls v14.4s, v29.4s, v0.s[3]\n"
+    "fmls v13.4s, v29.4s, v0.s[2]\n"
+    "fmla v21.4s, v28.4s, v0.s[2]\n"
+    "fmls v22.4s, v28.4s, v0.s[1]\n"
+    "mov v23.16b, v30.16b\n"
+    "mov v24.16b, v24.16b\n"
+    "fmls v26.4s, v10.4s, v0.s[2]\n"
+    "fadd v13.4s, v13.4s, v27.4s\n"
+    "fmls v21.4s, v29.4s, v0.s[2]\n"
+    "fsub v22.4s, v22.4s, v29.4s\n"
+    "fmla v23.4s, v28.4s, v0.s[1]\n"
+    "fmla v22.4s, v27.4s, v0.s[1]\n"
+    "fmla v24.4s, v28.4s, v0.s[2]\n"
+    "fsub v21.4s, v21.4s, v27.4s\n"
+    "fmls v26.4s, v1.4s, v0.s[2]\n"
+    "fsub v23.4s, v23.4s, v29.4s\n"
+    "fmls v25.4s, v9.4s, v0.s[2]\n"
+    "fmls v23.4s, v27.4s, v0.s[1]\n"
+    "fmls v24.4s, v27.4s, v0.s[3]\n"
+    "fadd v26.4s, v26.4s, v14.4s\n"
+    "mov v27.16b, v20.16b\n"
+    "str s26, [x28]\n"
+    "fmls v25.4s, v2.4s, v0.s[2]\n"
+    "fmls v27.4s, v7.4s, v0.s[2]\n"
+    "mov v31.16b, v16.16b\n"
+    "mov v30.16b, v17.16b\n"
+    "mov v29.16b, v18.16b\n"
+    "fadd v25.4s, v25.4s, v13.4s\n"
+    "fmls v31.4s, v8.4s, v0.s[2]\n"
+    "str s25, [x28, %[output_col_stride1]]\n"
+    "fmls v27.4s, v3.4s, v0.s[2]\n"
+    "fmls v30.4s, v11.4s, v0.s[2]\n"
+    "fmls v29.4s, v15.4s, v0.s[2]\n"
+    "fmls v31.4s, v4.4s, v0.s[2]\n"
+    "mov v26.16b, v12.16b\n"
+    "fadd v27.4s, v27.4s, v21.4s\n"
+    "mov v25.16b, v19.16b\n"
+    "str s27, [x28, x11]\n"
+    "fmls v30.4s, v5.4s, v0.s[2]\n"
+    "fadd v31.4s, v31.4s, v22.4s\n"
+    "fmls v29.4s, v6.4s, v0.s[2]\n"
+    "str s31, [x28, x13]\n"
+    "fmla v26.4s, v10.4s, v0.s[2]\n"
+    "fadd v30.4s, v30.4s, v23.4s\n"
+    "fmla v25.4s, v9.4s, v0.s[2]\n"
+    "str s30, [x28, x23]\n"
+    "fadd v29.4s, v29.4s, v24.4s\n"
+    "str s29, [x28, x15]\n"
+    "fmls v26.4s, v1.4s, v0.s[2]\n"
+    "fmls v25.4s, v2.4s, v0.s[2]\n"
+    "add x28, x28, #4\n"
+    "mov v30.16b, v20.16b\n"
+    "mov v29.16b, v16.16b\n"
+    "fsub v26.4s, v26.4s, v14.4s\n"
+    "mov v28.16b, v17.16b\n"
+    "str s26, [x22]\n"
+    "fsub v25.4s, v25.4s, v13.4s\n"
+    "str s25, [x22, %[output_col_stride1]]\n"
+    "fmla v30.4s, v7.4s, v0.s[2]\n"
+    "fmla v29.4s, v8.4s, v0.s[2]\n"
+    "fmla v28.4s, v11.4s, v0.s[2]\n"
+    "mov v26.16b, v18.16b\n"
+    "mov v25.16b, v12.16b\n"
+    "fmls v30.4s, v3.4s, v0.s[2]\n"
+    "mov v31.16b, v19.16b\n"
+    "fmls v29.4s, v4.4s, v0.s[2]\n"
+    "fmls v28.4s, v5.4s, v0.s[2]\n"
+    "fmla v26.4s, v15.4s, v0.s[2]\n"
+    "fmls v25.4s, v10.4s, v0.s[1]\n"
+    "fsub v30.4s, v30.4s, v21.4s\n"
+    "fmls v31.4s, v9.4s, v0.s[1]\n"
+    "str s30, [x22, x11]\n"
+    "fsub v29.4s, v29.4s, v22.4s\n"
+    "str s29, [x22, x13]\n"
+    "fsub v28.4s, v28.4s, v23.4s\n"
+    "str s28, [x22, x23]\n"
+    "fmls v26.4s, v6.4s, v0.s[2]\n"
+    "fsub v25.4s, v25.4s, v1.4s\n"
+    "fsub v31.4s, v31.4s, v2.4s\n"
+    "fmla v25.4s, v14.4s, v0.s[1]\n"
+    "fmla v31.4s, v13.4s, v0.s[1]\n"
+    "fsub v26.4s, v26.4s, v24.4s\n"
+    "mov v27.16b, v20.16b\n"
+    "str s26, [x22, x15]\n"
+    "mov v26.16b, v16.16b\n"
+    "str s25, [x12]\n"
+    "fmls v27.4s, v7.4s, v0.s[1]\n"
+    "str s31, [x12, %[output_col_stride1]]\n"
+    "fmls v26.4s, v8.4s, v0.s[1]\n"
+    "mov v25.16b, v17.16b\n"
+    "add x22, x22, #4\n"
+    "fsub v27.4s, v27.4s, v3.4s\n"
+    "mov v28.16b, v18.16b\n"
+    "fmla v27.4s, v21.4s, v0.s[1]\n"
+    "fsub v26.4s, v26.4s, v4.4s\n"
+    "fmla v26.4s, v22.4s, v0.s[1]\n"
+    "fmls v25.4s, v11.4s, v0.s[1]\n"
+    "fmls v28.4s, v15.4s, v0.s[1]\n"
+    "mov v12.16b, v12.16b\n"
+    "str s27, [x12, x11]\n"
+    "mov v19.16b, v19.16b\n"
+    "str s26, [x12, x13]\n"
+    "fsub v25.4s, v25.4s, v5.4s\n"
+    "fmla v25.4s, v23.4s, v0.s[1]\n"
+    "fsub v28.4s, v28.4s, v6.4s\n"
+    "fmla v28.4s, v24.4s, v0.s[1]\n"
+    "fmla v12.4s, v10.4s, v0.s[1]\n"
+    "fmla v19.4s, v9.4s, v0.s[1]\n"
+    "mov v20.16b, v20.16b\n"
+    "str s25, [x12, x23]\n"
+    "mov v16.16b, v16.16b\n"
+    "str s28, [x12, x15]\n"
+    "fsub v12.4s, v12.4s, v1.4s\n"
+    "fmls v12.4s, v14.4s, v0.s[1]\n"
+    "add x12, x12, #4\n"
+    "fsub v19.4s, v19.4s, v2.4s\n"
+    "fmla v20.4s, v7.4s, v0.s[1]\n"
+    "fmls v19.4s, v13.4s, v0.s[1]\n"
+    "fmla v16.4s, v8.4s, v0.s[1]\n"
+    "str s12, [x14]\n"
+    "mov v1.16b, v17.16b\n"
+    "fsub v20.4s, v20.4s, v3.4s\n"
+    "mov v17.16b, v18.16b\n"
+    "str s19, [x14, %[output_col_stride1]]\n"
+    "fmls v20.4s, v21.4s, v0.s[1]\n"
+    "fsub v16.4s, v16.4s, v4.4s\n"
+    "fmla v1.4s, v11.4s, v0.s[1]\n"
+    "fmls v16.4s, v22.4s, v0.s[1]\n"
+    "fmla v17.4s, v15.4s, v0.s[1]\n"
+    "str s20, [x14, x11]\n"
+    "fsub v1.4s, v1.4s, v5.4s\n"
+    "str s16, [x14, x13]\n"
+    "fmls v1.4s, v23.4s, v0.s[1]\n"
+    "fsub v17.4s, v17.4s, v6.4s\n"
+    "fmls v17.4s, v24.4s, v0.s[1]\n"
+    "str s1, [x14, x23]\n"
+    "str s17, [x14, x15]\n"
+    "add x14, x14, #4\n"
+    "ldr s2, [x27, x20]\n"
+    "mov v4.16b, v2.16b\n"
+    "ldr s17, [x27, x18]\n"
+    "mov v12.16b, v2.16b\n"
+    "ldr s18, [x27]\n"
+    "fmla v4.4s, v18.4s, v0.s[2]\n"
+    "ldr s3, [x27, x21]\n"
+    "mov v6.16b, v2.16b\n"
+    "ldr s5, [x27, x19]\n"
+    "mov v1.16b, v2.16b\n"
+    "ldr s18, [x27, %[input_col_stride1]]\n"
+    "fmls v4.4s, v17.4s, v0.s[3]\n"
+    "add x27, x27, #4\n"
+    "fmls v12.4s, v18.4s, v0.s[2]\n"
+    "fmla v6.4s, v18.4s, v0.s[2]\n"
+    "fmls v1.4s, v18.4s, v0.s[1]\n"
+    "mov v2.16b, v2.16b\n"
+    "mov v3.16b, v3.16b\n"
+    "mov v4.16b, v4.16b\n"
+    "fmls v12.4s, v17.4s, v0.s[2]\n"
+    "fmls v6.4s, v17.4s, v0.s[2]\n"
+    "fsub v1.4s, v1.4s, v17.4s\n"
+    "fmla v2.4s, v18.4s, v0.s[1]\n"
+    "fmla v1.4s, v5.4s, v0.s[1]\n"
+    "fmla v3.4s, v18.4s, v0.s[2]\n"
+    "fadd v12.4s, v12.4s, v5.4s\n"
+    "fsub v6.4s, v6.4s, v5.4s\n"
+    "fsub v2.4s, v2.4s, v17.4s\n"
+    "fmla v4.4s, v10.4s, v0.s[2]\n"
+    "fmls v2.4s, v5.4s, v0.s[1]\n"
+    "fmls v3.4s, v5.4s, v0.s[3]\n"
+    "mov v16.16b, v12.16b\n"
+    "mov v5.16b, v6.16b\n"
+    "fmls v4.4s, v14.4s, v0.s[3]\n"
+    "mov v6.16b, v1.16b\n"
+    "fmla v16.4s, v9.4s, v0.s[2]\n"
+    "fmla v5.4s, v7.4s, v0.s[2]\n"
+    "fmla v6.4s, v8.4s, v0.s[2]\n"
+    "mov v9.16b, v2.16b\n"
+    "str s4, [x24]\n"
+    "mov v10.16b, v3.16b\n"
+    "fmls v16.4s, v13.4s, v0.s[3]\n"
+    "fmls v5.4s, v21.4s, v0.s[3]\n"
+    "fmls v6.4s, v22.4s, v0.s[3]\n"
+    "fmla v9.4s, v11.4s, v0.s[2]\n"
+    "fmla v10.4s, v15.4s, v0.s[2]\n"
+    "str s16, [x24, %[output_col_stride1]]\n"
+    "str s5, [x24, x11]\n"
+    "fmls v9.4s, v23.4s, v0.s[3]\n"
+    "str s6, [x24, x13]\n"
+    "fmls v10.4s, v24.4s, v0.s[3]\n"
+    "str s9, [x24, x23]\n"
+    "str s10, [x24, x15]\n"
+    "add x24, x24, #4\n"
+    "4:\n"
+    : [outptr0] "+r" (matrix_base),
+      [n_channels] "+r" (n_channels),
+      [inptr0] "+r" (input_base)
+    : [pcoeffs] "r" (pcoeffs),
+      [output_row_stride] "r" (6 * matrix_stride * sizeof(float)),
+      [output_col_stride1] "r" (matrix_stride * sizeof(float)),
+      [input_row_stride] "r" (input_row_stride * sizeof(float)),
+      [input_col_stride1] "r" (input_col_stride * sizeof(float))
+    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
+      "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
+      "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8",
+      "v9", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19",
+      "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+  );
+}
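+
+// The inline assembly above is a hand-scheduled aarch64 variant of the 6x6
+// input-tile transform that the intrinsics code below implements for 32-bit
+// builds.  The instructions just before label "4:" form the scalar tail:
+// leftover channels are processed one at a time with single-lane loads and
+// stores ("ldr s"/"str s") and 4-byte pointer bumps.  The clobber list must
+// name every register the block writes (v0-v31, x11-x28, plus "cc" and
+// "memory") so the compiler does not keep values live across the asm.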
+
+#else  // !__aarch64__: generic NEON-intrinsics path (e.g. 32-bit __arm__)
+
+template <>
+void InputTransform<6, 6, float, float, WinogradRoots::Integers>::transform_tile(
+  const int n_channels,
+  const float* const input_base,
+  const int input_row_stride,
+  const int input_col_stride,
+  float* outptr,
+  const int matrix_stride
+)
+{
+  constexpr int inner_tile_rows = 6;
+  constexpr int inner_tile_cols = 6;
+
+  // Get pointers into the input tile
+  const float *x_ptrs[inner_tile_rows][inner_tile_cols];
+  for (int i = 0, xi = 0; i < inner_tile_rows; i++, xi++)
+  {
+    // Get a pointer into the row
+    const float* const row_ptr = input_base + xi*input_row_stride;
+
+    for (int j = 0, xj = 0; j < inner_tile_cols; j++, xj++)
+    {
+      x_ptrs[i][j] = row_ptr + xj*input_col_stride;
+    }
+  }
+
+  // Matrices used/computed in this kernel.
+  float x[inner_tile_rows][inner_tile_cols];
+  float XTx[inner_tile_rows][inner_tile_cols];
+  float U[inner_tile_rows][inner_tile_cols];
+  for (int i = 0; i < inner_tile_rows; i++)
+  {
+    for (int j = 0; j < inner_tile_cols; j++)
+    {
+      x[i][j] = XTx[i][j] = 0.0f;
+    }
+  }
+
+  // Perform the Winograd input transformation for each channel in the input
+  // tensor.
+  int channels_remaining = n_channels;
+  for (; channels_remaining >= 2; channels_remaining -= 2)
+  {
+    // Matrices used/computed in this kernel; these vector-valued locals
+    // shadow the scalar arrays declared at function scope above.
+    float32x2_t x[inner_tile_rows][inner_tile_cols];
+    float32x2_t XTx[inner_tile_rows][inner_tile_cols];
+    float32x2_t U[inner_tile_rows][inner_tile_cols];
+    for (int i = 0; i < inner_tile_rows; i++)
+    {
+      for (int j = 0; j < inner_tile_cols; j++)
+      {
+        x[i][j] = vdup_n_f32(0.0f);
+        XTx[i][j] = vdup_n_f32(0.0f);
+      }
+    }
+
+    // Read a 6x6 tile of the input (spatial domain)
+    for (int i = 0; i < inner_tile_rows; i++)
+    {
+      for (int j = 0; j < inner_tile_cols; j++)
+      {
+        x[i][j] = vld1_f32(x_ptrs[i][j]);
+        x_ptrs[i][j] += 2;
+      }
+    }
+
+    // Compute XT . x
+    for (int j = 0; j < inner_tile_cols; j++)
+    {
+      // XTx[0][j] =  4*x[0][j] + -5*x[2][j] +  1*x[4][j];
+      XTx[0][j] = vmls_n_f32(vmla_n_f32(x[4][j], x[0][j], 4.0f), x[2][j], 5.0f);
+
+      // XTx[1][j] = -4*x[1][j] + -4*x[2][j] +  1*x[3][j] +  1*x[4][j];
+      XTx[1][j] = vmls_n_f32(vadd_f32(x[3][j], x[4][j]), vadd_f32(x[1][j], x[2][j]), 4.0f);
+
+      // XTx[2][j] =  4*x[1][j] + -4*x[2][j] + -1*x[3][j] +  1*x[4][j];
+      XTx[2][j] = vmla_n_f32(vsub_f32(x[4][j], x[3][j]), vsub_f32(x[1][j], x[2][j]), 4.0f);
+
+      // XTx[3][j] = -2*x[1][j] + -1*x[2][j] +  2*x[3][j] +  1*x[4][j];
+      XTx[3][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[3][j], x[1][j]), 2.0f);
+
+      // XTx[4][j] =  2*x[1][j] + -1*x[2][j] + -2*x[3][j] +  1*x[4][j];
+      XTx[4][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[1][j], x[3][j]), 2.0f);
+
+      // XTx[5][j] =  4*x[1][j] + -5*x[3][j] +  1*x[5][j];
+      XTx[5][j] = vmls_n_f32(vmla_n_f32(x[5][j], x[1][j], 4.0f), x[3][j], 5.0f);
+    }
+
+    // Compute U = XT . x . X
+    for (int i = 0; i < inner_tile_rows; i++)
+    {
+      // U[i][0] =  4*XTx[i][0] + -5*XTx[i][2] +  1*XTx[i][4];
+      U[i][0] = vmls_n_f32(vmla_n_f32(XTx[i][4], XTx[i][0], 4.0f), XTx[i][2], 5.0f);
+
+      // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] +  1*XTx[i][3] +  1*XTx[i][4];
+      U[i][1] = vmls_n_f32(vadd_f32(XTx[i][3], XTx[i][4]), vadd_f32(XTx[i][1], XTx[i][2]), 4.0f);
+
+      // U[i][2] =  4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] +  1*XTx[i][4];
+      U[i][2] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][3]), vsub_f32(XTx[i][1], XTx[i][2]), 4.0f);
+
+      // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] +  2*XTx[i][3] +  1*XTx[i][4];
+      U[i][3] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][3], XTx[i][1]), 2.0f);
+
+      // U[i][4] =  2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] +  1*XTx[i][4];
+      U[i][4] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][1], XTx[i][3]), 2.0f);
+
+      // U[i][5] =  4*XTx[i][1] + -5*XTx[i][3] +  1*XTx[i][5];
+      U[i][5] = vmls_n_f32(vmla_n_f32(XTx[i][5], XTx[i][1], 4.0f), XTx[i][3], 5.0f);
+    }
+
+    // Store the transformed matrix
+    for (int i = 0, m = 0; i < inner_tile_rows; i++)
+    {
+      for (int j = 0; j < inner_tile_cols; j++, m++)
+      {
+        vst1_f32(outptr + m*matrix_stride, U[i][j]);
+      }
+    }
+    outptr += 2;
+  }
+  for (; channels_remaining; channels_remaining--)
+  {
+    // Load x
+    for (int i = 0; i < inner_tile_rows; i++)
+    {
+      for (int j = 0; j < inner_tile_cols; j++)
+      {
+        x[i][j] = *(x_ptrs[i][j]++);
+      }
+    }
+
+    // Compute XT . x
+    for (int j = 0; j < inner_tile_cols; j++)
+    {
+      XTx[0][j] =  4*x[0][j] + -5*x[2][j] +  1*x[4][j];
+      XTx[1][j] = -4*x[1][j] + -4*x[2][j] +  1*x[3][j] +  1*x[4][j];
+      XTx[2][j] =  4*x[1][j] + -4*x[2][j] + -1*x[3][j] +  1*x[4][j];
+      XTx[3][j] = -2*x[1][j] + -1*x[2][j] +  2*x[3][j] +  1*x[4][j];
+      XTx[4][j] =  2*x[1][j] + -1*x[2][j] + -2*x[3][j] +  1*x[4][j];
+      XTx[5][j] =  4*x[1][j] + -5*x[3][j] +  1*x[5][j];
+    }
+
+    // Compute U = XT . x . X
+    for (int i = 0; i < inner_tile_rows; i++)
+    {
+      U[i][0] =  4*XTx[i][0] + -5*XTx[i][2] +  1*XTx[i][4];
+      U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] +  1*XTx[i][3] +  1*XTx[i][4];
+      U[i][2] =  4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] +  1*XTx[i][4];
+      U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] +  2*XTx[i][3] +  1*XTx[i][4];
+      U[i][4] =  2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] +  1*XTx[i][4];
+      U[i][5] =  4*XTx[i][1] + -5*XTx[i][3] +  1*XTx[i][5];
+    }
+
+    // Store the transformed matrix
+    for (int i = 0, m = 0; i < inner_tile_rows; i++)
+    {
+      for (int j = 0; j < inner_tile_cols; j++, m++)
+      {
+        *(outptr + m*matrix_stride) = U[i][j];
+      }
+    }
+    outptr++;
+  }
+}
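+
+// In matrix form, the loops above compute U = B^T x B, the canonical
+// F(4x4, 3x3) Winograd input transform for the usual integer interpolation
+// points {0, +1, -1, +2, -2}:
+//
+//         [ 4   0  -5   0   1   0 ]
+//         [ 0  -4  -4   1   1   0 ]
+//   B^T = [ 0   4  -4  -1   1   0 ]
+//         [ 0  -2  -1   2   1   0 ]
+//         [ 0   2  -1  -2   1   0 ]
+//         [ 0   4   0  -5   0   1 ]
+//
+// Each "XTx[r][j] = ..." statement is row r of B^T applied down column j of
+// the input tile, and the U loop applies the same coefficients along each
+// row of XTx (i.e. U = (B^T x) B).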
+
+#endif  // __aarch64__
+
+template class InputTransform<6, 6, float, float, WinogradRoots::Integers>;
+
+}  // namespace winograd
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/kernel.hpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/kernel.hpp
new file mode 100644
index 0000000..e45f186
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/kernel.hpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+#include "winograd.hpp"
+using namespace winograd;
+
+#define MEMBERFN(RTYPE) template <\
+  int KernelRows, int KernelCols, int InnerTileRows, int InnerTileCols, typename TIn, typename TOut, WinogradRoots Roots\
+> RTYPE WeightTransform<KernelRows, KernelCols, InnerTileRows, InnerTileCols, TIn, TOut, Roots>
+
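+// For reference, a line such as
+//
+//   MEMBERFN(void)::set_weight_tensor(const void * const weights)
+//
+// expands to the ordinary out-of-line member definition
+//
+//   template <int KernelRows, int KernelCols, int InnerTileRows,
+//             int InnerTileCols, typename TIn, typename TOut,
+//             WinogradRoots Roots>
+//   void WeightTransform<KernelRows, KernelCols, InnerTileRows,
+//                        InnerTileCols, TIn, TOut, Roots>::
+//   set_weight_tensor(const void * const weights)
+//
+// so the macro exists purely to abbreviate the template prelude.
+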
+MEMBERFN()::WeightTransform(
+  const int n_output_channels,
+  const int n_input_channels
+) : _n_output_channels(n_output_channels), _n_input_channels(n_input_channels),
+    _matrices(nullptr), _matrix_stride(0), _matrix_row_stride(0), _weights(nullptr)
+{
+
+}
+
+MEMBERFN(void)::set_weight_tensor(const void * const weights)
+{
+  _weights = static_cast<const TIn *>(weights);
+}
+
+MEMBERFN(void)::set_output_matrices(void * const mptr, const int ldmatrix, const int ldrow)
+{
+  _matrices = static_cast<TOut *>(mptr);
+  _matrix_stride = ldmatrix;
+  _matrix_row_stride = ldrow;
+}
+
+MEMBERFN(size_t)::get_working_space_size(unsigned int) const
+{
+  return 0;
+}
+
+MEMBERFN(void)::set_working_space(void *)
+{
+}
+
+MEMBERFN(unsigned int)::get_window(void) const
+{
+  // TODO When the weights transform supports multithreading, return the number
+  // of output channels. For now we return 1 to indicate that the weights must
+  // be transformed as a single block.
+  // return _n_output_channels;
+  return 1;
+}
+
+MEMBERFN(void)::run(const unsigned int, const unsigned int, unsigned int)
+{
+  execute(
+    _n_output_channels, _n_input_channels, _weights,
+    _matrices, _matrix_stride, _matrix_row_stride
+  );
+}
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output.hpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output.hpp
new file mode 100644
index 0000000..d97af21
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output.hpp
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include <algorithm>
+#include "winograd.hpp"
+#include "padding.hpp"
+#include "utils.hpp"
+
+#define MEMBERFN(RTYPE) template<\
+  int KernelRows, int KernelCols, int InnerTileRows, int InnerTileCols,\
+  typename TIn, typename TOut, WinogradRoots Roots\
+> RTYPE OutputTransform<KernelRows, KernelCols, InnerTileRows, InnerTileCols, TIn, TOut, Roots>
+
+#define Nx1MEMBERFN(RTYPE) template<\
+  int KernelRows, int InnerTileRows, typename TIn, typename TOut, WinogradRoots Roots\
+> RTYPE OutputTransform<KernelRows, 1, InnerTileRows, 1, TIn, TOut, Roots>
+
+namespace winograd
+{
+
+MEMBERFN()::OutputTransform(
+  const int n_batches,
+  const int n_rows,
+  const int n_cols,
+  const int n_channels
+) : _n_batches(n_batches), _n_rows(n_rows), _n_cols(n_cols), _n_channels(n_channels),
+    _matrix_base(nullptr),
+    _biases(nullptr),
+    _matrix_stride(0), _matrix_row_stride(0), _matrix_batch_stride(0),
+    _outptr(nullptr),
+    _tiles_M(iceildiv(n_rows, output_tile_rows)),
+    _tiles_N(iceildiv(n_cols, output_tile_cols)),
+    _out_col_stride(0), _out_row_stride(0), _out_batch_stride(0),
+    _working_space_col_stride(n_channels),
+    _working_space_row_stride(output_tile_cols * _working_space_col_stride),
+    _working_space(nullptr)
+{
+}
+
+MEMBERFN(void)::set_input_matrices(const void * const mptr, const int ldmatrix, const int ldrow)
+{
+  _matrix_base = static_cast<const TIn *>(mptr);
+  _matrix_stride = ldmatrix;
+  _matrix_row_stride = ldrow;
+  _matrix_batch_stride = _tiles_M * _tiles_N * ldrow;
+}
+
+MEMBERFN(void)::set_bias(const void * const bias)
+{
+  _biases = static_cast<const TOut *>(bias);
+}
+
+MEMBERFN(void)::set_output_tensor(void * const outptr)
+{
+  set_output_tensor(outptr, _n_channels);
+}
+
+MEMBERFN(void)::set_output_tensor(void * const outptr, const int ldcol)
+{
+  set_output_tensor(outptr, _n_cols * ldcol, ldcol);
+}
+
+MEMBERFN(void)::set_output_tensor(void * const outptr, const int ldrow, const int ldcol)
+{
+  set_output_tensor(outptr, _n_rows * ldrow, ldrow, ldcol);
+}
+
+MEMBERFN(void)::set_output_tensor(void * const outptr, const int ldbatch, const int ldrow, const int ldcol)
+{
+  _outptr = static_cast<TOut *>(outptr);
+  _out_batch_stride = ldbatch;
+  _out_row_stride = ldrow;
+  _out_col_stride = ldcol;
+}
+
+Nx1MEMBERFN()::OutputTransform(
+  const int n_batches,
+  const int n_rows,
+  const int n_cols,
+  const int n_channels
+) : OutputTransform<1, KernelRows, 1, InnerTileRows, TIn, TOut, Roots>::OutputTransform(
+    n_batches, n_cols, n_rows, n_channels /* Transpose rows and columns */
+  )
+{
+}
+
+Nx1MEMBERFN(void)::set_output_tensor(void * const outptr)
+{
+  set_output_tensor(outptr, this->_n_channels);
+}
+
+Nx1MEMBERFN(void)::set_output_tensor(void * const outptr, const int ldcol)
+{
+  set_output_tensor(outptr, this->_n_cols * ldcol, ldcol);
+}
+
+Nx1MEMBERFN(void)::set_output_tensor(void * const outptr, const int ldrow, const int ldcol)
+{
+  set_output_tensor(outptr, this->_n_rows * ldrow, ldrow, ldcol);
+}
+
+Nx1MEMBERFN(void)::set_output_tensor(void * const outptr, const int ldbatch, const int ldrow, const int ldcol)
+{
+  // Transpose rows and columns
+  Base::set_output_tensor(outptr, ldbatch, ldcol, ldrow);
+}
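+
+// The Nx1 (column-kernel) specialisation reuses the 1xN row-kernel
+// implementation on a transposed view of the problem: its constructor swaps
+// n_rows and n_cols, and the overload above swaps the row and column
+// strides, so no separate kernels are required for vertical kernels.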
+
+MEMBERFN(size_t)::get_working_space_size(const unsigned int nthreads) const
+{
+  return sizeof(TOut) * output_tile_rows * _working_space_row_stride * nthreads;
+}
+
+MEMBERFN(void)::set_working_space(void * const buffer)
+{
+  _working_space = static_cast<TOut *>(buffer);
+}
+
+MEMBERFN(unsigned int)::get_window(void) const
+{
+  return iceildiv(_n_channels, WINDOW_BLOCK);
+}
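+
+// A minimal sketch of how a scheduler might drive this window (hypothetical
+// caller, not part of this file): each thread is given a half-open sub-range
+// of [0, get_window()) plus its own thread id, which run() maps back to a
+// channel range and a private working-space slot.
+//
+//   const unsigned int window = transform.get_window();
+//   for (unsigned int t = 0; t < nthreads; t++)
+//   {
+//     const unsigned int start = (t * window) / nthreads;
+//     const unsigned int stop  = ((t + 1) * window) / nthreads;
+//     transform.run(start, stop, t);
+//   }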
+
+MEMBERFN(void)::run(
+  const unsigned int start,
+  const unsigned int stop,
+  const unsigned int threadid
+)
+{
+  // Determine the channels on which to work
+  if (start >= get_window())
+  {
+    return;  // No work to do beyond the end of the window
+  }
+  const unsigned int start_channel = start * WINDOW_BLOCK;
+  const unsigned int stop_channel = std::min<unsigned int>(_n_channels, stop * WINDOW_BLOCK);
+  const unsigned int n_channels = stop_channel - start_channel;
+
+  const auto matrix_tile_col_stride = _matrix_row_stride;
+  const auto matrix_tile_row_stride = _tiles_N * matrix_tile_col_stride;
+
+  const TOut* const bptr = (_biases == nullptr) ? nullptr : _biases + start_channel;
+
+  // Loop over batches
+  for (int batch = 0; batch < _n_batches; batch++)
+  {
+    const TIn* const matrix_batch = _matrix_base + start_channel + batch * _matrix_batch_stride;
+    TOut* const outptr_batch = _outptr + start_channel + batch * _out_batch_stride;
+
+    for (int tile_i = 0; tile_i < _tiles_M; tile_i++)
+    {
+      // Compute properties of the row of output tiles
+      const int row_pad_bottom = std::max(0, (tile_i + 1)*output_tile_rows - _n_rows);
+      const TIn* const matrix_tile_row = matrix_batch + tile_i * matrix_tile_row_stride;
+      TOut* const outptr_row = outptr_batch + tile_i * output_tile_rows * _out_row_stride;
+
+      for (int tile_j = 0; tile_j < _tiles_N; tile_j++)
+      {
+        // Compute the properties of this specific tile
+        const int tile_pad_right = std::max(0, (tile_j + 1)*output_tile_cols - _n_cols);
+        const TIn* const matrix_tile = matrix_tile_row + tile_j * matrix_tile_col_stride;
+        TOut* const outptr_tile = outptr_row + tile_j * output_tile_cols * _out_col_stride;
+
+        // Perform the transformation
+        if (row_pad_bottom || tile_pad_right)
+        {
+          transform_cropped_tile(
+            threadid, n_channels, outptr_tile, matrix_tile, bptr,
+            row_pad_bottom, tile_pad_right
+          );
+        }
+        else
+        {
+          transform_uncropped_tile(
+            threadid, n_channels, outptr_tile, matrix_tile, bptr
+          );
+        }
+      }
+    }
+  }
+}
+
+MEMBERFN(void)::transform_uncropped_tile(
+  const unsigned int /* threadid unused */,
+  const int n_channels,
+  TOut * const outptr,
+  const TIn * const inptr,
+  const TOut * const biases
+)
+{
+  transform_tile(
+    n_channels, inptr, _matrix_stride, biases,
+    outptr, _out_row_stride, _out_col_stride
+  );
+}
+
+MEMBERFN(void)::transform_cropped_tile(
+  const unsigned int threadid,
+  const int n_channels,
+  TOut * const outptr,
+  const TIn * const inptr,
+  const TOut * const biases,
+  const int pad_bottom,
+  const int pad_right
+)
+{
+  // Transform into working space and then copy the relevant section out.
+  TOut *wsptr = static_cast<TOut *>(get_working_space(threadid));
+  transform_tile(
+    n_channels, inptr, _matrix_stride, biases,
+    wsptr, _working_space_row_stride, _working_space_col_stride
+  );
+
+  padding::crop_and_copy_tile(
+    output_tile_rows, output_tile_cols, n_channels,
+    wsptr, _working_space_row_stride, _working_space_col_stride,
+    outptr, _out_row_stride, _out_col_stride,
+    0u, 0u, pad_bottom, pad_right
+  );
+}
+
+MEMBERFN(void *)::get_working_space(const unsigned int threadid) const
+{
+  return _working_space + output_tile_rows * _working_space_row_stride * threadid;
+}
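+
+// Working space is sized as one output-tile-sized scratch buffer per thread
+// (see get_working_space_size above); get_working_space simply offsets into
+// that allocation by threadid, so concurrent cropped-tile writes never
+// alias.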
+
+}  // namespace winograd
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_2_7_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2_7_fp32_fp32_integers.cpp
similarity index 71%
rename from src/core/NEON/kernels/convolution/winograd/transforms/output_2_7_fp32.cpp
rename to src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2_7_fp32_fp32_integers.cpp
index ea842a4..c32d7f2 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/output_2_7_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2_7_fp32_fp32_integers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,43 +22,29 @@
  * SOFTWARE.
  */
 
-#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+#include "arm.hpp"
+#include "output.hpp"
 
-namespace
+namespace winograd
 {
 
-template <bool Specialized, int PadRight=0>
-void winograd_output_transform_2_7_fp32_process_tile(
+template <>
+void OutputTransform<1, 7, 1, 8, float, float, WinogradRoots::Integers>::transform_tile(
   const int n_channels,
-  const float* const matrix_base,
+  const float* inptr,
   const int matrix_stride,
-  const float* const biases,
+  const float* bptr,
   float* const output,
-  const int output_row_stride,
-  const int output_col_stride,
-  const int _pad_bottom,
-  const int _pad_right
+  const int,  // No need to stride across rows
+  const int output_col_stride
 )
 {
-  (void) output_row_stride;
-  (void) _pad_bottom;
-  constexpr int output_tile_cols = 2;
-  constexpr int inner_tile_cols = 8;
-
-  const int pad_right = Specialized ? PadRight : _pad_right;
-  const int cells_j = output_tile_cols - pad_right;
-
-
   // Construct a map to the output cells
-  float *outptrs[cells_j];
-  for (int j = 0; j < cells_j; j++)
+  float *outptrs[output_tile_cols];
+  for (int j = 0; j < output_tile_cols; j++)
   {
     outptrs[j] = output + j*output_col_stride;
   }
-  const float *inptr = matrix_base;
-  const float *bptr = biases;
 
   // For each channel of the output
   int channels_remaining = n_channels;
@@ -84,7 +70,7 @@
       b = vld1q_f32(bptr);
       bptr += 4;
     }
-    for (int j = 0; j < cells_j; j++)
+    for (int j = 0; j < output_tile_cols; j++)
     {
       vst1q_f32(outptrs[j], f[j] + b);
       outptrs[j] += 4;
@@ -111,7 +97,7 @@
       b = vld1_f32(bptr);
       bptr += 2;
     }
-    for (int j = 0; j < cells_j; j++)
+    for (int j = 0; j < output_tile_cols; j++)
     {
       vst1_f32(outptrs[j], f[j] + b);
       outptrs[j] += 2;
@@ -138,26 +124,14 @@
     {
       b = *(bptr++);
     }
-    for (int j = 0; j < cells_j; j++)
+    for (int j = 0; j < output_tile_cols; j++)
     {
       *(outptrs[j]++) = f[j] + b;
     }
   }
 }
-}  // namespace (anonymous)
 
-namespace winograd
-{
-using Tiles = OutputTransformImplTiles<1, 7, 1, 8, float>;
+template class OutputTransform<1, 7, 1, 8, float, float, WinogradRoots::Integers>;
+template class OutputTransform<7, 1, 8, 1, float, float, WinogradRoots::Integers>;
 
-template <>
-const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_2_7_fp32_process_tile<true>;
-
-template <>
-const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = {
-  winograd_output_transform_2_7_fp32_process_tile<true, 1>
-};
-
-template class OutputTransform<1, 7, 1, 8, float>;
-template class OutputTransform<7, 1, 8, 1, float>;
 }  // namespace winograd
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_3x3_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_3x3_fp32_fp32_integers.cpp
new file mode 100644
index 0000000..d6ebf44
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_3x3_fp32_fp32_integers.cpp
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm.hpp"
+#include "output.hpp"
+
+namespace winograd
+{
+
+template <>
+void OutputTransform<3, 3, 4, 4, float, float, WinogradRoots::Integers>::transform_tile(
+  const int n_channels,
+  const float* inptr,
+  const int matrix_stride,
+  const float* bptr,
+  float* const output,
+  const int output_row_stride,
+  const int output_col_stride
+)
+{
+  // Construct a map to the output cells
+  float *outptrs[output_tile_rows][output_tile_cols];
+  for (int i = 0; i < output_tile_rows; i++)
+  {
+    for (int j = 0; j < output_tile_cols; j++)
+    {
+      outptrs[i][j] = output + i*output_row_stride + j*output_col_stride;
+    }
+  }
+
+  // For each channel of the output
+  int channels_remaining = n_channels;
+#ifdef __aarch64__
+  for (; channels_remaining >= 4; channels_remaining -= 4)
+  {
+    // Matrices used and computed during this transform
+    float32x4_t F[4][4], FZ[4][2], f[2][2], b;
+
+    // Read a 4x4 tile in the Winograd domain
+    for (int i = 0, m = 0; i < 4; i++)
+    {
+      for (int j = 0; j < 4; j++, m++)
+      {
+        F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+      }
+    }
+    inptr += 4;
+
+    // Compute the matrix F Z
+    for (int i = 0; i < 4; i++)
+    {
+      // FZ[i][0] =  F[i][0] + F[i][1] + F[i][2];
+      FZ[i][0] = vaddq_f32(vaddq_f32(F[i][0], F[i][1]), F[i][2]);
+
+      // FZ[i][1] =  F[i][1] - F[i][2] - F[i][3];
+      FZ[i][1] = vsubq_f32(vsubq_f32(F[i][1], F[i][2]), F[i][3]);
+    }
+
+    // Compute the output tile f = ZT F Z
+    for (int j = 0; j < 2; j++)
+    {
+      // f[0][j] =  FZ[0][j] + FZ[1][j] + FZ[2][j];
+      f[0][j] = vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
+
+      // f[1][j] =  FZ[1][j] - FZ[2][j] - FZ[3][j];
+      f[1][j] = vsubq_f32(vsubq_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
+    }
+
+    // Load the bias vector
+    if (bptr != nullptr)
+    {
+      b = vld1q_f32(bptr);
+      bptr += 4;
+    }
+    else
+    {
+      b = vdupq_n_f32(0.0f);
+    }
+
+    // Write out the output tile
+    for (int i = 0; i < output_tile_rows; i++)
+    {
+      for (int j = 0; j < output_tile_cols; j++)
+      {
+        vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
+        outptrs[i][j] += 4;
+      }
+    }
+  }
+#endif  // __aarch64__
+#ifdef __arm_any__
+  for (; channels_remaining >= 2; channels_remaining -= 2)
+  {
+    // Matrices used and computed during this transform
+    float32x2_t F[4][4], FZ[4][2], f[2][2], b;
+
+    // Read a 4x4 tile in the Winograd domain
+    for (int i = 0, m = 0; i < 4; i++)
+    {
+      for (int j = 0; j < 4; j++, m++)
+      {
+        F[i][j] = vld1_f32(inptr + m*matrix_stride);
+      }
+    }
+    inptr += 2;
+
+    // Compute the matrix F Z
+    for (int i = 0; i < 4; i++)
+    {
+      // FZ[i][0] =  F[i][0] + F[i][1] + F[i][2];
+      FZ[i][0] = vadd_f32(vadd_f32(F[i][0], F[i][1]), F[i][2]);
+
+      // FZ[i][1] =  F[i][1] - F[i][2] - F[i][3];
+      FZ[i][1] = vsub_f32(vsub_f32(F[i][1], F[i][2]), F[i][3]);
+    }
+
+    // Compute the output tile f = ZT F Z
+    for (int j = 0; j < 2; j++)
+    {
+      // f[0][j] =  FZ[0][j] + FZ[1][j] + FZ[2][j];
+      f[0][j] = vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
+
+      // f[1][j] =  FZ[1][j] - FZ[2][j] - FZ[3][j];
+      f[1][j] = vsub_f32(vsub_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
+    }
+
+    // Load the bias vector
+    if (bptr != nullptr)
+    {
+      b = vld1_f32(bptr);
+      bptr += 2;
+    }
+    else
+    {
+      b = vdup_n_f32(0.0f);
+    }
+
+    // Write out the output tile
+    for (int i = 0; i < output_tile_rows; i++)
+    {
+      for (int j = 0; j < output_tile_cols; j++)
+      {
+        vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
+        outptrs[i][j] += 2;
+      }
+    }
+  }
+#endif  // __arm_any__
+  for (; channels_remaining; channels_remaining--)
+  {
+    // Matrices used and computed during this transform
+    float F[4][4], FZ[4][2], f[2][2], b;
+
+    // Read a 4x4 tile in the Winograd domain
+    for (int i = 0, m = 0; i < 4; i++)
+    {
+      for (int j = 0; j < 4; j++, m++)
+      {
+        F[i][j] = *(inptr + m*matrix_stride);
+      }
+    }
+    inptr++;
+
+    // Compute the matrix F Z
+    for (int i = 0; i < 4; i++)
+    {
+      FZ[i][0] =  F[i][0] + F[i][1] + F[i][2];
+      FZ[i][1] =  F[i][1] - F[i][2] - F[i][3];
+    }
+
+    // Compute the output tile f = ZT F Z
+    for (int j = 0; j < 2; j++)
+    {
+      f[0][j] =  FZ[0][j] + FZ[1][j] + FZ[2][j];
+      f[1][j] =  FZ[1][j] - FZ[2][j] - FZ[3][j];
+    }
+
+    // Load the bias
+    if (bptr != nullptr)
+    {
+      b = *(bptr++);
+    }
+    else
+    {
+      b = 0.0f;
+    }
+
+    // Write out the output tile
+    for (int i = 0; i < output_tile_rows; i++)
+    {
+      for (int j = 0; j < output_tile_cols; j++)
+      {
+        *(outptrs[i][j]++) = f[i][j] + b;
+      }
+    }
+  }
+}
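+
+// In matrix form, the per-channel loops above compute f = A^T F A for the
+// F(2x2, 3x3) output transform, with
+//
+//   A^T = [ 1   1   1   0 ]
+//         [ 0   1  -1  -1 ]
+//
+// The "FZ" loop forms F A and the second loop applies A^T down the columns
+// of FZ.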
+
+template class OutputTransform<3, 3, 4, 4, float, float, WinogradRoots::Integers>;
+
+}  // namespace winograd
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_5x5_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_5x5_fp32_fp32_integers.cpp
new file mode 100644
index 0000000..d93d9e2
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_5x5_fp32_fp32_integers.cpp
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "output.hpp"
+#include "arm.hpp"
+
+namespace winograd
+{
+
+template <>
+void OutputTransform<5, 5, 6, 6, float, float, WinogradRoots::Integers>::transform_tile(
+  const int n_channels,
+  const float* inptr,
+  const int matrix_stride,
+  const float* bptr,
+  float* const output,
+  const int output_row_stride,
+  const int output_col_stride
+)
+{
+  // Construct a map to the output cells
+  float *outptrs[output_tile_rows][output_tile_cols];
+  for (int i = 0; i < output_tile_rows; i++)
+  {
+    for (int j = 0; j < output_tile_cols; j++)
+    {
+      outptrs[i][j] = output + i*output_row_stride + j*output_col_stride;
+    }
+  }
+
+  // For each channel of the output
+  int channels_remaining = n_channels;
+#ifdef __aarch64__
+  for (; channels_remaining >= 4; channels_remaining -= 4)
+  {
+    // Matrices used and computed during this transform
+    float32x4_t F[6][6], FZ[6][2], f[2][2], b;
+
+    // Read a 6x6 tile in the Winograd domain
+    for (int i = 0, m = 0; i < 6; i++)
+    {
+      for (int j = 0; j < 6; j++, m++)
+      {
+        F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+      }
+    }
+    inptr += 4;
+
+    // Compute the matrix F Z
+    for (int i = 0; i < 6; i++)
+    {
+      // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
+      FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
+
+      // FZ[i][1] =               1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4] +  1*F[i][5];
+      FZ[i][1] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
+    }
+
+    // Compute the output tile f = ZT F Z
+    for (int j = 0; j < 2; j++)
+    {
+      // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
+      f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+      // f[1][j] =               1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j] +  1*FZ[5][j];
+      f[1][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
+    }
+
+    // Write out the output tile
+    if (bptr != nullptr)
+    {
+      b = vld1q_f32(bptr);
+      bptr += 4;
+    }
+    else
+    {
+      b = vdupq_n_f32(0.0f);
+    }
+    for (int i = 0; i < output_tile_rows; i++)
+    {
+      for (int j = 0; j < output_tile_cols; j++)
+      {
+        vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
+        outptrs[i][j] += 4;
+      }
+    }
+  }
+#endif  // __aarch64__
+#ifdef __arm_any__
+  for (; channels_remaining >= 2; channels_remaining -= 2)
+  {
+    // Matrices used and computed during this transform
+    float32x2_t F[6][6], FZ[6][2], f[2][2], b;
+
+    // Read a 6x6 tile in the Winograd domain
+    for (int i = 0, m = 0; i < 6; i++)
+    {
+      for (int j = 0; j < 6; j++, m++)
+      {
+        F[i][j] = vld1_f32(inptr + m*matrix_stride);
+      }
+    }
+    inptr += 2;
+
+    // Compute the matrix F Z
+    for (int i = 0; i < 6; i++)
+    {
+      // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
+      FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
+
+      // FZ[i][1] =               1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4] +  1*F[i][5];
+      FZ[i][1] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
+    }
+
+    // Compute the output tile f = ZT F Z
+    for (int j = 0; j < 2; j++)
+    {
+      // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
+      f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+      // f[1][j] =               1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j] +  1*FZ[5][j];
+      f[1][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
+    }
+
+    // Write out the output tile
+    if (bptr != nullptr)
+    {
+      b = vld1_f32(bptr);
+      bptr += 2;
+    }
+    else
+    {
+      b = vdup_n_f32(0.0f);
+    }
+    for (int i = 0; i < output_tile_rows; i++)
+    {
+      for (int j = 0; j < output_tile_cols; j++)
+      {
+        vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
+        outptrs[i][j] += 2;
+      }
+    }
+  }
+#endif  // __arm_any__
+  for (; channels_remaining; channels_remaining--)
+  {
+    // Matrices used and computed during this transform
+    float F[6][6], FZ[6][2], f[2][2], b;
+
+    // Read a 6x6 tile in the Winograd domain
+    for (int i = 0, m = 0; i < 6; i++)
+    {
+      for (int j = 0; j < 6; j++, m++)
+      {
+        F[i][j] = *(inptr + m*matrix_stride);
+      }
+    }
+    inptr++;
+
+    // Compute the matrix F Z
+    for (int i = 0; i < 6; i++)
+    {
+      FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
+      FZ[i][1] =               1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4] +  1*F[i][5];
+    }
+
+    // Compute the output tile f = ZT F Z
+    for (int j = 0; j < 2; j++)
+    {
+      f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
+      f[1][j] =                1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j] +  1*FZ[5][j];
+    }
+
+    // Write out the output tile
+    if (bptr != nullptr)
+    {
+      b = *(bptr++);
+    }
+    else
+    {
+      b = 0.0f;
+    }
+    for (int i = 0; i < output_tile_rows; i++)
+    {
+      for (int j = 0; j < output_tile_cols; j++)
+      {
+        *(outptrs[i][j]++) = f[i][j] + b;
+      }
+    }
+  }
+}
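+
+// As in the other output transforms, the loops above compute f = A^T F A,
+// here for F(2x2, 5x5) with a 6x6 inner tile:
+//
+//   A^T = [ 1   1   1   1   1   0 ]
+//         [ 0   1  -1   2  -2   1 ]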
+
+template class OutputTransform<5, 5, 6, 6, float, float, WinogradRoots::Integers>;
+
+}  // namespace winograd
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_4_5_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4_5_fp32_fp32_integers.cpp
similarity index 73%
rename from src/core/NEON/kernels/convolution/winograd/transforms/output_4_5_fp32.cpp
rename to src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4_5_fp32_fp32_integers.cpp
index 911759b..7187ef2 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/output_4_5_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4_5_fp32_fp32_integers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,42 +22,29 @@
  * SOFTWARE.
  */
 
-#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+#include "output.hpp"
+#include "arm.hpp"
 
-namespace
+namespace winograd
 {
 
-template <bool Specialized, int PadRight=0>
-void winograd_output_transform_4_5_fp32_process_tile(
+template <>
+void OutputTransform<1, 5, 1, 8, float, float, WinogradRoots::Integers>::transform_tile(
   const int n_channels,
-  const float* const matrix_base,
+  const float* inptr,
   const int matrix_stride,
-  const float* const biases,
+  const float* bptr,
   float* const output,
-  const int output_row_stride,
-  const int output_col_stride,
-  const int _pad_bottom,
-  const int _pad_right
+  const int,  // No need to stride across rows
+  const int output_col_stride
 )
 {
-  (void) output_row_stride;
-  (void) _pad_bottom;
-  constexpr int output_tile_cols = 4;
-  constexpr int inner_tile_cols = 8;
-
-  const int pad_right = Specialized ? PadRight : _pad_right;
-  const int cells_j = output_tile_cols - pad_right;
-
   // Construct a map to the output cells
-  float *outptrs[cells_j];
-  for (int j = 0; j < cells_j; j++)
+  float *outptrs[output_tile_cols];
+  for (int j = 0; j < output_tile_cols; j++)
   {
     outptrs[j] = output + j*output_col_stride;
   }
-  const float *inptr = matrix_base;
-  const float *bptr = biases;
 
   // For each channel of the output
   int channels_remaining = n_channels;
@@ -85,7 +72,7 @@
       b = vld1q_f32(bptr);
       bptr += 4;
     }
-    for (int j = 0; j < cells_j; j++)
+    for (int j = 0; j < output_tile_cols; j++)
     {
       vst1q_f32(outptrs[j], f[j] + b);
       outptrs[j] += 4;
@@ -114,7 +101,7 @@
       b = vld1_f32(bptr);
       bptr += 2;
     }
-    for (int j = 0; j < cells_j; j++)
+    for (int j = 0; j < output_tile_cols; j++)
     {
       vst1_f32(outptrs[j], f[j] + b);
       outptrs[j] += 2;
@@ -143,29 +130,14 @@
     {
       b = *(bptr++);
     }
-    for (int j = 0; j < cells_j; j++)
+    for (int j = 0; j < output_tile_cols; j++)
     {
       *(outptrs[j]++) = f[j] + b;
     }
   }
 }
 
-}  // namespace (anonymous)
+template class OutputTransform<1, 5, 1, 8, float, float, WinogradRoots::Integers>;
+template class OutputTransform<5, 1, 8, 1, float, float, WinogradRoots::Integers>;
 
-namespace winograd
-{
-using Tiles = OutputTransformImplTiles<1, 5, 1, 8, float>;
-
-template <>
-const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_4_5_fp32_process_tile<true>;
-
-template <>
-const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = {
-  winograd_output_transform_4_5_fp32_process_tile<true, 1>,
-  winograd_output_transform_4_5_fp32_process_tile<true, 2>,
-  winograd_output_transform_4_5_fp32_process_tile<true, 3>
-};
-
-template class OutputTransform<1, 5, 1, 8, float>;
-template class OutputTransform<5, 1, 8, 1, float>;
 }  // namespace winograd
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp32_fp32_integers.cpp
new file mode 100644
index 0000000..fd16a4d
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp32_fp32_integers.cpp
@@ -0,0 +1,1855 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm.hpp"
+#include "output.hpp"
+
+namespace winograd
+{
+
+#ifdef __aarch64__
+
+template <>
+void OutputTransform<3, 3, 6, 6, float, float, winograd::WinogradRoots::Integers>::transform_tile(
+  int n_channels,
+  const float* inptr,
+  const int matrix_stride,
+  const float* bptr,
+  float* output,
+  const int output_row_stride,
+  const int output_col_stride
+)
+{
+  const float coeffs[2] = {2.0f, 4.0f};
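+  // The two constants are loaded into d0 by "ldr d0, [%[pcoeffs]]" below and
+  // referenced from the assembly as v0.s[0] (= 2.0f) and v0.s[1] (= 4.0f);
+  // these are the multipliers of the F(4x4, 3x3) output-transform matrix A^T
+  // (larger coefficients such as 8 can be composed by chained
+  // multiply-accumulates).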
+  if (bptr != nullptr)
+  {
+    __asm__ __volatile__ (
+      "ldr d0, [%[pcoeffs]]\n"
+      "add x21, %[in_col_stride1], %[in_col_stride1]\n"
+      "add x22, x21, %[in_col_stride1]\n"
+      "add x25, %[inptr0], %[in_row_stride]\n"
+      "add x15, %[output_col_stride1], %[output_col_stride1]\n"
+      "add x23, x22, %[in_col_stride1]\n"
+      "add x13, x25, %[in_row_stride]\n"
+      "add x16, x15, %[output_col_stride1]\n"
+      "add x24, x23, %[in_col_stride1]\n"
+      "add x26, x13, %[in_row_stride]\n"
+      "add x17, %[outptr0], %[output_row_stride]\n"
+      "add x14, x26, %[in_row_stride]\n"
+      "add x28, x17, %[output_row_stride]\n"
+      "lsr x19, %[n_channels], #2\n"
+      "add x27, x14, %[in_row_stride]\n"
+      "add x18, x28, %[output_row_stride]\n"
+      "and x20, %[n_channels], #3\n"
+      "cbz x19, 4f\n"
+      "1:\n"
+      "ldr q19, [%[inptr0]]\n"
+      "subs x19, x19, #1\n"
+      "ldr q20, [%[inptr0], %[in_col_stride1]]\n"
+      "ldr q4, [%[inptr0], x21]\n"
+      "fadd v1.4s, v20.4s, v4.4s\n"
+      "ldr q17, [%[inptr0], x22]\n"
+      "fsub v7.4s, v20.4s, v4.4s\n"
+      "ldr q22, [%[inptr0], x23]\n"
+      "fadd v5.4s, v17.4s, v22.4s\n"
+      "ldr q18, [%[inptr0], x24]\n"
+      "fsub v10.4s, v17.4s, v22.4s\n"
+      "ldr q25, [x25]\n"
+      "fadd v8.4s, v19.4s, v1.4s\n"
+      "ldr q12, [x25, %[in_col_stride1]]\n"
+      "mov v4.16b, v1.16b\n"
+      "ldr q23, [x25, x21]\n"
+      "mov v1.16b, v7.16b\n"
+      "ldr q9, [x25, x22]\n"
+      "fmul v10.4s, v10.4s, v0.s[0]\n"
+      "ldr q11, [x25, x23]\n"
+      "fadd v8.4s, v8.4s, v5.4s\n"
+      "ldr q6, [x25, x24]\n"
+      "fmla v4.4s, v5.4s, v0.s[1]\n"
+      "fadd v7.4s, v7.4s, v10.4s\n"
+      "fmla v1.4s, v10.4s, v0.s[1]\n"
+      "fadd v1.4s, v1.4s, v18.4s\n"
+      "beq 3f\n"
+      "2:\n"
+      "fadd v3.4s, v12.4s, v23.4s\n"
+      "ldr q2, [x13]\n"
+      "fadd v27.4s, v9.4s, v11.4s\n"
+      "ldr q21, [x13, %[in_col_stride1]]\n"
+      "fsub v16.4s, v12.4s, v23.4s\n"
+      "ldr q26, [x13, x21]\n"
+      "fsub v9.4s, v9.4s, v11.4s\n"
+      "ldr q17, [x13, x22]\n"
+      "fadd v14.4s, v25.4s, v3.4s\n"
+      "ldr q19, [x13, x23]\n"
+      "mov v11.16b, v3.16b\n"
+      "ldr q10, [x13, x24]\n"
+      "mov v3.16b, v16.16b\n"
+      "ldr q15, [x26]\n"
+      "fmul v9.4s, v9.4s, v0.s[0]\n"
+      "ldr q12, [x26, %[in_col_stride1]]\n"
+      "fadd v14.4s, v14.4s, v27.4s\n"
+      "ldr q20, [x26, x21]\n"
+      "fmla v11.4s, v27.4s, v0.s[1]\n"
+      "ldr q24, [x26, x22]\n"
+      "fadd v23.4s, v21.4s, v26.4s\n"
+      "ldr q29, [x26, x23]\n"
+      "fadd v13.4s, v16.4s, v9.4s\n"
+      "ldr q5, [x26, x24]\n"
+      "fmla v3.4s, v9.4s, v0.s[1]\n"
+      "ldr q18, [x14]\n"
+      "fadd v30.4s, v17.4s, v19.4s\n"
+      "add %[inptr0], %[inptr0], #16\n"
+      "fadd v16.4s, v2.4s, v23.4s\n"
+      "add x25, x25, #16\n"
+      "fsub v21.4s, v21.4s, v26.4s\n"
+      "ldr q22, [x14, %[in_col_stride1]]\n"
+      "fadd v3.4s, v3.4s, v6.4s\n"
+      "ldr q28, [x14, x21]\n"
+      "fsub v19.4s, v17.4s, v19.4s\n"
+      "add x13, x13, #16\n"
+      "fadd v16.4s, v16.4s, v30.4s\n"
+      "add x26, x26, #16\n"
+      "mov v17.16b, v23.16b\n"
+      "subs x19, x19, #1\n"
+      "fadd v26.4s, v12.4s, v20.4s\n"
+      "fsub v9.4s, v12.4s, v20.4s\n"
+      "fmul v19.4s, v19.4s, v0.s[0]\n"
+      "ldr q20, [x14, x22]\n"
+      "fmla v17.4s, v30.4s, v0.s[1]\n"
+      "fadd v25.4s, v24.4s, v29.4s\n"
+      "fsub v12.4s, v24.4s, v29.4s\n"
+      "fadd v24.4s, v22.4s, v28.4s\n"
+      "fadd v23.4s, v15.4s, v26.4s\n"
+      "mov v15.16b, v26.16b\n"
+      "fsub v22.4s, v22.4s, v28.4s\n"
+      "fadd v29.4s, v14.4s, v16.4s\n"
+      "fsub v16.4s, v14.4s, v16.4s\n"
+      "ldr q28, [x14, x23]\n"
+      "fmul v12.4s, v12.4s, v0.s[0]\n"
+      "fmla v15.4s, v25.4s, v0.s[1]\n"
+      "fadd v23.4s, v23.4s, v25.4s\n"
+      "mov v6.16b, v21.16b\n"
+      "fadd v30.4s, v21.4s, v19.4s\n"
+      "fadd v26.4s, v18.4s, v24.4s\n"
+      "mov v25.16b, v24.16b\n"
+      "fadd v18.4s, v8.4s, v29.4s\n"
+      "fmla v6.4s, v19.4s, v0.s[1]\n"
+      "fadd v27.4s, v20.4s, v28.4s\n"
+      "fsub v21.4s, v20.4s, v28.4s\n"
+      "mov v19.16b, v29.16b\n"
+      "fadd v29.4s, v13.4s, v30.4s\n"
+      "fsub v8.4s, v13.4s, v30.4s\n"
+      "fadd v14.4s, v9.4s, v12.4s\n"
+      "fadd v6.4s, v6.4s, v10.4s\n"
+      "ldr q20, [x14, x24]\n"
+      "fadd v26.4s, v26.4s, v27.4s\n"
+      "add x14, x14, #16\n"
+      "fmla v9.4s, v12.4s, v0.s[1]\n"
+      "ldr q24, [x27]\n"
+      "fmul v21.4s, v21.4s, v0.s[0]\n"
+      "fmla v25.4s, v27.4s, v0.s[1]\n"
+      "fadd v10.4s, v7.4s, v29.4s\n"
+      "ldr q2, [%[bptr]]\n"
+      "mov v7.16b, v29.16b\n"
+      "add %[bptr], %[bptr], #16\n"
+      "fadd v9.4s, v9.4s, v5.4s\n"
+      "fadd v13.4s, v23.4s, v26.4s\n"
+      "fsub v23.4s, v23.4s, v26.4s\n"
+      "fadd v27.4s, v11.4s, v17.4s\n"
+      "fsub v11.4s, v11.4s, v17.4s\n"
+      "fadd v30.4s, v15.4s, v25.4s\n"
+      "fsub v15.4s, v15.4s, v25.4s\n"
+      "ldr q28, [x27, %[in_col_stride1]]\n"
+      "fadd v18.4s, v18.4s, v13.4s\n"
+      "fmla v19.4s, v13.4s, v0.s[1]\n"
+      "fadd v26.4s, v22.4s, v21.4s\n"
+      "mov v12.16b, v22.16b\n"
+      "fmul v23.4s, v23.4s, v0.s[0]\n"
+      "fadd v17.4s, v4.4s, v27.4s\n"
+      "fmul v15.4s, v15.4s, v0.s[0]\n"
+      "mov v4.16b, v27.16b\n"
+      "fmla v12.4s, v21.4s, v0.s[1]\n"
+      "ldr q22, [x27, x21]\n"
+      "fadd v18.4s, v18.4s, v2.4s\n"
+      "fadd v19.4s, v19.4s, v2.4s\n"
+      "fadd v17.4s, v17.4s, v30.4s\n"
+      "fmla v4.4s, v30.4s, v0.s[1]\n"
+      "fadd v25.4s, v28.4s, v22.4s\n"
+      "fsub v27.4s, v28.4s, v22.4s\n"
+      "fadd v12.4s, v12.4s, v20.4s\n"
+      "ldr q29, [x27, x22]\n"
+      "str q18, [%[outptr0]]\n"
+      "fadd v22.4s, v16.4s, v23.4s\n"
+      "str q19, [x28]\n"
+      "fadd v28.4s, v24.4s, v25.4s\n"
+      "ldr q30, [x27, x23]\n"
+      "fadd v20.4s, v29.4s, v30.4s\n"
+      "fsub v18.4s, v29.4s, v30.4s\n"
+      "mov v21.16b, v25.16b\n"
+      "ldr q25, [x27, x24]\n"
+      "fmla v16.4s, v23.4s, v0.s[1]\n"
+      "ldr q19, [%[inptr0]]\n"
+      "fadd v17.4s, v17.4s, v2.4s\n"
+      "add x27, x27, #16\n"
+      "fadd v28.4s, v28.4s, v20.4s\n"
+      "fmul v18.4s, v18.4s, v0.s[0]\n"
+      "fmla v21.4s, v20.4s, v0.s[1]\n"
+      "ldr q20, [%[inptr0], %[in_col_stride1]]\n"
+      "fadd v22.4s, v22.4s, v2.4s\n"
+      "fadd v4.4s, v4.4s, v2.4s\n"
+      "str q17, [%[outptr0], x15]\n"
+      "mov v24.16b, v27.16b\n"
+      "fadd v23.4s, v27.4s, v18.4s\n"
+      "fadd v16.4s, v16.4s, v28.4s\n"
+      "fadd v13.4s, v14.4s, v26.4s\n"
+      "fsub v30.4s, v14.4s, v26.4s\n"
+      "str q22, [x17]\n"
+      "fmla v24.4s, v18.4s, v0.s[1]\n"
+      "str q4, [x28, x15]\n"
+      "mov v14.16b, v8.16b\n"
+      "fadd v29.4s, v11.4s, v15.4s\n"
+      "ldr q4, [%[inptr0], x21]\n"
+      "fadd v10.4s, v10.4s, v13.4s\n"
+      "ldr q17, [%[inptr0], x22]\n"
+      "fadd v24.4s, v24.4s, v25.4s\n"
+      "ldr q22, [%[inptr0], x23]\n"
+      "fmul v30.4s, v30.4s, v0.s[0]\n"
+      "fmla v7.4s, v13.4s, v0.s[1]\n"
+      "mov v26.16b, v11.16b\n"
+      "fadd v13.4s, v3.4s, v6.4s\n"
+      "fsub v3.4s, v3.4s, v6.4s\n"
+      "ldr q18, [%[inptr0], x24]\n"
+      "fadd v10.4s, v10.4s, v2.4s\n"
+      "fadd v29.4s, v29.4s, v2.4s\n"
+      "fadd v8.4s, v8.4s, v30.4s\n"
+      "fmla v14.4s, v30.4s, v0.s[1]\n"
+      "fmla v26.4s, v15.4s, v0.s[1]\n"
+      "ldr q25, [x25]\n"
+      "fadd v27.4s, v9.4s, v12.4s\n"
+      "fadd v1.4s, v1.4s, v13.4s\n"
+      "str q10, [%[outptr0], %[output_col_stride1]]\n"
+      "fsub v6.4s, v9.4s, v12.4s\n"
+      "str q29, [x17, x15]\n"
+      "fadd v14.4s, v14.4s, v23.4s\n"
+      "fadd v26.4s, v26.4s, v21.4s\n"
+      "ldr q12, [x25, %[in_col_stride1]]\n"
+      "fadd v1.4s, v1.4s, v27.4s\n"
+      "ldr q23, [x25, x21]\n"
+      "fmul v6.4s, v6.4s, v0.s[0]\n"
+      "ldr q9, [x25, x22]\n"
+      "mov v5.16b, v13.16b\n"
+      "ldr q11, [x25, x23]\n"
+      "mov v13.16b, v3.16b\n"
+      "fadd v8.4s, v8.4s, v2.4s\n"
+      "fadd v1.4s, v1.4s, v2.4s\n"
+      "fadd v7.4s, v7.4s, v2.4s\n"
+      "fadd v10.4s, v3.4s, v6.4s\n"
+      "fmla v5.4s, v27.4s, v0.s[1]\n"
+      "fmla v13.4s, v6.4s, v0.s[1]\n"
+      "ldr q6, [x25, x24]\n"
+      "str q8, [x17, %[output_col_stride1]]\n"
+      "fadd v16.4s, v16.4s, v2.4s\n"
+      "str q1, [%[outptr0], x16]\n"
+      "fadd v14.4s, v14.4s, v2.4s\n"
+      "str q7, [x28, %[output_col_stride1]]\n"
+      "fadd v10.4s, v10.4s, v2.4s\n"
+      "fadd v13.4s, v13.4s, v24.4s\n"
+      "add %[outptr0], %[outptr0], #16\n"
+      "str q16, [x18]\n"
+      "fadd v5.4s, v5.4s, v2.4s\n"
+      "str q14, [x18, %[output_col_stride1]]\n"
+      "fadd v26.4s, v26.4s, v2.4s\n"
+      "str q10, [x17, x16]\n"
+      "fadd v1.4s, v20.4s, v4.4s\n"
+      "fadd v13.4s, v13.4s, v2.4s\n"
+      "add x17, x17, #16\n"
+      "str q5, [x28, x16]\n"
+      "fadd v5.4s, v17.4s, v22.4s\n"
+      "str q26, [x18, x15]\n"
+      "fsub v7.4s, v20.4s, v4.4s\n"
+      "fadd v8.4s, v19.4s, v1.4s\n"
+      "add x28, x28, #16\n"
+      "str q13, [x18, x16]\n"
+      "mov v4.16b, v1.16b\n"
+      "fsub v10.4s, v17.4s, v22.4s\n"
+      "add x18, x18, #16\n"
+      "mov v1.16b, v7.16b\n"
+      "fadd v8.4s, v8.4s, v5.4s\n"
+      "fmla v4.4s, v5.4s, v0.s[1]\n"
+      "fmul v10.4s, v10.4s, v0.s[0]\n"
+      "fadd v7.4s, v7.4s, v10.4s\n"
+      "fmla v1.4s, v10.4s, v0.s[1]\n"
+      "fadd v1.4s, v1.4s, v18.4s\n"
+      "bne 2b\n"
+      "3:\n"
+      "fadd v3.4s, v12.4s, v23.4s\n"
+      "ldr q2, [x13]\n"
+      "fadd v27.4s, v9.4s, v11.4s\n"
+      "ldr q21, [x13, %[in_col_stride1]]\n"
+      "fsub v16.4s, v12.4s, v23.4s\n"
+      "ldr q26, [x13, x21]\n"
+      "fsub v9.4s, v9.4s, v11.4s\n"
+      "ldr q17, [x13, x22]\n"
+      "fadd v14.4s, v25.4s, v3.4s\n"
+      "ldr q19, [x13, x23]\n"
+      "mov v11.16b, v3.16b\n"
+      "ldr q10, [x13, x24]\n"
+      "mov v3.16b, v16.16b\n"
+      "ldr q15, [x26]\n"
+      "fmul v9.4s, v9.4s, v0.s[0]\n"
+      "ldr q12, [x26, %[in_col_stride1]]\n"
+      "fadd v14.4s, v14.4s, v27.4s\n"
+      "ldr q20, [x26, x21]\n"
+      "fmla v11.4s, v27.4s, v0.s[1]\n"
+      "ldr q24, [x26, x22]\n"
+      "fadd v23.4s, v21.4s, v26.4s\n"
+      "ldr q29, [x26, x23]\n"
+      "fadd v13.4s, v16.4s, v9.4s\n"
+      "ldr q5, [x26, x24]\n"
+      "fmla v3.4s, v9.4s, v0.s[1]\n"
+      "ldr q18, [x14]\n"
+      "fadd v30.4s, v17.4s, v19.4s\n"
+      "add %[inptr0], %[inptr0], #16\n"
+      "fadd v16.4s, v2.4s, v23.4s\n"
+      "add x25, x25, #16\n"
+      "fsub v21.4s, v21.4s, v26.4s\n"
+      "ldr q22, [x14, %[in_col_stride1]]\n"
+      "fadd v3.4s, v3.4s, v6.4s\n"
+      "ldr q28, [x14, x21]\n"
+      "fsub v19.4s, v17.4s, v19.4s\n"
+      "add x13, x13, #16\n"
+      "fadd v16.4s, v16.4s, v30.4s\n"
+      "add x26, x26, #16\n"
+      "mov v17.16b, v23.16b\n"
+      "fadd v26.4s, v12.4s, v20.4s\n"
+      "fsub v9.4s, v12.4s, v20.4s\n"
+      "ldr q2, [%[bptr]]\n"
+      "fmul v19.4s, v19.4s, v0.s[0]\n"
+      "add %[bptr], %[bptr], #16\n"
+      "fmla v17.4s, v30.4s, v0.s[1]\n"
+      "fadd v25.4s, v24.4s, v29.4s\n"
+      "fadd v23.4s, v15.4s, v26.4s\n"
+      "fsub v12.4s, v24.4s, v29.4s\n"
+      "mov v15.16b, v26.16b\n"
+      "fadd v24.4s, v22.4s, v28.4s\n"
+      "fsub v22.4s, v22.4s, v28.4s\n"
+      "fadd v29.4s, v14.4s, v16.4s\n"
+      "fsub v16.4s, v14.4s, v16.4s\n"
+      "ldr q20, [x14, x22]\n"
+      "fadd v23.4s, v23.4s, v25.4s\n"
+      "fmul v12.4s, v12.4s, v0.s[0]\n"
+      "fmla v15.4s, v25.4s, v0.s[1]\n"
+      "mov v6.16b, v21.16b\n"
+      "fadd v30.4s, v21.4s, v19.4s\n"
+      "fadd v26.4s, v18.4s, v24.4s\n"
+      "mov v25.16b, v24.16b\n"
+      "fadd v18.4s, v8.4s, v29.4s\n"
+      "fmla v6.4s, v19.4s, v0.s[1]\n"
+      "mov v19.16b, v29.16b\n"
+      "fadd v27.4s, v11.4s, v17.4s\n"
+      "fsub v11.4s, v11.4s, v17.4s\n"
+      "fadd v29.4s, v13.4s, v30.4s\n"
+      "fsub v8.4s, v13.4s, v30.4s\n"
+      "fadd v14.4s, v9.4s, v12.4s\n"
+      "fadd v6.4s, v6.4s, v10.4s\n"
+      "ldr q28, [x14, x23]\n"
+      "fadd v17.4s, v4.4s, v27.4s\n"
+      "mov v4.16b, v27.16b\n"
+      "fmla v9.4s, v12.4s, v0.s[1]\n"
+      "fadd v27.4s, v20.4s, v28.4s\n"
+      "fsub v21.4s, v20.4s, v28.4s\n"
+      "fadd v10.4s, v7.4s, v29.4s\n"
+      "mov v7.16b, v29.16b\n"
+      "fadd v13.4s, v3.4s, v6.4s\n"
+      "fsub v3.4s, v3.4s, v6.4s\n"
+      "ldr q20, [x14, x24]\n"
+      "fadd v9.4s, v9.4s, v5.4s\n"
+      "fadd v26.4s, v26.4s, v27.4s\n"
+      "fmul v21.4s, v21.4s, v0.s[0]\n"
+      "add x14, x14, #16\n"
+      "fmla v25.4s, v27.4s, v0.s[1]\n"
+      "mov v12.16b, v22.16b\n"
+      "fadd v1.4s, v1.4s, v13.4s\n"
+      "mov v5.16b, v13.16b\n"
+      "fadd v13.4s, v23.4s, v26.4s\n"
+      "fsub v23.4s, v23.4s, v26.4s\n"
+      "fadd v26.4s, v22.4s, v21.4s\n"
+      "ldr q24, [x27]\n"
+      "fmla v12.4s, v21.4s, v0.s[1]\n"
+      "fadd v30.4s, v15.4s, v25.4s\n"
+      "fsub v15.4s, v15.4s, v25.4s\n"
+      "ldr q28, [x27, %[in_col_stride1]]\n"
+      "fadd v18.4s, v18.4s, v13.4s\n"
+      "fmul v23.4s, v23.4s, v0.s[0]\n"
+      "fmla v19.4s, v13.4s, v0.s[1]\n"
+      "ldr q22, [x27, x21]\n"
+      "fadd v12.4s, v12.4s, v20.4s\n"
+      "ldr q29, [x27, x22]\n"
+      "fadd v17.4s, v17.4s, v30.4s\n"
+      "fmul v15.4s, v15.4s, v0.s[0]\n"
+      "fmla v4.4s, v30.4s, v0.s[1]\n"
+      "fadd v25.4s, v28.4s, v22.4s\n"
+      "fsub v27.4s, v28.4s, v22.4s\n"
+      "fadd v22.4s, v16.4s, v23.4s\n"
+      "fadd v18.4s, v18.4s, v2.4s\n"
+      "fadd v17.4s, v17.4s, v2.4s\n"
+      "fadd v19.4s, v19.4s, v2.4s\n"
+      "fadd v28.4s, v24.4s, v25.4s\n"
+      "mov v21.16b, v25.16b\n"
+      "fmla v16.4s, v23.4s, v0.s[1]\n"
+      "ldr q30, [x27, x23]\n"
+      "str q18, [%[outptr0]]\n"
+      "fadd v20.4s, v29.4s, v30.4s\n"
+      "str q17, [%[outptr0], x15]\n"
+      "fsub v18.4s, v29.4s, v30.4s\n"
+      "str q19, [x28]\n"
+      "mov v24.16b, v27.16b\n"
+      "fadd v13.4s, v14.4s, v26.4s\n"
+      "ldr q25, [x27, x24]\n"
+      "fadd v28.4s, v28.4s, v20.4s\n"
+      "add x27, x27, #16\n"
+      "fmul v18.4s, v18.4s, v0.s[0]\n"
+      "fmla v21.4s, v20.4s, v0.s[1]\n"
+      "fsub v30.4s, v14.4s, v26.4s\n"
+      "mov v14.16b, v8.16b\n"
+      "fadd v10.4s, v10.4s, v13.4s\n"
+      "fmla v7.4s, v13.4s, v0.s[1]\n"
+      "fadd v16.4s, v16.4s, v28.4s\n"
+      "fadd v29.4s, v11.4s, v15.4s\n"
+      "fadd v23.4s, v27.4s, v18.4s\n"
+      "fmla v24.4s, v18.4s, v0.s[1]\n"
+      "fmul v30.4s, v30.4s, v0.s[0]\n"
+      "mov v26.16b, v11.16b\n"
+      "fadd v27.4s, v9.4s, v12.4s\n"
+      "fsub v6.4s, v9.4s, v12.4s\n"
+      "mov v13.16b, v3.16b\n"
+      "fadd v10.4s, v10.4s, v2.4s\n"
+      "fadd v24.4s, v24.4s, v25.4s\n"
+      "fmla v26.4s, v15.4s, v0.s[1]\n"
+      "fadd v8.4s, v8.4s, v30.4s\n"
+      "fmla v14.4s, v30.4s, v0.s[1]\n"
+      "fadd v1.4s, v1.4s, v27.4s\n"
+      "fmul v6.4s, v6.4s, v0.s[0]\n"
+      "str q10, [%[outptr0], %[output_col_stride1]]\n"
+      "fmla v5.4s, v27.4s, v0.s[1]\n"
+      "fadd v26.4s, v26.4s, v21.4s\n"
+      "fadd v22.4s, v22.4s, v2.4s\n"
+      "fadd v14.4s, v14.4s, v23.4s\n"
+      "fadd v8.4s, v8.4s, v2.4s\n"
+      "fadd v10.4s, v3.4s, v6.4s\n"
+      "fmla v13.4s, v6.4s, v0.s[1]\n"
+      "fadd v1.4s, v1.4s, v2.4s\n"
+      "fadd v29.4s, v29.4s, v2.4s\n"
+      "str q22, [x17]\n"
+      "fadd v7.4s, v7.4s, v2.4s\n"
+      "str q8, [x17, %[output_col_stride1]]\n"
+      "fadd v4.4s, v4.4s, v2.4s\n"
+      "fadd v13.4s, v13.4s, v24.4s\n"
+      "fadd v10.4s, v10.4s, v2.4s\n"
+      "str q1, [%[outptr0], x16]\n"
+      "fadd v5.4s, v5.4s, v2.4s\n"
+      "str q29, [x17, x15]\n"
+      "fadd v16.4s, v16.4s, v2.4s\n"
+      "str q7, [x28, %[output_col_stride1]]\n"
+      "fadd v14.4s, v14.4s, v2.4s\n"
+      "str q10, [x17, x16]\n"
+      "fadd v26.4s, v26.4s, v2.4s\n"
+      "str q4, [x28, x15]\n"
+      "fadd v13.4s, v13.4s, v2.4s\n"
+      "str q5, [x28, x16]\n"
+      "add %[outptr0], %[outptr0], #16\n"
+      "str q16, [x18]\n"
+      "add x17, x17, #16\n"
+      "str q14, [x18, %[output_col_stride1]]\n"
+      "add x28, x28, #16\n"
+      "str q26, [x18, x15]\n"
+      "str q13, [x18, x16]\n"
+      "add x18, x18, #16\n"
+      "4:\n"  // Double
+      "cmp x20, #2\n"
+      "blt 5f\n"
+      "ldr d19, [%[inptr0]]\n"
+      "ldr d20, [%[inptr0], %[in_col_stride1]]\n"
+      "sub x20, x20, #2\n"
+      "ldr d4, [%[inptr0], x21]\n"
+      "ldr d17, [%[inptr0], x22]\n"
+      "fadd v1.4s, v20.4s, v4.4s\n"
+      "ldr d22, [%[inptr0], x23]\n"
+      "fadd v5.4s, v17.4s, v22.4s\n"
+      "ldr d18, [%[inptr0], x24]\n"
+      "fsub v7.4s, v20.4s, v4.4s\n"
+      "ldr d25, [x25]\n"
+      "fsub v10.4s, v17.4s, v22.4s\n"
+      "ldr d12, [x25, %[in_col_stride1]]\n"
+      "fadd v8.4s, v19.4s, v1.4s\n"
+      "ldr d23, [x25, x21]\n"
+      "mov v4.16b, v1.16b\n"
+      "ldr d9, [x25, x22]\n"
+      "mov v1.16b, v7.16b\n"
+      "ldr d11, [x25, x23]\n"
+      "fmul v10.4s, v10.4s, v0.s[0]\n"
+      "ldr d6, [x25, x24]\n"
+      "fadd v8.4s, v8.4s, v5.4s\n"
+      "ldr d2, [x13]\n"
+      "fmla v4.4s, v5.4s, v0.s[1]\n"
+      "ldr d21, [x13, %[in_col_stride1]]\n"
+      "fadd v3.4s, v12.4s, v23.4s\n"
+      "ldr d26, [x13, x21]\n"
+      "fadd v7.4s, v7.4s, v10.4s\n"
+      "ldr d17, [x13, x22]\n"
+      "fmla v1.4s, v10.4s, v0.s[1]\n"
+      "ldr d19, [x13, x23]\n"
+      "fadd v27.4s, v9.4s, v11.4s\n"
+      "ldr d10, [x13, x24]\n"
+      "fadd v14.4s, v25.4s, v3.4s\n"
+      "ldr d15, [x26]\n"
+      "fsub v16.4s, v12.4s, v23.4s\n"
+      "ldr d12, [x26, %[in_col_stride1]]\n"
+      "fadd v1.4s, v1.4s, v18.4s\n"
+      "ldr d20, [x26, x21]\n"
+      "fsub v9.4s, v9.4s, v11.4s\n"
+      "ldr d24, [x26, x22]\n"
+      "fadd v14.4s, v14.4s, v27.4s\n"
+      "ldr d29, [x26, x23]\n"
+      "mov v11.16b, v3.16b\n"
+      "ldr d5, [x26, x24]\n"
+      "mov v3.16b, v16.16b\n"
+      "ldr d18, [x14]\n"
+      "fmul v9.4s, v9.4s, v0.s[0]\n"
+      "add %[inptr0], %[inptr0], #8\n"
+      "fmla v11.4s, v27.4s, v0.s[1]\n"
+      "add x25, x25, #8\n"
+      "fadd v23.4s, v21.4s, v26.4s\n"
+      "add x13, x13, #8\n"
+      "fsub v21.4s, v21.4s, v26.4s\n"
+      "ldr d22, [x14, %[in_col_stride1]]\n"
+      "fadd v13.4s, v16.4s, v9.4s\n"
+      "add x26, x26, #8\n"
+      "fmla v3.4s, v9.4s, v0.s[1]\n"
+      "fadd v30.4s, v17.4s, v19.4s\n"
+      "fadd v16.4s, v2.4s, v23.4s\n"
+      "fsub v19.4s, v17.4s, v19.4s\n"
+      "mov v17.16b, v23.16b\n"
+      "fadd v26.4s, v12.4s, v20.4s\n"
+      "fsub v9.4s, v12.4s, v20.4s\n"
+      "ldr d28, [x14, x21]\n"
+      "fadd v3.4s, v3.4s, v6.4s\n"
+      "ldr d20, [x14, x22]\n"
+      "fadd v16.4s, v16.4s, v30.4s\n"
+      "fmul v19.4s, v19.4s, v0.s[0]\n"
+      "fmla v17.4s, v30.4s, v0.s[1]\n"
+      "fadd v25.4s, v24.4s, v29.4s\n"
+      "fadd v23.4s, v15.4s, v26.4s\n"
+      "fsub v12.4s, v24.4s, v29.4s\n"
+      "mov v15.16b, v26.16b\n"
+      "fadd v24.4s, v22.4s, v28.4s\n"
+      "fsub v22.4s, v22.4s, v28.4s\n"
+      "fadd v29.4s, v14.4s, v16.4s\n"
+      "fsub v16.4s, v14.4s, v16.4s\n"
+      "ldr d28, [x14, x23]\n"
+      "fadd v23.4s, v23.4s, v25.4s\n"
+      "fmul v12.4s, v12.4s, v0.s[0]\n"
+      "fmla v15.4s, v25.4s, v0.s[1]\n"
+      "mov v6.16b, v21.16b\n"
+      "fadd v30.4s, v21.4s, v19.4s\n"
+      "fadd v26.4s, v18.4s, v24.4s\n"
+      "mov v25.16b, v24.16b\n"
+      "fadd v18.4s, v8.4s, v29.4s\n"
+      "fmla v6.4s, v19.4s, v0.s[1]\n"
+      "fadd v27.4s, v20.4s, v28.4s\n"
+      "fsub v21.4s, v20.4s, v28.4s\n"
+      "mov v19.16b, v29.16b\n"
+      "fadd v29.4s, v13.4s, v30.4s\n"
+      "fsub v8.4s, v13.4s, v30.4s\n"
+      "fadd v14.4s, v9.4s, v12.4s\n"
+      "fadd v6.4s, v6.4s, v10.4s\n"
+      "ldr d20, [x14, x24]\n"
+      "fadd v26.4s, v26.4s, v27.4s\n"
+      "add x14, x14, #8\n"
+      "fmla v9.4s, v12.4s, v0.s[1]\n"
+      "ldr d24, [x27]\n"
+      "fmul v21.4s, v21.4s, v0.s[0]\n"
+      "fmla v25.4s, v27.4s, v0.s[1]\n"
+      "fadd v10.4s, v7.4s, v29.4s\n"
+      "ldr d2, [%[bptr]]\n"
+      "mov v7.16b, v29.16b\n"
+      "add %[bptr], %[bptr], #8\n"
+      "fadd v9.4s, v9.4s, v5.4s\n"
+      "fadd v13.4s, v23.4s, v26.4s\n"
+      "fsub v23.4s, v23.4s, v26.4s\n"
+      "fadd v27.4s, v11.4s, v17.4s\n"
+      "fsub v11.4s, v11.4s, v17.4s\n"
+      "fadd v30.4s, v15.4s, v25.4s\n"
+      "fsub v15.4s, v15.4s, v25.4s\n"
+      "ldr d28, [x27, %[in_col_stride1]]\n"
+      "fadd v18.4s, v18.4s, v13.4s\n"
+      "fmla v19.4s, v13.4s, v0.s[1]\n"
+      "fadd v26.4s, v22.4s, v21.4s\n"
+      "mov v12.16b, v22.16b\n"
+      "fmul v23.4s, v23.4s, v0.s[0]\n"
+      "fadd v17.4s, v4.4s, v27.4s\n"
+      "fmul v15.4s, v15.4s, v0.s[0]\n"
+      "mov v4.16b, v27.16b\n"
+      "fmla v12.4s, v21.4s, v0.s[1]\n"
+      "ldr d22, [x27, x21]\n"
+      "fadd v18.4s, v18.4s, v2.4s\n"
+      "fadd v19.4s, v19.4s, v2.4s\n"
+      "fadd v17.4s, v17.4s, v30.4s\n"
+      "fmla v4.4s, v30.4s, v0.s[1]\n"
+      "fadd v25.4s, v28.4s, v22.4s\n"
+      "fsub v27.4s, v28.4s, v22.4s\n"
+      "fadd v12.4s, v12.4s, v20.4s\n"
+      "ldr d29, [x27, x22]\n"
+      "str d18, [%[outptr0]]\n"
+      "fadd v22.4s, v16.4s, v23.4s\n"
+      "str d19, [x28]\n"
+      "fadd v28.4s, v24.4s, v25.4s\n"
+      "ldr d30, [x27, x23]\n"
+      "fadd v20.4s, v29.4s, v30.4s\n"
+      "fsub v18.4s, v29.4s, v30.4s\n"
+      "mov v21.16b, v25.16b\n"
+      "ldr d25, [x27, x24]\n"
+      "fmla v16.4s, v23.4s, v0.s[1]\n"
+      "add x27, x27, #8\n"
+      "mov v24.16b, v27.16b\n"
+      "fadd v17.4s, v17.4s, v2.4s\n"
+      "fadd v28.4s, v28.4s, v20.4s\n"
+      "fmul v18.4s, v18.4s, v0.s[0]\n"
+      "fmla v21.4s, v20.4s, v0.s[1]\n"
+      "fadd v13.4s, v14.4s, v26.4s\n"
+      "fsub v30.4s, v14.4s, v26.4s\n"
+      "mov v14.16b, v8.16b\n"
+      "str d17, [%[outptr0], x15]\n"
+      "fadd v29.4s, v11.4s, v15.4s\n"
+      "fadd v23.4s, v27.4s, v18.4s\n"
+      "fmla v24.4s, v18.4s, v0.s[1]\n"
+      "fadd v16.4s, v16.4s, v28.4s\n"
+      "fadd v10.4s, v10.4s, v13.4s\n"
+      "fmul v30.4s, v30.4s, v0.s[0]\n"
+      "fmla v7.4s, v13.4s, v0.s[1]\n"
+      "mov v26.16b, v11.16b\n"
+      "fadd v13.4s, v3.4s, v6.4s\n"
+      "fadd v24.4s, v24.4s, v25.4s\n"
+      "fadd v27.4s, v9.4s, v12.4s\n"
+      "fsub v3.4s, v3.4s, v6.4s\n"
+      "fsub v6.4s, v9.4s, v12.4s\n"
+      "fadd v8.4s, v8.4s, v30.4s\n"
+      "fmla v14.4s, v30.4s, v0.s[1]\n"
+      "fmla v26.4s, v15.4s, v0.s[1]\n"
+      "fadd v1.4s, v1.4s, v13.4s\n"
+      "mov v5.16b, v13.16b\n"
+      "fadd v10.4s, v10.4s, v2.4s\n"
+      "fmul v6.4s, v6.4s, v0.s[0]\n"
+      "mov v13.16b, v3.16b\n"
+      "fadd v14.4s, v14.4s, v23.4s\n"
+      "fadd v22.4s, v22.4s, v2.4s\n"
+      "fadd v26.4s, v26.4s, v21.4s\n"
+      "fadd v1.4s, v1.4s, v27.4s\n"
+      "str d10, [%[outptr0], %[output_col_stride1]]\n"
+      "fmla v5.4s, v27.4s, v0.s[1]\n"
+      "fadd v10.4s, v3.4s, v6.4s\n"
+      "fmla v13.4s, v6.4s, v0.s[1]\n"
+      "str d22, [x17]\n"
+      "fadd v8.4s, v8.4s, v2.4s\n"
+      "fadd v1.4s, v1.4s, v2.4s\n"
+      "fadd v29.4s, v29.4s, v2.4s\n"
+      "fadd v7.4s, v7.4s, v2.4s\n"
+      "fadd v4.4s, v4.4s, v2.4s\n"
+      "fadd v13.4s, v13.4s, v24.4s\n"
+      "fadd v10.4s, v10.4s, v2.4s\n"
+      "str d8, [x17, %[output_col_stride1]]\n"
+      "fadd v5.4s, v5.4s, v2.4s\n"
+      "str d1, [%[outptr0], x16]\n"
+      "fadd v16.4s, v16.4s, v2.4s\n"
+      "str d29, [x17, x15]\n"
+      "fadd v14.4s, v14.4s, v2.4s\n"
+      "str d10, [x17, x16]\n"
+      "fadd v26.4s, v26.4s, v2.4s\n"
+      "str d7, [x28, %[output_col_stride1]]\n"
+      "fadd v13.4s, v13.4s, v2.4s\n"
+      "str d4, [x28, x15]\n"
+      "add %[outptr0], %[outptr0], #8\n"
+      "str d5, [x28, x16]\n"
+      "add x17, x17, #8\n"
+      "str d16, [x18]\n"
+      "add x28, x28, #8\n"
+      "str d14, [x18, %[output_col_stride1]]\n"
+      "str d26, [x18, x15]\n"
+      "str d13, [x18, x16]\n"
+      "add x18, x18, #8\n"
+      "5:\n"  // Scalar
+      "cbz x20, 6f\n"
+      "ldr s19, [%[inptr0]]\n"
+      "ldr s20, [%[inptr0], %[in_col_stride1]]\n"
+      "ldr s4, [%[inptr0], x21]\n"
+      "fadd v1.4s, v20.4s, v4.4s\n"
+      "ldr s17, [%[inptr0], x22]\n"
+      "fsub v7.4s, v20.4s, v4.4s\n"
+      "ldr s22, [%[inptr0], x23]\n"
+      "fadd v5.4s, v17.4s, v22.4s\n"
+      "ldr s18, [%[inptr0], x24]\n"
+      "fsub v10.4s, v17.4s, v22.4s\n"
+      "ldr s25, [x25]\n"
+      "fadd v8.4s, v19.4s, v1.4s\n"
+      "ldr s12, [x25, %[in_col_stride1]]\n"
+      "mov v4.16b, v1.16b\n"
+      "ldr s23, [x25, x21]\n"
+      "mov v1.16b, v7.16b\n"
+      "ldr s9, [x25, x22]\n"
+      "fmul v10.4s, v10.4s, v0.s[0]\n"
+      "ldr s11, [x25, x23]\n"
+      "fadd v8.4s, v8.4s, v5.4s\n"
+      "ldr s6, [x25, x24]\n"
+      "fmla v4.4s, v5.4s, v0.s[1]\n"
+      "ldr s2, [x13]\n"
+      "fadd v3.4s, v12.4s, v23.4s\n"
+      "ldr s21, [x13, %[in_col_stride1]]\n"
+      "fadd v7.4s, v7.4s, v10.4s\n"
+      "ldr s26, [x13, x21]\n"
+      "fmla v1.4s, v10.4s, v0.s[1]\n"
+      "ldr s17, [x13, x22]\n"
+      "fadd v27.4s, v9.4s, v11.4s\n"
+      "ldr s19, [x13, x23]\n"
+      "fadd v14.4s, v25.4s, v3.4s\n"
+      "ldr s10, [x13, x24]\n"
+      "fsub v16.4s, v12.4s, v23.4s\n"
+      "ldr s15, [x26]\n"
+      "fadd v1.4s, v1.4s, v18.4s\n"
+      "ldr s12, [x26, %[in_col_stride1]]\n"
+      "fsub v9.4s, v9.4s, v11.4s\n"
+      "ldr s20, [x26, x21]\n"
+      "fadd v14.4s, v14.4s, v27.4s\n"
+      "ldr s24, [x26, x22]\n"
+      "mov v11.16b, v3.16b\n"
+      "ldr s29, [x26, x23]\n"
+      "mov v3.16b, v16.16b\n"
+      "ldr s5, [x26, x24]\n"
+      "fmul v9.4s, v9.4s, v0.s[0]\n"
+      "ldr s18, [x14]\n"
+      "fmla v11.4s, v27.4s, v0.s[1]\n"
+      "fadd v23.4s, v21.4s, v26.4s\n"
+      "fsub v21.4s, v21.4s, v26.4s\n"
+      "fadd v30.4s, v17.4s, v19.4s\n"
+      "fsub v19.4s, v17.4s, v19.4s\n"
+      "ldr s22, [x14, %[in_col_stride1]]\n"
+      "fadd v13.4s, v16.4s, v9.4s\n"
+      "fmla v3.4s, v9.4s, v0.s[1]\n"
+      "fadd v16.4s, v2.4s, v23.4s\n"
+      "mov v17.16b, v23.16b\n"
+      "fadd v26.4s, v12.4s, v20.4s\n"
+      "fsub v9.4s, v12.4s, v20.4s\n"
+      "fmul v19.4s, v19.4s, v0.s[0]\n"
+      "ldr s28, [x14, x21]\n"
+      "fadd v3.4s, v3.4s, v6.4s\n"
+      "ldr s20, [x14, x22]\n"
+      "fadd v16.4s, v16.4s, v30.4s\n"
+      "fmla v17.4s, v30.4s, v0.s[1]\n"
+      "fadd v25.4s, v24.4s, v29.4s\n"
+      "fadd v23.4s, v15.4s, v26.4s\n"
+      "fsub v12.4s, v24.4s, v29.4s\n"
+      "mov v15.16b, v26.16b\n"
+      "fadd v24.4s, v22.4s, v28.4s\n"
+      "fsub v22.4s, v22.4s, v28.4s\n"
+      "fadd v30.4s, v21.4s, v19.4s\n"
+      "mov v6.16b, v21.16b\n"
+      "fadd v23.4s, v23.4s, v25.4s\n"
+      "fmla v15.4s, v25.4s, v0.s[1]\n"
+      "fmul v12.4s, v12.4s, v0.s[0]\n"
+      "ldr s28, [x14, x23]\n"
+      "fmla v6.4s, v19.4s, v0.s[1]\n"
+      "fadd v27.4s, v20.4s, v28.4s\n"
+      "fadd v26.4s, v18.4s, v24.4s\n"
+      "fsub v21.4s, v20.4s, v28.4s\n"
+      "mov v25.16b, v24.16b\n"
+      "fadd v29.4s, v14.4s, v16.4s\n"
+      "fsub v16.4s, v14.4s, v16.4s\n"
+      "ldr s20, [x14, x24]\n"
+      "fadd v6.4s, v6.4s, v10.4s\n"
+      "ldr s24, [x27]\n"
+      "fadd v26.4s, v26.4s, v27.4s\n"
+      "fmul v21.4s, v21.4s, v0.s[0]\n"
+      "fmla v25.4s, v27.4s, v0.s[1]\n"
+      "fadd v18.4s, v8.4s, v29.4s\n"
+      "mov v19.16b, v29.16b\n"
+      "fadd v29.4s, v13.4s, v30.4s\n"
+      "fsub v8.4s, v13.4s, v30.4s\n"
+      "fadd v27.4s, v11.4s, v17.4s\n"
+      "fsub v11.4s, v11.4s, v17.4s\n"
+      "fadd v13.4s, v23.4s, v26.4s\n"
+      "fsub v23.4s, v23.4s, v26.4s\n"
+      "ldr s28, [x27, %[in_col_stride1]]\n"
+      "fadd v10.4s, v7.4s, v29.4s\n"
+      "mov v7.16b, v29.16b\n"
+      "fadd v17.4s, v4.4s, v27.4s\n"
+      "mov v4.16b, v27.16b\n"
+      "fadd v18.4s, v18.4s, v13.4s\n"
+      "fmla v19.4s, v13.4s, v0.s[1]\n"
+      "fmul v23.4s, v23.4s, v0.s[0]\n"
+      "fadd v30.4s, v15.4s, v25.4s\n"
+      "fsub v15.4s, v15.4s, v25.4s\n"
+      "fadd v13.4s, v3.4s, v6.4s\n"
+      "fsub v3.4s, v3.4s, v6.4s\n"
+      "ldr s2, [%[bptr]]\n"
+      "fadd v18.4s, v18.4s, v2.4s\n"
+      "fadd v19.4s, v19.4s, v2.4s\n"
+      "fadd v17.4s, v17.4s, v30.4s\n"
+      "fmla v4.4s, v30.4s, v0.s[1]\n"
+      "fadd v14.4s, v9.4s, v12.4s\n"
+      "fmul v15.4s, v15.4s, v0.s[0]\n"
+      "fadd v1.4s, v1.4s, v13.4s\n"
+      "str s18, [%[outptr0]]\n"
+      "fadd v26.4s, v22.4s, v21.4s\n"
+      "str s19, [x28]\n"
+      "fmla v9.4s, v12.4s, v0.s[1]\n"
+      "mov v12.16b, v22.16b\n"
+      "ldr s22, [x27, x21]\n"
+      "fadd v25.4s, v28.4s, v22.4s\n"
+      "fsub v27.4s, v28.4s, v22.4s\n"
+      "fadd v22.4s, v16.4s, v23.4s\n"
+      "fadd v9.4s, v9.4s, v5.4s\n"
+      "ldr s29, [x27, x22]\n"
+      "fmla v12.4s, v21.4s, v0.s[1]\n"
+      "ldr s30, [x27, x23]\n"
+      "fadd v28.4s, v24.4s, v25.4s\n"
+      "mov v21.16b, v25.16b\n"
+      "fmla v16.4s, v23.4s, v0.s[1]\n"
+      "ldr s25, [x27, x24]\n"
+      "mov v5.16b, v13.16b\n"
+      "fadd v17.4s, v17.4s, v2.4s\n"
+      "fadd v12.4s, v12.4s, v20.4s\n"
+      "fadd v20.4s, v29.4s, v30.4s\n"
+      "fsub v18.4s, v29.4s, v30.4s\n"
+      "mov v24.16b, v27.16b\n"
+      "fadd v22.4s, v22.4s, v2.4s\n"
+      "fadd v4.4s, v4.4s, v2.4s\n"
+      "str s17, [%[outptr0], x15]\n"
+      "fadd v13.4s, v14.4s, v26.4s\n"
+      "fadd v28.4s, v28.4s, v20.4s\n"
+      "fmla v21.4s, v20.4s, v0.s[1]\n"
+      "fmul v18.4s, v18.4s, v0.s[0]\n"
+      "fsub v30.4s, v14.4s, v26.4s\n"
+      "str s22, [x17]\n"
+      "mov v14.16b, v8.16b\n"
+      "str s4, [x28, x15]\n"
+      "fadd v10.4s, v10.4s, v13.4s\n"
+      "fadd v16.4s, v16.4s, v28.4s\n"
+      "fmla v7.4s, v13.4s, v0.s[1]\n"
+      "fadd v23.4s, v27.4s, v18.4s\n"
+      "fmla v24.4s, v18.4s, v0.s[1]\n"
+      "fmul v30.4s, v30.4s, v0.s[0]\n"
+      "fadd v29.4s, v11.4s, v15.4s\n"
+      "mov v26.16b, v11.16b\n"
+      "fadd v27.4s, v9.4s, v12.4s\n"
+      "fsub v6.4s, v9.4s, v12.4s\n"
+      "mov v13.16b, v3.16b\n"
+      "fadd v24.4s, v24.4s, v25.4s\n"
+      "fadd v10.4s, v10.4s, v2.4s\n"
+      "fadd v8.4s, v8.4s, v30.4s\n"
+      "fmla v14.4s, v30.4s, v0.s[1]\n"
+      "fmla v26.4s, v15.4s, v0.s[1]\n"
+      "fadd v1.4s, v1.4s, v27.4s\n"
+      "fmul v6.4s, v6.4s, v0.s[0]\n"
+      "fmla v5.4s, v27.4s, v0.s[1]\n"
+      "str s10, [%[outptr0], %[output_col_stride1]]\n"
+      "fadd v29.4s, v29.4s, v2.4s\n"
+      "fadd v14.4s, v14.4s, v23.4s\n"
+      "fadd v8.4s, v8.4s, v2.4s\n"
+      "fadd v26.4s, v26.4s, v21.4s\n"
+      "fadd v1.4s, v1.4s, v2.4s\n"
+      "fadd v10.4s, v3.4s, v6.4s\n"
+      "fmla v13.4s, v6.4s, v0.s[1]\n"
+      "str s29, [x17, x15]\n"
+      "fadd v7.4s, v7.4s, v2.4s\n"
+      "str s8, [x17, %[output_col_stride1]]\n"
+      "fadd v5.4s, v5.4s, v2.4s\n"
+      "str s1, [%[outptr0], x16]\n"
+      "fadd v16.4s, v16.4s, v2.4s\n"
+      "fadd v13.4s, v13.4s, v24.4s\n"
+      "fadd v10.4s, v10.4s, v2.4s\n"
+      "str s7, [x28, %[output_col_stride1]]\n"
+      "fadd v14.4s, v14.4s, v2.4s\n"
+      "str s5, [x28, x16]\n"
+      "fadd v26.4s, v26.4s, v2.4s\n"
+      "str s16, [x18]\n"
+      "fadd v13.4s, v13.4s, v2.4s\n"
+      "str s10, [x17, x16]\n"
+      "str s14, [x18, %[output_col_stride1]]\n"
+      "str s26, [x18, x15]\n"
+      "str s13, [x18, x16]\n"
+      "6:\n"  // End
+      : [bptr] "+r" (bptr), [outptr0] "+r" (output), [inptr0] "+r" (inptr)
+      : [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [pcoeffs] "r" (coeffs), [n_channels] "r" ((long) n_channels), [in_row_stride] "r" (6 * matrix_stride * sizeof(float)), [in_col_stride1] "r" (matrix_stride * sizeof(float))
+      : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+    );
+  }
+  else
+  {
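+    // No-bias variant: performs the same transform as the branch above but
+    // never reads a bias pointer (note there is no %[bptr] operand below).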
+    __asm__ __volatile__ (
+      "ldr d0, [%[pcoeffs]]\n"
+      "add x21, %[in_col_stride1], %[in_col_stride1]\n"  // Compute input column stride 2
+      "add x22, x21, %[in_col_stride1]\n"  // Compute input column stride 3
+      "add x25, %[inptr0], %[in_row_stride]\n"  // Compute input row pointers
+      "add x15, %[output_col_stride1], %[output_col_stride1]\n"  // Compute output column stride 2
+      "add x23, x22, %[in_col_stride1]\n"  // Compute input column stride 4
+      "add x13, x25, %[in_row_stride]\n"  // Compute input row pointers
+      "add x16, x15, %[output_col_stride1]\n"  // Compute output column stride 3
+      "add x24, x23, %[in_col_stride1]\n"  // Compute input column stride 5
+      "add x26, x13, %[in_row_stride]\n"  // Compute input row pointers
+      "add x17, %[outptr0], %[output_row_stride]\n"  // Compute output row pointer 1
+      "add x14, x26, %[in_row_stride]\n"  // Compute input row pointers
+      "add x28, x17, %[output_row_stride]\n"  // Compute output row pointer 2
+      "lsr x19, %[n_channels], #2\n"  // Compute number of four-channel iterations
+      "add x27, x14, %[in_row_stride]\n"  // Compute input row pointers
+      "add x18, x28, %[output_row_stride]\n"  // Compute output row pointer 3
+      "and x20, %[n_channels], #3\n"  // Compute number of remaining channels
+      "cbz x19, 4f\n"  // Skip quad loop if no four-channel iterations
+      "1:\n"  // Quad head
+      "ldr q17, [%[inptr0]]\n"
+      "subs x19, x19, #1\n"
+      "ldr q23, [%[inptr0], %[in_col_stride1]]\n"
+      "ldr q27, [%[inptr0], x21]\n"
+      "fadd v4.4s, v23.4s, v27.4s\n"
+      "ldr q24, [%[inptr0], x22]\n"
+      "fsub v13.4s, v23.4s, v27.4s\n"
+      "ldr q11, [%[inptr0], x23]\n"
+      "fadd v10.4s, v24.4s, v11.4s\n"
+      "ldr q12, [%[inptr0], x24]\n"
+      "fsub v11.4s, v24.4s, v11.4s\n"
+      "ldr q20, [x25]\n"
+      "fadd v7.4s, v17.4s, v4.4s\n"
+      "ldr q19, [x25, %[in_col_stride1]]\n"
+      "mov v4.16b, v4.16b\n"  // Self-move: no-op
+      "ldr q22, [x25, x21]\n"
+      "mov v1.16b, v13.16b\n"
+      "ldr q14, [x25, x22]\n"
+      "fmul v11.4s, v11.4s, v0.s[0]\n"
+      "ldr q18, [x25, x23]\n"
+      "fadd v7.4s, v7.4s, v10.4s\n"
+      "ldr q3, [x25, x24]\n"
+      "fmla v4.4s, v10.4s, v0.s[1]\n"
+      "fadd v8.4s, v13.4s, v11.4s\n"
+      "fmla v1.4s, v11.4s, v0.s[1]\n"
+      "fadd v1.4s, v1.4s, v12.4s\n"
+      "beq 3f\n"
+      "2:\n"  // Quad loop
+      "fadd v2.4s, v19.4s, v22.4s\n"
+      "ldr q16, [x13]\n"
+      "fadd v23.4s, v14.4s, v18.4s\n"
+      "ldr q21, [x13, %[in_col_stride1]]\n"
+      "fsub v15.4s, v19.4s, v22.4s\n"
+      "ldr q24, [x13, x21]\n"
+      "fsub v31.4s, v14.4s, v18.4s\n"
+      "ldr q25, [x13, x22]\n"
+      "fadd v11.4s, v20.4s, v2.4s\n"
+      "ldr q17, [x13, x23]\n"
+      "mov v13.16b, v2.16b\n"
+      "ldr q9, [x13, x24]\n"
+      "mov v2.16b, v15.16b\n"
+      "ldr q6, [x26]\n"
+      "fmul v31.4s, v31.4s, v0.s[0]\n"
+      "ldr q19, [x26, %[in_col_stride1]]\n"
+      "fadd v11.4s, v11.4s, v23.4s\n"
+      "ldr q22, [x26, x21]\n"
+      "fmla v13.4s, v23.4s, v0.s[1]\n"
+      "ldr q12, [x26, x22]\n"
+      "fadd v29.4s, v21.4s, v24.4s\n"
+      "ldr q26, [x26, x23]\n"
+      "fadd v15.4s, v15.4s, v31.4s\n"
+      "ldr q5, [x26, x24]\n"
+      "fmla v2.4s, v31.4s, v0.s[1]\n"
+      "ldr q10, [x14]\n"
+      "fadd v18.4s, v25.4s, v17.4s\n"
+      "add %[inptr0], %[inptr0], #16\n"
+      "fadd v27.4s, v16.4s, v29.4s\n"
+      "add x25, x25, #16\n"
+      "fsub v14.4s, v21.4s, v24.4s\n"
+      "ldr q30, [x14, %[in_col_stride1]]\n"
+      "fadd v2.4s, v2.4s, v3.4s\n"
+      "ldr q31, [x14, x21]\n"
+      "fsub v28.4s, v25.4s, v17.4s\n"
+      "add x13, x13, #16\n"
+      "fadd v27.4s, v27.4s, v18.4s\n"
+      "add x26, x26, #16\n"
+      "mov v21.16b, v29.16b\n"
+      "subs x19, x19, #1\n"
+      "fadd v20.4s, v19.4s, v22.4s\n"
+      "fsub v17.4s, v19.4s, v22.4s\n"
+      "fmul v28.4s, v28.4s, v0.s[0]\n"
+      "ldr q23, [x14, x22]\n"
+      "fmla v21.4s, v18.4s, v0.s[1]\n"
+      "fadd v29.4s, v12.4s, v26.4s\n"
+      "fsub v16.4s, v12.4s, v26.4s\n"
+      "fadd v25.4s, v30.4s, v31.4s\n"
+      "fadd v24.4s, v6.4s, v20.4s\n"
+      "mov v6.16b, v20.16b\n"
+      "fsub v22.4s, v30.4s, v31.4s\n"
+      "fadd v31.4s, v11.4s, v27.4s\n"
+      "fsub v12.4s, v11.4s, v27.4s\n"
+      "ldr q26, [x14, x23]\n"
+      "fmul v16.4s, v16.4s, v0.s[0]\n"
+      "fmla v6.4s, v29.4s, v0.s[1]\n"
+      "fadd v24.4s, v24.4s, v29.4s\n"
+      "mov v3.16b, v14.16b\n"
+      "fadd v20.4s, v14.4s, v28.4s\n"
+      "fadd v29.4s, v10.4s, v25.4s\n"
+      "mov v10.16b, v25.16b\n"
+      "fadd v25.4s, v7.4s, v31.4s\n"
+      "fmla v3.4s, v28.4s, v0.s[1]\n"
+      "fadd v14.4s, v23.4s, v26.4s\n"
+      "fsub v23.4s, v23.4s, v26.4s\n"
+      "mov v26.16b, v31.16b\n"
+      "fadd v31.4s, v15.4s, v20.4s\n"
+      "fsub v11.4s, v15.4s, v20.4s\n"
+      "fadd v20.4s, v17.4s, v16.4s\n"
+      "mov v7.16b, v17.16b\n"
+      "fadd v3.4s, v3.4s, v9.4s\n"
+      "ldr q18, [x14, x24]\n"
+      "fadd v29.4s, v29.4s, v14.4s\n"
+      "add x14, x14, #16\n"
+      "fmla v7.4s, v16.4s, v0.s[1]\n"
+      "ldr q19, [x27]\n"
+      "fmul v23.4s, v23.4s, v0.s[0]\n"
+      "fmla v10.4s, v14.4s, v0.s[1]\n"
+      "fadd v15.4s, v8.4s, v31.4s\n"
+      "mov v14.16b, v31.16b\n"
+      "fadd v28.4s, v24.4s, v29.4s\n"
+      "fsub v24.4s, v24.4s, v29.4s\n"
+      "fadd v7.4s, v7.4s, v5.4s\n"
+      "ldr q27, [x27, %[in_col_stride1]]\n"
+      "fadd v30.4s, v13.4s, v21.4s\n"
+      "fsub v9.4s, v13.4s, v21.4s\n"
+      "fadd v17.4s, v22.4s, v23.4s\n"
+      "mov v8.16b, v22.16b\n"
+      "fadd v25.4s, v25.4s, v28.4s\n"
+      "fmul v24.4s, v24.4s, v0.s[0]\n"
+      "fmla v26.4s, v28.4s, v0.s[1]\n"
+      "ldr q29, [x27, x21]\n"
+      "fmla v8.4s, v23.4s, v0.s[1]\n"
+      "ldr q28, [x27, x22]\n"
+      "fadd v13.4s, v4.4s, v30.4s\n"
+      "mov v4.16b, v30.16b\n"
+      "str q25, [%[outptr0]]\n"  // Store output (0, 0)
+      "fadd v16.4s, v27.4s, v29.4s\n"
+      "str q26, [x28]\n"  // Store output (2, 0)
+      "fsub v29.4s, v27.4s, v29.4s\n"
+      "fadd v8.4s, v8.4s, v18.4s\n"
+      "ldr q23, [x27, x23]\n"
+      "fadd v30.4s, v28.4s, v23.4s\n"
+      "ldr q25, [x27, x24]\n"
+      "fadd v19.4s, v19.4s, v16.4s\n"
+      "add x27, x27, #16\n"
+      "fsub v27.4s, v28.4s, v23.4s\n"
+      "mov v16.16b, v16.16b\n"  // Self-move: no-op
+      "fadd v22.4s, v20.4s, v17.4s\n"
+      "fsub v20.4s, v20.4s, v17.4s\n"
+      "fadd v21.4s, v12.4s, v24.4s\n"
+      "mov v26.16b, v12.16b\n"
+      "fadd v19.4s, v19.4s, v30.4s\n"
+      "fmla v16.4s, v30.4s, v0.s[1]\n"
+      "fmul v27.4s, v27.4s, v0.s[0]\n"
+      "ldr q17, [%[inptr0]]\n"
+      "fmla v26.4s, v24.4s, v0.s[1]\n"
+      "ldr q23, [%[inptr0], %[in_col_stride1]]\n"
+      "str q21, [x17]\n"  // Store output (1, 0)
+      "mov v5.16b, v29.16b\n"
+      "fadd v15.4s, v15.4s, v22.4s\n"
+      "fmul v20.4s, v20.4s, v0.s[0]\n"
+      "fadd v18.4s, v29.4s, v27.4s\n"
+      "fmla v14.4s, v22.4s, v0.s[1]\n"
+      "fmla v5.4s, v27.4s, v0.s[1]\n"
+      "ldr q27, [%[inptr0], x21]\n"
+      "fadd v26.4s, v26.4s, v19.4s\n"
+      "ldr q24, [%[inptr0], x22]\n"
+      "str q15, [%[outptr0], %[output_col_stride1]]\n"  // Store output (0, 1)
+      "fadd v12.4s, v11.4s, v20.4s\n"
+      "str q14, [x28, %[output_col_stride1]]\n"  // Store output (2, 1)
+      "mov v28.16b, v11.16b\n"
+      "fadd v5.4s, v5.4s, v25.4s\n"
+      "ldr q11, [%[inptr0], x23]\n"
+      "str q26, [x18]\n"  // Store output (3, 0)
+      "fadd v21.4s, v6.4s, v10.4s\n"
+      "str q12, [x17, %[output_col_stride1]]\n"  // Store output (1, 1)
+      "fmla v28.4s, v20.4s, v0.s[1]\n"
+      "fsub v10.4s, v6.4s, v10.4s\n"
+      "ldr q12, [%[inptr0], x24]\n"
+      "mov v15.16b, v9.16b\n"
+      "ldr q20, [x25]\n"
+      "fadd v13.4s, v13.4s, v21.4s\n"
+      "ldr q19, [x25, %[in_col_stride1]]\n"
+      "fadd v28.4s, v28.4s, v18.4s\n"
+      "ldr q22, [x25, x21]\n"
+      "fmul v10.4s, v10.4s, v0.s[0]\n"
+      "ldr q14, [x25, x22]\n"
+      "fmla v4.4s, v21.4s, v0.s[1]\n"
+      "ldr q18, [x25, x23]\n"
+      "str q13, [%[outptr0], x15]\n"  // Store output (0, 2)
+      "fadd v6.4s, v2.4s, v3.4s\n"
+      "str q28, [x18, %[output_col_stride1]]\n"  // Store output (3, 1)
+      "fadd v30.4s, v7.4s, v8.4s\n"
+      "fadd v13.4s, v9.4s, v10.4s\n"
+      "fmla v15.4s, v10.4s, v0.s[1]\n"
+      "str q4, [x28, x15]\n"  // Store output (2, 2)
+      "fsub v2.4s, v2.4s, v3.4s\n"
+      "fadd v1.4s, v1.4s, v6.4s\n"
+      "ldr q3, [x25, x24]\n"
+      "fsub v8.4s, v7.4s, v8.4s\n"
+      "mov v6.16b, v6.16b\n"  // Self-move: no-op
+      "str q13, [x17, x15]\n"  // Store output (1, 2)
+      "fadd v15.4s, v15.4s, v16.4s\n"
+      "mov v9.16b, v2.16b\n"
+      "fadd v4.4s, v23.4s, v27.4s\n"
+      "fadd v1.4s, v1.4s, v30.4s\n"
+      "fmla v6.4s, v30.4s, v0.s[1]\n"
+      "fmul v8.4s, v8.4s, v0.s[0]\n"
+      "fadd v10.4s, v24.4s, v11.4s\n"
+      "str q15, [x18, x15]\n"  // Store output (3, 2)
+      "fsub v13.4s, v23.4s, v27.4s\n"
+      "fadd v7.4s, v17.4s, v4.4s\n"
+      "fsub v11.4s, v24.4s, v11.4s\n"
+      "str q1, [%[outptr0], x16]\n"  // Store output (0, 3)
+      "mov v4.16b, v4.16b\n"  // Self-move: no-op
+      "str q6, [x28, x16]\n"  // Store output (2, 3)
+      "fadd v2.4s, v2.4s, v8.4s\n"
+      "fmla v9.4s, v8.4s, v0.s[1]\n"
+      "add %[outptr0], %[outptr0], #16\n"
+      "fadd v7.4s, v7.4s, v10.4s\n"
+      "add x28, x28, #16\n"
+      "fmul v11.4s, v11.4s, v0.s[0]\n"
+      "fmla v4.4s, v10.4s, v0.s[1]\n"
+      "str q2, [x17, x16]\n"  // Store output (1, 3)
+      "mov v1.16b, v13.16b\n"
+      "fadd v9.4s, v9.4s, v5.4s\n"
+      "add x17, x17, #16\n"
+      "fadd v8.4s, v13.4s, v11.4s\n"
+      "fmla v1.4s, v11.4s, v0.s[1]\n"
+      "str q9, [x18, x16]\n"  // Store output (3, 3)
+      "add x18, x18, #16\n"
+      "fadd v1.4s, v1.4s, v12.4s\n"
+      "bne 2b\n"
+      "3:\n"  // Quad tail
+      "fadd v2.4s, v19.4s, v22.4s\n"
+      "ldr q16, [x13]\n"
+      "fadd v23.4s, v14.4s, v18.4s\n"
+      "ldr q21, [x13, %[in_col_stride1]]\n"
+      "fsub v15.4s, v19.4s, v22.4s\n"
+      "ldr q24, [x13, x21]\n"
+      "fsub v31.4s, v14.4s, v18.4s\n"
+      "ldr q25, [x13, x22]\n"
+      "fadd v11.4s, v20.4s, v2.4s\n"
+      "ldr q17, [x13, x23]\n"
+      "mov v13.16b, v2.16b\n"
+      "ldr q9, [x13, x24]\n"
+      "mov v2.16b, v15.16b\n"
+      "ldr q6, [x26]\n"
+      "fmul v31.4s, v31.4s, v0.s[0]\n"
+      "ldr q19, [x26, %[in_col_stride1]]\n"
+      "fadd v11.4s, v11.4s, v23.4s\n"
+      "ldr q22, [x26, x21]\n"
+      "fmla v13.4s, v23.4s, v0.s[1]\n"
+      "ldr q12, [x26, x22]\n"
+      "fadd v29.4s, v21.4s, v24.4s\n"
+      "ldr q26, [x26, x23]\n"
+      "fadd v15.4s, v15.4s, v31.4s\n"
+      "ldr q5, [x26, x24]\n"
+      "fmla v2.4s, v31.4s, v0.s[1]\n"
+      "ldr q10, [x14]\n"
+      "fadd v18.4s, v25.4s, v17.4s\n"
+      "add %[inptr0], %[inptr0], #16\n"
+      "fadd v27.4s, v16.4s, v29.4s\n"
+      "add x25, x25, #16\n"
+      "fsub v14.4s, v21.4s, v24.4s\n"
+      "ldr q30, [x14, %[in_col_stride1]]\n"
+      "fadd v2.4s, v2.4s, v3.4s\n"
+      "ldr q31, [x14, x21]\n"
+      "fsub v28.4s, v25.4s, v17.4s\n"
+      "add x13, x13, #16\n"
+      "fadd v27.4s, v27.4s, v18.4s\n"
+      "add x26, x26, #16\n"
+      "mov v21.16b, v29.16b\n"
+      "fadd v20.4s, v19.4s, v22.4s\n"
+      "fsub v17.4s, v19.4s, v22.4s\n"
+      "fadd v29.4s, v12.4s, v26.4s\n"
+      "fmul v28.4s, v28.4s, v0.s[0]\n"
+      "fsub v16.4s, v12.4s, v26.4s\n"
+      "fmla v21.4s, v18.4s, v0.s[1]\n"
+      "ldr q23, [x14, x22]\n"
+      "fadd v24.4s, v6.4s, v20.4s\n"
+      "mov v6.16b, v20.16b\n"
+      "fadd v25.4s, v30.4s, v31.4s\n"
+      "fsub v22.4s, v30.4s, v31.4s\n"
+      "fadd v20.4s, v14.4s, v28.4s\n"
+      "mov v3.16b, v14.16b\n"
+      "fmul v16.4s, v16.4s, v0.s[0]\n"
+      "fmla v6.4s, v29.4s, v0.s[1]\n"
+      "fadd v24.4s, v24.4s, v29.4s\n"
+      "ldr q26, [x14, x23]\n"
+      "fmla v3.4s, v28.4s, v0.s[1]\n"
+      "fadd v14.4s, v23.4s, v26.4s\n"
+      "fadd v29.4s, v10.4s, v25.4s\n"
+      "fsub v23.4s, v23.4s, v26.4s\n"
+      "mov v10.16b, v25.16b\n"
+      "fadd v31.4s, v11.4s, v27.4s\n"
+      "fsub v12.4s, v11.4s, v27.4s\n"
+      "ldr q18, [x14, x24]\n"
+      "fadd v3.4s, v3.4s, v9.4s\n"
+      "ldr q19, [x27]\n"
+      "fadd v29.4s, v29.4s, v14.4s\n"
+      "add x14, x14, #16\n"
+      "fmul v23.4s, v23.4s, v0.s[0]\n"
+      "fmla v10.4s, v14.4s, v0.s[1]\n"
+      "fadd v25.4s, v7.4s, v31.4s\n"
+      "mov v26.16b, v31.16b\n"
+      "fadd v31.4s, v15.4s, v20.4s\n"
+      "fsub v11.4s, v15.4s, v20.4s\n"
+      "fadd v28.4s, v24.4s, v29.4s\n"
+      "fsub v24.4s, v24.4s, v29.4s\n"
+      "fadd v30.4s, v13.4s, v21.4s\n"
+      "fsub v9.4s, v13.4s, v21.4s\n"
+      "fadd v20.4s, v17.4s, v16.4s\n"
+      "mov v7.16b, v17.16b\n"
+      "fadd v15.4s, v8.4s, v31.4s\n"
+      "mov v14.16b, v31.16b\n"
+      "fadd v25.4s, v25.4s, v28.4s\n"
+      "fmul v24.4s, v24.4s, v0.s[0]\n"
+      "fmla v7.4s, v16.4s, v0.s[1]\n"
+      "ldr q27, [x27, %[in_col_stride1]]\n"
+      "fmla v26.4s, v28.4s, v0.s[1]\n"
+      "ldr q29, [x27, x21]\n"
+      "fadd v13.4s, v4.4s, v30.4s\n"
+      "mov v4.16b, v30.16b\n"
+      "str q25, [%[outptr0]]\n"  // Store output (0, 0)
+      "fadd v17.4s, v22.4s, v23.4s\n"
+      "fadd v7.4s, v7.4s, v5.4s\n"
+      "ldr q28, [x27, x22]\n"
+      "str q26, [x28]\n"  // Store output (2, 0)
+      "mov v8.16b, v22.16b\n"
+      "fadd v16.4s, v27.4s, v29.4s\n"
+      "fsub v29.4s, v27.4s, v29.4s\n"
+      "fadd v21.4s, v12.4s, v24.4s\n"
+      "mov v26.16b, v12.16b\n"
+      "fmla v8.4s, v23.4s, v0.s[1]\n"
+      "fadd v22.4s, v20.4s, v17.4s\n"
+      "fsub v20.4s, v20.4s, v17.4s\n"
+      "ldr q23, [x27, x23]\n"
+      "fadd v19.4s, v19.4s, v16.4s\n"
+      "mov v16.16b, v16.16b\n"  // Self-move: no-op
+      "str q21, [x17]\n"  // Store output (1, 0)
+      "fadd v30.4s, v28.4s, v23.4s\n"
+      "fadd v8.4s, v8.4s, v18.4s\n"
+      "ldr q25, [x27, x24]\n"
+      "fsub v27.4s, v28.4s, v23.4s\n"
+      "add x27, x27, #16\n"
+      "mov v5.16b, v29.16b\n"
+      "fmla v26.4s, v24.4s, v0.s[1]\n"
+      "fadd v19.4s, v19.4s, v30.4s\n"
+      "fmla v16.4s, v30.4s, v0.s[1]\n"
+      "fadd v15.4s, v15.4s, v22.4s\n"
+      "fmul v20.4s, v20.4s, v0.s[0]\n"
+      "fmul v27.4s, v27.4s, v0.s[0]\n"
+      "fmla v14.4s, v22.4s, v0.s[1]\n"
+      "mov v28.16b, v11.16b\n"
+      "fadd v21.4s, v6.4s, v10.4s\n"
+      "fadd v26.4s, v26.4s, v19.4s\n"
+      "fsub v10.4s, v6.4s, v10.4s\n"
+      "str q15, [%[outptr0], %[output_col_stride1]]\n"  // Store output (0, 1)
+      "fadd v12.4s, v11.4s, v20.4s\n"
+      "str q14, [x28, %[output_col_stride1]]\n"  // Store output (2, 1)
+      "fadd v18.4s, v29.4s, v27.4s\n"
+      "fmla v5.4s, v27.4s, v0.s[1]\n"
+      "fmla v28.4s, v20.4s, v0.s[1]\n"
+      "str q26, [x18]\n"  // Store output (3, 0)
+      "fadd v13.4s, v13.4s, v21.4s\n"
+      "str q12, [x17, %[output_col_stride1]]\n"  // Store output (1, 1)
+      "fmul v10.4s, v10.4s, v0.s[0]\n"
+      "fmla v4.4s, v21.4s, v0.s[1]\n"
+      "mov v15.16b, v9.16b\n"
+      "fadd v5.4s, v5.4s, v25.4s\n"
+      "fadd v28.4s, v28.4s, v18.4s\n"
+      "str q13, [%[outptr0], x15]\n"  // Store output (0, 2)
+      "fadd v6.4s, v2.4s, v3.4s\n"
+      "fadd v13.4s, v9.4s, v10.4s\n"
+      "fmla v15.4s, v10.4s, v0.s[1]\n"
+      "str q4, [x28, x15]\n"  // Store output (2, 2)
+      "fadd v30.4s, v7.4s, v8.4s\n"
+      "str q28, [x18, %[output_col_stride1]]\n"  // Store output (3, 1)
+      "fsub v2.4s, v2.4s, v3.4s\n"
+      "fadd v1.4s, v1.4s, v6.4s\n"
+      "fsub v8.4s, v7.4s, v8.4s\n"
+      "str q13, [x17, x15]\n"  // Store output (1, 2)
+      "fadd v15.4s, v15.4s, v16.4s\n"
+      "mov v6.16b, v6.16b\n"  // Self-move: no-op
+      "mov v9.16b, v2.16b\n"
+      "fadd v1.4s, v1.4s, v30.4s\n"
+      "fmul v8.4s, v8.4s, v0.s[0]\n"
+      "str q15, [x18, x15]\n"  // Store output (3, 2)
+      "fmla v6.4s, v30.4s, v0.s[1]\n"
+      "str q1, [%[outptr0], x16]\n"  // Store output (0, 3)
+      "fadd v2.4s, v2.4s, v8.4s\n"
+      "str q6, [x28, x16]\n"  // Store output (2, 3)
+      "fmla v9.4s, v8.4s, v0.s[1]\n"
+      "add %[outptr0], %[outptr0], #16\n"
+      "add x28, x28, #16\n"
+      "str q2, [x17, x16]\n"  // Store output (1, 3)
+      "fadd v9.4s, v9.4s, v5.4s\n"
+      "add x17, x17, #16\n"
+      "str q9, [x18, x16]\n"  // Store output (3, 3)
+      "add x18, x18, #16\n"
+      "4:\n"  // Double
+      "cmp x20, #2\n"
+      "blt 5f\n"
+      "ldr d17, [%[inptr0]]\n"
+      "ldr d23, [%[inptr0], %[in_col_stride1]]\n"
+      "sub x20, x20, #2\n"
+      "ldr d27, [%[inptr0], x21]\n"
+      "ldr d24, [%[inptr0], x22]\n"
+      "fadd v4.4s, v23.4s, v27.4s\n"
+      "ldr d11, [%[inptr0], x23]\n"
+      "fadd v10.4s, v24.4s, v11.4s\n"
+      "ldr d12, [%[inptr0], x24]\n"
+      "fsub v13.4s, v23.4s, v27.4s\n"
+      "ldr d20, [x25]\n"
+      "fsub v11.4s, v24.4s, v11.4s\n"
+      "ldr d19, [x25, %[in_col_stride1]]\n"
+      "fadd v7.4s, v17.4s, v4.4s\n"
+      "ldr d22, [x25, x21]\n"
+      "mov v4.16b, v4.16b\n"  // Self-move: no-op
+      "ldr d14, [x25, x22]\n"
+      "mov v1.16b, v13.16b\n"
+      "ldr d18, [x25, x23]\n"
+      "fmul v11.4s, v11.4s, v0.s[0]\n"
+      "ldr d3, [x25, x24]\n"
+      "fadd v7.4s, v7.4s, v10.4s\n"
+      "ldr d16, [x13]\n"
+      "fmla v4.4s, v10.4s, v0.s[1]\n"
+      "ldr d21, [x13, %[in_col_stride1]]\n"
+      "fadd v2.4s, v19.4s, v22.4s\n"
+      "ldr d24, [x13, x21]\n"
+      "fadd v8.4s, v13.4s, v11.4s\n"
+      "ldr d25, [x13, x22]\n"
+      "fmla v1.4s, v11.4s, v0.s[1]\n"
+      "ldr d17, [x13, x23]\n"
+      "fadd v23.4s, v14.4s, v18.4s\n"
+      "ldr d9, [x13, x24]\n"
+      "fadd v11.4s, v20.4s, v2.4s\n"
+      "ldr d6, [x26]\n"
+      "fsub v15.4s, v19.4s, v22.4s\n"
+      "ldr d19, [x26, %[in_col_stride1]]\n"
+      "fadd v1.4s, v1.4s, v12.4s\n"
+      "ldr d22, [x26, x21]\n"
+      "fsub v31.4s, v14.4s, v18.4s\n"
+      "ldr d12, [x26, x22]\n"
+      "fadd v11.4s, v11.4s, v23.4s\n"
+      "ldr d26, [x26, x23]\n"
+      "mov v13.16b, v2.16b\n"
+      "ldr d5, [x26, x24]\n"
+      "mov v2.16b, v15.16b\n"
+      "ldr d10, [x14]\n"
+      "fmul v31.4s, v31.4s, v0.s[0]\n"
+      "add %[inptr0], %[inptr0], #8\n"
+      "fmla v13.4s, v23.4s, v0.s[1]\n"
+      "add x25, x25, #8\n"
+      "fadd v29.4s, v21.4s, v24.4s\n"
+      "add x13, x13, #8\n"
+      "fsub v14.4s, v21.4s, v24.4s\n"
+      "ldr d30, [x14, %[in_col_stride1]]\n"
+      "fadd v15.4s, v15.4s, v31.4s\n"
+      "add x26, x26, #8\n"
+      "fmla v2.4s, v31.4s, v0.s[1]\n"
+      "fadd v18.4s, v25.4s, v17.4s\n"
+      "fadd v27.4s, v16.4s, v29.4s\n"
+      "fsub v28.4s, v25.4s, v17.4s\n"
+      "mov v21.16b, v29.16b\n"
+      "fadd v20.4s, v19.4s, v22.4s\n"
+      "fsub v17.4s, v19.4s, v22.4s\n"
+      "ldr d31, [x14, x21]\n"
+      "fadd v2.4s, v2.4s, v3.4s\n"
+      "ldr d23, [x14, x22]\n"
+      "fadd v27.4s, v27.4s, v18.4s\n"
+      "fmul v28.4s, v28.4s, v0.s[0]\n"
+      "fmla v21.4s, v18.4s, v0.s[1]\n"
+      "fadd v29.4s, v12.4s, v26.4s\n"
+      "fadd v24.4s, v6.4s, v20.4s\n"
+      "fsub v16.4s, v12.4s, v26.4s\n"
+      "mov v6.16b, v20.16b\n"
+      "fadd v25.4s, v30.4s, v31.4s\n"
+      "fsub v22.4s, v30.4s, v31.4s\n"
+      "fadd v31.4s, v11.4s, v27.4s\n"
+      "fsub v12.4s, v11.4s, v27.4s\n"
+      "ldr d26, [x14, x23]\n"
+      "fadd v24.4s, v24.4s, v29.4s\n"
+      "fmul v16.4s, v16.4s, v0.s[0]\n"
+      "fmla v6.4s, v29.4s, v0.s[1]\n"
+      "mov v3.16b, v14.16b\n"
+      "fadd v20.4s, v14.4s, v28.4s\n"
+      "fadd v29.4s, v10.4s, v25.4s\n"
+      "mov v10.16b, v25.16b\n"
+      "fadd v25.4s, v7.4s, v31.4s\n"
+      "fmla v3.4s, v28.4s, v0.s[1]\n"
+      "fadd v14.4s, v23.4s, v26.4s\n"
+      "fsub v23.4s, v23.4s, v26.4s\n"
+      "mov v26.16b, v31.16b\n"
+      "fadd v31.4s, v15.4s, v20.4s\n"
+      "fsub v11.4s, v15.4s, v20.4s\n"
+      "fadd v20.4s, v17.4s, v16.4s\n"
+      "mov v7.16b, v17.16b\n"
+      "fadd v3.4s, v3.4s, v9.4s\n"
+      "ldr d18, [x14, x24]\n"
+      "fadd v29.4s, v29.4s, v14.4s\n"
+      "add x14, x14, #8\n"
+      "fmla v7.4s, v16.4s, v0.s[1]\n"
+      "ldr d19, [x27]\n"
+      "fmul v23.4s, v23.4s, v0.s[0]\n"
+      "fmla v10.4s, v14.4s, v0.s[1]\n"
+      "fadd v15.4s, v8.4s, v31.4s\n"
+      "mov v14.16b, v31.16b\n"
+      "fadd v28.4s, v24.4s, v29.4s\n"
+      "fsub v24.4s, v24.4s, v29.4s\n"
+      "fadd v7.4s, v7.4s, v5.4s\n"
+      "ldr d27, [x27, %[in_col_stride1]]\n"
+      "fadd v30.4s, v13.4s, v21.4s\n"
+      "fsub v9.4s, v13.4s, v21.4s\n"
+      "fadd v17.4s, v22.4s, v23.4s\n"
+      "mov v8.16b, v22.16b\n"
+      "fadd v25.4s, v25.4s, v28.4s\n"
+      "fmul v24.4s, v24.4s, v0.s[0]\n"
+      "fmla v26.4s, v28.4s, v0.s[1]\n"
+      "ldr d29, [x27, x21]\n"
+      "fmla v8.4s, v23.4s, v0.s[1]\n"
+      "ldr d28, [x27, x22]\n"
+      "fadd v13.4s, v4.4s, v30.4s\n"
+      "mov v4.16b, v30.16b\n"
+      "str d25, [%[outptr0]]\n"  // Store output (0, 0)
+      "fadd v16.4s, v27.4s, v29.4s\n"
+      "str d26, [x28]\n"  // Store output (2, 0)
+      "fsub v29.4s, v27.4s, v29.4s\n"
+      "fadd v8.4s, v8.4s, v18.4s\n"
+      "ldr d23, [x27, x23]\n"
+      "fadd v30.4s, v28.4s, v23.4s\n"
+      "ldr d25, [x27, x24]\n"
+      "fadd v19.4s, v19.4s, v16.4s\n"
+      "add x27, x27, #8\n"
+      "fsub v27.4s, v28.4s, v23.4s\n"
+      "mov v16.16b, v16.16b\n"  // Self-move: no-op
+      "fadd v22.4s, v20.4s, v17.4s\n"
+      "fsub v20.4s, v20.4s, v17.4s\n"
+      "fadd v21.4s, v12.4s, v24.4s\n"
+      "mov v26.16b, v12.16b\n"
+      "fadd v19.4s, v19.4s, v30.4s\n"
+      "fmla v16.4s, v30.4s, v0.s[1]\n"
+      "fmul v27.4s, v27.4s, v0.s[0]\n"
+      "mov v5.16b, v29.16b\n"
+      "fmla v26.4s, v24.4s, v0.s[1]\n"
+      "fadd v15.4s, v15.4s, v22.4s\n"
+      "str d21, [x17]\n"  // Store output (1, 0)
+      "fmul v20.4s, v20.4s, v0.s[0]\n"
+      "fmla v14.4s, v22.4s, v0.s[1]\n"
+      "mov v28.16b, v11.16b\n"
+      "fadd v18.4s, v29.4s, v27.4s\n"
+      "fmla v5.4s, v27.4s, v0.s[1]\n"
+      "str d15, [%[outptr0], %[output_col_stride1]]\n"  // Store output (0, 1)
+      "fadd v26.4s, v26.4s, v19.4s\n"
+      "fadd v12.4s, v11.4s, v20.4s\n"
+      "fmla v28.4s, v20.4s, v0.s[1]\n"
+      "str d14, [x28, %[output_col_stride1]]\n"  // Store output (2, 1)
+      "fadd v21.4s, v6.4s, v10.4s\n"
+      "fadd v5.4s, v5.4s, v25.4s\n"
+      "fsub v10.4s, v6.4s, v10.4s\n"
+      "str d26, [x18]\n"  // Store output (3, 0)
+      "mov v15.16b, v9.16b\n"
+      "str d12, [x17, %[output_col_stride1]]\n"  // Store output (1, 1)
+      "fadd v28.4s, v28.4s, v18.4s\n"
+      "fadd v13.4s, v13.4s, v21.4s\n"
+      "fmla v4.4s, v21.4s, v0.s[1]\n"
+      "fmul v10.4s, v10.4s, v0.s[0]\n"
+      "fadd v6.4s, v2.4s, v3.4s\n"
+      "fadd v30.4s, v7.4s, v8.4s\n"
+      "fsub v2.4s, v2.4s, v3.4s\n"
+      "str d28, [x18, %[output_col_stride1]]\n"  // Store output (3, 1)
+      "fsub v8.4s, v7.4s, v8.4s\n"
+      "str d13, [%[outptr0], x15]\n"  // Store output (0, 2)
+      "str d4, [x28, x15]\n"  // Store output (2, 2)
+      "fadd v13.4s, v9.4s, v10.4s\n"
+      "fmla v15.4s, v10.4s, v0.s[1]\n"
+      "fadd v1.4s, v1.4s, v6.4s\n"
+      "mov v6.16b, v6.16b\n"  // Self-move: no-op
+      "fmul v8.4s, v8.4s, v0.s[0]\n"
+      "mov v9.16b, v2.16b\n"
+      "str d13, [x17, x15]\n"  // Store output (1, 2)
+      "fadd v15.4s, v15.4s, v16.4s\n"
+      "fadd v1.4s, v1.4s, v30.4s\n"
+      "fmla v6.4s, v30.4s, v0.s[1]\n"
+      "fadd v2.4s, v2.4s, v8.4s\n"
+      "fmla v9.4s, v8.4s, v0.s[1]\n"
+      "str d15, [x18, x15]\n"  // Store output (3, 2)
+      "str d1, [%[outptr0], x16]\n"  // Store output (0, 3)
+      "str d2, [x17, x16]\n"  // Store output (1, 3)
+      "fadd v9.4s, v9.4s, v5.4s\n"
+      "str d6, [x28, x16]\n"  // Store output (2, 3)
+      "add %[outptr0], %[outptr0], #8\n"
+      "add x17, x17, #8\n"
+      "add x28, x28, #8\n"
+      "str d9, [x18, x16]\n"  // Store output (3, 3)
+      "add x18, x18, #8\n"
+      "5:\n"  // Scalar
+      "cbz x20, 6f\n"
+      "ldr s17, [%[inptr0]]\n"
+      "ldr s23, [%[inptr0], %[in_col_stride1]]\n"
+      "ldr s27, [%[inptr0], x21]\n"
+      "fadd v4.4s, v23.4s, v27.4s\n"
+      "ldr s24, [%[inptr0], x22]\n"
+      "fsub v13.4s, v23.4s, v27.4s\n"
+      "ldr s11, [%[inptr0], x23]\n"
+      "fadd v10.4s, v24.4s, v11.4s\n"
+      "ldr s12, [%[inptr0], x24]\n"
+      "fsub v11.4s, v24.4s, v11.4s\n"
+      "ldr s20, [x25]\n"
+      "fadd v7.4s, v17.4s, v4.4s\n"
+      "ldr s19, [x25, %[in_col_stride1]]\n"
+      "mov v4.16b, v4.16b\n"  // Self-move: no-op
+      "ldr s22, [x25, x21]\n"
+      "mov v1.16b, v13.16b\n"
+      "ldr s14, [x25, x22]\n"
+      "fmul v11.4s, v11.4s, v0.s[0]\n"
+      "ldr s18, [x25, x23]\n"
+      "fadd v7.4s, v7.4s, v10.4s\n"
+      "ldr s3, [x25, x24]\n"
+      "fmla v4.4s, v10.4s, v0.s[1]\n"
+      "ldr s16, [x13]\n"
+      "fadd v2.4s, v19.4s, v22.4s\n"
+      "ldr s21, [x13, %[in_col_stride1]]\n"
+      "fadd v8.4s, v13.4s, v11.4s\n"
+      "ldr s24, [x13, x21]\n"
+      "fmla v1.4s, v11.4s, v0.s[1]\n"
+      "ldr s25, [x13, x22]\n"
+      "fadd v23.4s, v14.4s, v18.4s\n"
+      "ldr s17, [x13, x23]\n"
+      "fadd v11.4s, v20.4s, v2.4s\n"
+      "ldr s9, [x13, x24]\n"
+      "fsub v15.4s, v19.4s, v22.4s\n"
+      "ldr s6, [x26]\n"
+      "fadd v1.4s, v1.4s, v12.4s\n"
+      "ldr s19, [x26, %[in_col_stride1]]\n"
+      "fsub v31.4s, v14.4s, v18.4s\n"
+      "ldr s22, [x26, x21]\n"
+      "fadd v11.4s, v11.4s, v23.4s\n"
+      "ldr s12, [x26, x22]\n"
+      "mov v13.16b, v2.16b\n"
+      "ldr s26, [x26, x23]\n"
+      "mov v2.16b, v15.16b\n"
+      "ldr s5, [x26, x24]\n"
+      "fmul v31.4s, v31.4s, v0.s[0]\n"
+      "ldr s10, [x14]\n"
+      "fmla v13.4s, v23.4s, v0.s[1]\n"
+      "fadd v29.4s, v21.4s, v24.4s\n"
+      "fsub v14.4s, v21.4s, v24.4s\n"
+      "fadd v18.4s, v25.4s, v17.4s\n"
+      "fsub v28.4s, v25.4s, v17.4s\n"
+      "ldr s30, [x14, %[in_col_stride1]]\n"
+      "fadd v15.4s, v15.4s, v31.4s\n"
+      "fmla v2.4s, v31.4s, v0.s[1]\n"
+      "fadd v27.4s, v16.4s, v29.4s\n"
+      "mov v21.16b, v29.16b\n"
+      "fadd v20.4s, v19.4s, v22.4s\n"
+      "fsub v17.4s, v19.4s, v22.4s\n"
+      "fmul v28.4s, v28.4s, v0.s[0]\n"
+      "ldr s31, [x14, x21]\n"
+      "fadd v2.4s, v2.4s, v3.4s\n"
+      "ldr s23, [x14, x22]\n"
+      "fadd v27.4s, v27.4s, v18.4s\n"
+      "fmla v21.4s, v18.4s, v0.s[1]\n"
+      "fadd v29.4s, v12.4s, v26.4s\n"
+      "fadd v24.4s, v6.4s, v20.4s\n"
+      "fsub v16.4s, v12.4s, v26.4s\n"
+      "mov v6.16b, v20.16b\n"
+      "fadd v25.4s, v30.4s, v31.4s\n"
+      "fsub v22.4s, v30.4s, v31.4s\n"
+      "fadd v20.4s, v14.4s, v28.4s\n"
+      "mov v3.16b, v14.16b\n"
+      "fadd v24.4s, v24.4s, v29.4s\n"
+      "fmla v6.4s, v29.4s, v0.s[1]\n"
+      "fmul v16.4s, v16.4s, v0.s[0]\n"
+      "ldr s26, [x14, x23]\n"
+      "fmla v3.4s, v28.4s, v0.s[1]\n"
+      "fadd v14.4s, v23.4s, v26.4s\n"
+      "fadd v29.4s, v10.4s, v25.4s\n"
+      "fsub v23.4s, v23.4s, v26.4s\n"
+      "mov v10.16b, v25.16b\n"
+      "fadd v31.4s, v11.4s, v27.4s\n"
+      "fsub v12.4s, v11.4s, v27.4s\n"
+      "ldr s18, [x14, x24]\n"
+      "fadd v3.4s, v3.4s, v9.4s\n"
+      "ldr s19, [x27]\n"
+      "fadd v29.4s, v29.4s, v14.4s\n"
+      "fmul v23.4s, v23.4s, v0.s[0]\n"
+      "fmla v10.4s, v14.4s, v0.s[1]\n"
+      "fadd v25.4s, v7.4s, v31.4s\n"
+      "mov v26.16b, v31.16b\n"
+      "fadd v31.4s, v15.4s, v20.4s\n"
+      "fsub v11.4s, v15.4s, v20.4s\n"
+      "fadd v30.4s, v13.4s, v21.4s\n"
+      "fsub v9.4s, v13.4s, v21.4s\n"
+      "fadd v28.4s, v24.4s, v29.4s\n"
+      "fsub v24.4s, v24.4s, v29.4s\n"
+      "ldr s27, [x27, %[in_col_stride1]]\n"
+      "fadd v15.4s, v8.4s, v31.4s\n"
+      "mov v14.16b, v31.16b\n"
+      "fadd v13.4s, v4.4s, v30.4s\n"
+      "mov v4.16b, v30.16b\n"
+      "fadd v25.4s, v25.4s, v28.4s\n"
+      "fmla v26.4s, v28.4s, v0.s[1]\n"
+      "fmul v24.4s, v24.4s, v0.s[0]\n"
+      "fadd v21.4s, v6.4s, v10.4s\n"
+      "fsub v10.4s, v6.4s, v10.4s\n"
+      "fadd v6.4s, v2.4s, v3.4s\n"
+      "fsub v2.4s, v2.4s, v3.4s\n"
+      "ldr s29, [x27, x21]\n"
+      "str s25, [%[outptr0]]\n"  // Store output (0, 0)
+      "fadd v20.4s, v17.4s, v16.4s\n"
+      "str s26, [x28]\n"  // Store output (2, 0)
+      "mov v7.16b, v17.16b\n"
+      "fadd v17.4s, v22.4s, v23.4s\n"
+      "mov v8.16b, v22.16b\n"
+      "fadd v13.4s, v13.4s, v21.4s\n"
+      "fmul v10.4s, v10.4s, v0.s[0]\n"
+      "fmla v7.4s, v16.4s, v0.s[1]\n"
+      "ldr s28, [x27, x22]\n"
+      "fmla v8.4s, v23.4s, v0.s[1]\n"
+      "ldr s23, [x27, x23]\n"
+      "fmla v4.4s, v21.4s, v0.s[1]\n"
+      "ldr s25, [x27, x24]\n"
+      "str s13, [%[outptr0], x15]\n"  // Store output (0, 2)
+      "fadd v16.4s, v27.4s, v29.4s\n"
+      "fadd v7.4s, v7.4s, v5.4s\n"
+      "fadd v30.4s, v28.4s, v23.4s\n"
+      "fadd v8.4s, v8.4s, v18.4s\n"
+      "fsub v29.4s, v27.4s, v29.4s\n"
+      "str s4, [x28, x15]\n"  // Store output (2, 2)
+      "fsub v27.4s, v28.4s, v23.4s\n"
+      "fadd v19.4s, v19.4s, v16.4s\n"
+      "mov v16.16b, v16.16b\n"  // Self-move: no-op
+      "fadd v21.4s, v12.4s, v24.4s\n"
+      "mov v26.16b, v12.16b\n"
+      "mov v5.16b, v29.16b\n"
+      "fadd v22.4s, v20.4s, v17.4s\n"
+      "fmul v27.4s, v27.4s, v0.s[0]\n"
+      "fmla v16.4s, v30.4s, v0.s[1]\n"
+      "fadd v19.4s, v19.4s, v30.4s\n"
+      "fmla v26.4s, v24.4s, v0.s[1]\n"
+      "str s21, [x17]\n"  // Store output (1, 0)
+      "fsub v20.4s, v20.4s, v17.4s\n"
+      "fadd v15.4s, v15.4s, v22.4s\n"
+      "fmla v14.4s, v22.4s, v0.s[1]\n"
+      "fadd v18.4s, v29.4s, v27.4s\n"
+      "fmla v5.4s, v27.4s, v0.s[1]\n"
+      "fadd v26.4s, v26.4s, v19.4s\n"
+      "mov v28.16b, v11.16b\n"
+      "fmul v20.4s, v20.4s, v0.s[0]\n"
+      "fadd v13.4s, v9.4s, v10.4s\n"
+      "str s15, [%[outptr0], %[output_col_stride1]]\n"  // Store output (0, 1)
+      "mov v15.16b, v9.16b\n"
+      "str s14, [x28, %[output_col_stride1]]\n"  // Store output (2, 1)
+      "fadd v5.4s, v5.4s, v25.4s\n"
+      "str s26, [x18]\n"  // Store output (3, 0)
+      "fadd v30.4s, v7.4s, v8.4s\n"
+      "str s13, [x17, x15]\n"  // Store output (1, 2)
+      "fadd v12.4s, v11.4s, v20.4s\n"
+      "fmla v28.4s, v20.4s, v0.s[1]\n"
+      "fmla v15.4s, v10.4s, v0.s[1]\n"
+      "fadd v1.4s, v1.4s, v6.4s\n"
+      "fsub v8.4s, v7.4s, v8.4s\n"
+      "mov v6.16b, v6.16b\n"  // Self-move: no-op
+      "mov v9.16b, v2.16b\n"
+      "str s12, [x17, %[output_col_stride1]]\n"  // Store output (1, 1)
+      "fadd v28.4s, v28.4s, v18.4s\n"
+      "fadd v15.4s, v15.4s, v16.4s\n"
+      "fadd v1.4s, v1.4s, v30.4s\n"
+      "fmul v8.4s, v8.4s, v0.s[0]\n"
+      "fmla v6.4s, v30.4s, v0.s[1]\n"
+      "str s28, [x18, %[output_col_stride1]]\n"  // Store output (3, 1)
+      "str s1, [%[outptr0], x16]\n"  // Store output (0, 3)
+      "str s6, [x28, x16]\n"  // Store output (2, 3)
+      "fadd v2.4s, v2.4s, v8.4s\n"
+      "str s15, [x18, x15]\n"  // Store output (3, 2)
+      "fmla v9.4s, v8.4s, v0.s[1]\n"
+      "str s2, [x17, x16]\n"  // Store output (1, 3)
+      "fadd v9.4s, v9.4s, v5.4s\n"
+      "str s9, [x18, x16]\n"  // Store output (3, 3)
+      "6:\n"  // End
+      : [outptr0] "+r" (output), [inptr0] "+r" (inptr)
+      : [output_col_stride1] "r" (output_col_stride * sizeof(float)), [pcoeffs] "r" (coeffs), [n_channels] "r" ((long) n_channels), [in_row_stride] "r" (6 * matrix_stride * sizeof(float)), [in_col_stride1] "r" (matrix_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float))
+      : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+    );
+  }
+}
+
+#else
+
+template <>
+void winograd::OutputTransform<3, 3, 6, 6, float, float, winograd::WinogradRoots::Integers>::transform_tile(
+  const int n_channels,
+  const float* inptr,
+  const int matrix_stride,
+  const float* bptr,
+  float* const output,
+  const int output_row_stride,
+  const int output_col_stride
+)
+{
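+  // Generic fallback used when the specialised assembly above is not compiled
+  // in: two-channel NEON code on 32-bit ARM, plain scalar code otherwise.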
+  // Construct a map to the output cells
+  float *outptrs[output_tile_rows][output_tile_cols];
+  for (int i = 0; i < output_tile_rows; i++)
+  {
+    for (int j = 0; j < output_tile_cols; j++)
+    {
+      outptrs[i][j] = output + i*output_row_stride + j*output_col_stride;
+    }
+  }
+
+  // For each channel of the output
+  int channels_remaining = n_channels;
+#ifdef __arm__
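+  // On 32-bit ARM, use 64-bit NEON vectors to process two channels per iteration.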
+  for (; channels_remaining >= 2; channels_remaining -= 2)
+  {
+    // Matrices used and computed during this transform
+    float32x2_t F[6][6], FZ[6][4], f[4][4], b;
+
+    // Read a 6x6 tile in the Winograd domain
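+    // The 36 Winograd-domain matrices are strided: element (i, j) for this
+    // channel pair lives at inptr + (6*i + j)*matrix_stride.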
+    for (int i = 0, m = 0; i < 6; i++)
+    {
+      for (int j = 0; j < 6; j++, m++)
+      {
+        F[i][j] = vld1_f32(inptr + m*matrix_stride);
+      }
+    }
+    inptr += 2;  // Advance past the two channels just read
+
+    // Compute the matrix F Z
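+    // (Z is the 6x4 output-transform matrix with columns [1 1 1 1 1 0],
+    //  [0 1 -1 2 -2 0], [0 1 1 4 4 0] and [0 1 -1 8 -8 1]; each row of FZ
+    //  collapses the six Winograd-domain columns down to four output columns.)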
+    for (int i = 0; i < 6; i++)
+    {
+      // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
+      FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
+
+      // FZ[i][1] =  1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4];
+      FZ[i][1] = vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f);
+
+      // FZ[i][2] =  1*F[i][1] +  1*F[i][2] +  4*F[i][3] +  4*F[i][4];
+      FZ[i][2] = vmla_n_f32(vadd_f32(F[i][1], F[i][2]), vadd_f32(F[i][3], F[i][4]), 4.0f);
+
+      // FZ[i][3] =  1*F[i][1] + -1*F[i][2] +  8*F[i][3] + -8*F[i][4] +  1*F[i][5];
+      FZ[i][3] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
+    }
+
+    // Compute the output tile f = ZT F Z
+    for (int j = 0; j < 4; j++)
+    {
+      // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
+      f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+      // f[1][j] =  1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j];
+      f[1][j] = vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f);
+
+      // f[2][j] =  1*FZ[1][j] +  1*FZ[2][j] +  4*FZ[3][j] +  4*FZ[4][j];
+      f[2][j] = vmla_n_f32(vadd_f32(FZ[1][j], FZ[2][j]), vadd_f32(FZ[3][j], FZ[4][j]), 4.0f);
+
+      // f[3][j] =  1*FZ[1][j] + -1*FZ[2][j] +  8*FZ[3][j] + -8*FZ[4][j] +  1*FZ[5][j];
+      f[3][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
+    }
+
+    // Load the bias (if any) and write out the output tile
+    if (bptr != nullptr)
+    {
+      b = vld1_f32(bptr);
+      bptr += 2;
+    }
+    else
+    {
+      b = vdup_n_f32(0.0f);
+    }
+    for (int i = 0; i < output_tile_rows; i++)
+    {
+      for (int j = 0; j < output_tile_cols; j++)
+      {
+        vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
+        outptrs[i][j] += 2;
+      }
+    }
+  }
+#endif  // __arm__
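+  // Scalar tail: handles any remaining channels (all of them when __arm__ is
+  // not defined).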
+  for (; channels_remaining; channels_remaining--)
+  {
+    // Matrices used and computed during this transform
+    float F[6][6], FZ[6][4], f[4][4], b;
+
+    // Read a 6x6 tile in the Winograd domain
+    for (int i = 0, m = 0; i < 6; i++)
+    {
+      for (int j = 0; j < 6; j++, m++)
+      {
+        F[i][j] = *(inptr + m*matrix_stride);
+      }
+    }
+    inptr++;  // Advance past the channel just read
+
+    // Compute the matrix F Z
+    for (int i = 0; i < 6; i++)
+    {
+      FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
+      FZ[i][1] =  1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4];
+      FZ[i][2] =  1*F[i][1] +  1*F[i][2] +  4*F[i][3] +  4*F[i][4];
+      FZ[i][3] =  1*F[i][1] + -1*F[i][2] +  8*F[i][3] + -8*F[i][4] +  1*F[i][5];
+    }
+
+    // Compute the output tile f = ZT F Z
+    for (int j = 0; j < 4; j++)
+    {
+      f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
+      f[1][j] =  1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j];
+      f[2][j] =  1*FZ[1][j] +  1*FZ[2][j] +  4*FZ[3][j] +  4*FZ[4][j];
+      f[3][j] =  1*FZ[1][j] + -1*FZ[2][j] +  8*FZ[3][j] + -8*FZ[4][j] +  1*FZ[5][j];
+    }
+
+    // Load the bias (if any) and write out the output tile
+    if (bptr != nullptr)
+    {
+      b = *(bptr++);
+    }
+    else
+    {
+      b = 0.0f;
+    }
+    for (int i = 0; i < output_tile_rows; i++)
+    {
+      for (int j = 0; j < output_tile_cols; j++)
+      {
+        *(outptrs[i][j]++) = f[i][j] + b;
+      }
+    }
+  }
+}
+
+#endif
+
+template class OutputTransform<3, 3, 6, 6, float, float, winograd::WinogradRoots::Integers>;
+
+}  // namespace winograd
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_6_3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_6_3_fp32_fp32_integers.cpp
similarity index 74%
rename from src/core/NEON/kernels/convolution/winograd/transforms/output_6_3_fp32.cpp
rename to src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_6_3_fp32_fp32_integers.cpp
index 58bed71..ce921ce 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/output_6_3_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_6_3_fp32_fp32_integers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,42 +22,29 @@
  * SOFTWARE.
  */
 
-#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+#include "output.hpp"
+#include "arm.hpp"
 
-namespace
+namespace winograd
 {
 
-template <bool Specialized, int PadRight=0>
-void winograd_output_transform_6_3_fp32_process_tile(
+template <>
+void OutputTransform<1, 3, 1, 8, float, float, WinogradRoots::Integers>::transform_tile(
   const int n_channels,
-  const float* const matrix_base,
+  const float* inptr,
   const int matrix_stride,
-  const float* const biases,
+  const float* bptr,
   float* const output,
-  const int output_row_stride,
-  const int output_col_stride,
-  const int _pad_bottom,
-  const int _pad_right
+  const int,  // No need to stride across rows
+  const int output_col_stride
 )
 {
-  (void) output_row_stride;
-  (void) _pad_bottom;
-  constexpr int output_tile_cols = 6;
-  constexpr int inner_tile_cols = 8;
-
-  const int pad_right = Specialized ? PadRight : _pad_right;
-  const int cells_j = output_tile_cols - pad_right;
-
   // Construct a map to the output cells
-  float *outptrs[cells_j];
-  for (int j = 0; j < cells_j; j++)
+  float *outptrs[output_tile_cols];
+  for (int j = 0; j < output_tile_cols; j++)
   {
     outptrs[j] = output + j*output_col_stride;
   }
-  const float *inptr = matrix_base;
-  const float *bptr = biases;
 
   // For each channel of the output
   int channels_remaining = n_channels;
@@ -87,7 +74,7 @@
       b = vld1q_f32(bptr);
       bptr += 4;
     }
-    for (int j = 0; j < cells_j; j++)
+    for (int j = 0; j < output_tile_cols; j++)
     {
       vst1q_f32(outptrs[j], f[j] + b);
       outptrs[j] += 4;
@@ -118,7 +105,7 @@
       b = vld1_f32(bptr);
       bptr += 2;
     }
-    for (int j = 0; j < cells_j; j++)
+    for (int j = 0; j < output_tile_cols; j++)
     {
       vst1_f32(outptrs[j], f[j] + b);
       outptrs[j] += 2;
@@ -149,31 +136,14 @@
     {
       b = *(bptr++);
     }
-    for (int j = 0; j < cells_j; j++)
+    for (int j = 0; j < output_tile_cols; j++)
     {
       *(outptrs[j]++) = f[j] + b;
     }
   }
 }
 
-}  // namespace (anonymous)
+template class OutputTransform<1, 3, 1, 8, float, float, WinogradRoots::Integers>;
+template class OutputTransform<3, 1, 8, 1, float, float, WinogradRoots::Integers>;
 
-namespace winograd
-{
-using Tiles = OutputTransformImplTiles<1, 3, 1, 8, float>;
-
-template <>
-const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_6_3_fp32_process_tile<true>;
-
-template <>
-const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = {
-  winograd_output_transform_6_3_fp32_process_tile<true, 1>,
-  winograd_output_transform_6_3_fp32_process_tile<true, 2>,
-  winograd_output_transform_6_3_fp32_process_tile<true, 3>,
-  winograd_output_transform_6_3_fp32_process_tile<true, 4>,
-  winograd_output_transform_6_3_fp32_process_tile<true, 5>,
-};
-
-template class OutputTransform<1, 3, 1, 8, float>;
-template class OutputTransform<3, 1, 8, 1, float>;
-}  // namespace winograd
+}  // namespace winograd
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2_7_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2_7_fp32_fp32_integers.cpp
new file mode 100644
index 0000000..37ae43f
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2_7_fp32_fp32_integers.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm.hpp"
+#include "kernel.hpp"
+
+namespace winograd
+{
+
+template <>
+void WeightTransform<1, 7, 1, 8, float, float, WinogradRoots::Integers>::execute(
+  const int n_output_channels,
+  const int n_input_channels,
+  const float* const input,  // NOTE: Data in HWIO order
+  float* const output,
+  const int matrix_stride,
+  const int matrix_row_stride
+)
+{
+  // Get pointers to each cell of the weight tensor
+  const auto weight_col_stride = n_input_channels * n_output_channels;
+  const float *inptrs[kernel_cols];
+  for (int j = 0; j < kernel_cols; j++)
+  {
+    inptrs[j] = input + j*weight_col_stride;
+  }
+
+  // For each input channel
+  for (int ic = 0; ic < n_input_channels; ic++)
+  {
+    float *outptr = output + ic * matrix_row_stride;
+
+    // For each output channel
+    int channels_remaining = n_output_channels;
+    for (; channels_remaining; channels_remaining--)
+    {
+      // Matrices used and computed in this kernel
+      float w[kernel_cols], V[inner_tile_cols];
+
+      // Read weights
+      for (int j = 0; j < kernel_cols; j++)
+      {
+        w[j] = *(inptrs[j]++);
+      }
+
+      // Compute V = w WT
+      V[0] = (w[0]*-1) / 36.0f;
+      V[1] = (w[1]*-1 + w[3]*-1 + w[5]*-1 + w[0]*1 + w[2]*1 + w[4]*1 + w[6]*1) / 48.0f;
+      V[2] = (w[0]*1 + w[1]*1 + w[2]*1 + w[3]*1 + w[4]*1 + w[5]*1 + w[6]*1) / 48.0f;
+      V[3] = (w[0]*-1 + w[6]*-64 + w[4]*-16 + w[2]*-4 + w[1]*2 + w[3]*8 + w[5]*32) / 120.0f;
+      V[4] = (w[0]*-1 + w[6]*-64 + w[5]*-32 + w[4]*-16 + w[3]*-8 + w[2]*-4 + w[1]*-2) / 120.0f;
+      V[5] = (w[5]*-243 + w[3]*-27 + w[1]*-3 + w[2]*9 + w[4]*81 + w[6]*729 + w[0]*1) / 720.0f;
+      V[6] = (w[1]*3 + w[2]*9 + w[3]*27 + w[4]*81 + w[5]*243 + w[6]*729 + w[0]*1) / 720.0f;
+      V[7] = (w[6]*1) / 1.0f;
+
+      // Store the transformed weights
+      for (int j = 0; j < inner_tile_cols; j++)
+      {
+        *(outptr + j*matrix_stride) = V[j];
+      }
+      outptr++;
+    }
+  }
+}
+
+template class WeightTransform<1, 7, 1, 8, float, float, WinogradRoots::Integers>;
+template class WeightTransform<7, 1, 8, 1, float, float, WinogradRoots::Integers>;
+
+}  // namespace winograd
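
Each row of this transform can be read as the kernel polynomial p(x) = sum_k w[k]*x^k evaluated at one of the Winograd sample points {0, -1, 1, -2, 2, -3, 3, inf} (up to sign) and divided by a per-point normaliser (36, 48, 48, 120, 120, 720, 720, 1) — hence the WinogradRoots::Integers tag. A self-contained spot check of that reading for two of the rows:

#include <array>
#include <cassert>
#include <cmath>
#include <cstdio>

// Evaluate p(x) = sum_k w[k] * x^k for a 7-tap kernel.
static float poly(const std::array<float, 7> &w, float x)
{
    float acc = 0.0f, xn = 1.0f;
    for (float wk : w) { acc += wk * xn; xn *= x; }
    return acc;
}

int main()
{
    const std::array<float, 7> w{ 0.1f, -0.2f, 0.3f, 0.4f, -0.5f, 0.6f, 0.7f };

    // Unrolled expressions, copied from the transform above.
    const float V1 = (w[1]*-1 + w[3]*-1 + w[5]*-1 + w[0]*1 + w[2]*1 + w[4]*1 + w[6]*1) / 48.0f;
    const float V3 = (w[0]*-1 + w[6]*-64 + w[4]*-16 + w[2]*-4 + w[1]*2 + w[3]*8 + w[5]*32) / 120.0f;

    // The same values via polynomial evaluation at the sample points.
    assert(std::fabs(V1 - poly(w, -1.0f) / 48.0f) < 1e-6f);        // point -1
    assert(std::fabs(V3 - (-poly(w, -2.0f) / 120.0f)) < 1e-6f);    // point -2, negated
    std::puts("ok");
    return 0;
}
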
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_3x3_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_3x3_fp32_fp32_integers.cpp
new file mode 100644
index 0000000..8fab6db
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_3x3_fp32_fp32_integers.cpp
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm.hpp"
+#include "kernel.hpp"
+
+namespace winograd
+{
+
+template <>
+void WeightTransform<3, 3, 4, 4, float, float, WinogradRoots::Integers>::execute(
+  const int n_output_channels,
+  const int n_input_channels,
+  const float* const input,
+  float* const output,
+  const int matrix_stride,
+  const int matrix_row_stride
+)
+{
+  constexpr int inner_tile_i = 4;
+  constexpr int inner_tile_j = 4;
+
+  // Get pointers to each cell of the weight tensor
+  const auto weight_col_stride = n_input_channels * n_output_channels;
+  const auto weight_row_stride = 3 * weight_col_stride;
+  const float *inptrs[3][3];
+  for (int i = 0; i < 3; i++)
+  {
+    for (int j = 0; j < 3; j++)
+    {
+      inptrs[i][j] = input + i*weight_row_stride + j*weight_col_stride;
+    }
+  }
+
+  // For each input channel
+  for (int ic = 0; ic < n_input_channels; ic++)
+  {
+    float *outptr = output + ic * matrix_row_stride;
+
+    // For each output channel
+    int channels_remaining = n_output_channels;
+#ifdef __aarch64__
+    for (; channels_remaining >= 4; channels_remaining -= 4)
+    {
+      // Matrices used and computed in this kernel
+      float32x4_t w[3][3], Ww[inner_tile_i][3], V[inner_tile_i][inner_tile_j];
+
+      // Read weights
+      for (int i = 0; i < 3; i++)
+      {
+        for (int j = 0; j < 3; j++)
+        {
+          w[i][j] = vld1q_f32(inptrs[i][j]);
+          inptrs[i][j] += 4;
+        }
+      }
+
+      // Compute the matrix W w
+      for (int j = 0; j < 3; j++)
+      {
+        Ww[0][j] = w[0][j];
+
+        // Ww[1][j] = 0.5*(w[0][j] + w[1][j] + w[2][j]);
+        Ww[1][j] = vmulq_n_f32(vaddq_f32(vaddq_f32(w[0][j], w[1][j]), w[2][j]), 0.5f);
+
+        // Ww[2][j] = 0.5*(w[0][j] - w[1][j] + w[2][j]);
+        Ww[2][j] = vmulq_n_f32(vaddq_f32(vsubq_f32(w[0][j], w[1][j]), w[2][j]), 0.5f);
+
+        Ww[3][j] = w[2][j];
+      }
+
+      // Compute V = W w WT
+      for (int i = 0; i < inner_tile_i; i++)
+      {
+        V[i][0] = Ww[i][0];
+
+        // V[i][1] = 0.5*(Ww[i][0] + Ww[i][1] + Ww[i][2]);
+        V[i][1] = vmulq_n_f32(vaddq_f32(vaddq_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f);
+
+        // V[i][2] = 0.5*(Ww[i][0] - Ww[i][1] + Ww[i][2]);
+        V[i][2] = vmulq_n_f32(vaddq_f32(vsubq_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f);
+
+        V[i][3] = Ww[i][2];
+      }
+
+      // Store the transformed weights
+      for (int i = 0, m = 0; i < inner_tile_i; i++)
+      {
+        for (int j = 0; j < inner_tile_j; j++, m++)
+        {
+          vst1q_f32(outptr + m*matrix_stride, V[i][j]);
+        }
+      }
+      outptr += 4;
+    }
+#endif  // __aarch64__
+#ifdef __arm_any__
+    for (; channels_remaining >= 2; channels_remaining -= 2)
+    {
+      // Matrices used and computed in this kernel
+      float32x2_t w[3][3], Ww[inner_tile_i][3], V[inner_tile_i][inner_tile_j];
+
+      // Read weights
+      for (int i = 0; i < 3; i++)
+      {
+        for (int j = 0; j < 3; j++)
+        {
+          w[i][j] = vld1_f32(inptrs[i][j]);
+          inptrs[i][j] += 2;
+        }
+      }
+
+      // Compute the matrix W w
+      for (int j = 0; j < 3; j++)
+      {
+        Ww[0][j] = w[0][j];
+
+        // Ww[1][j] = 0.5*(w[0][j] + w[1][j] + w[2][j]);
+        Ww[1][j] = vmul_n_f32(vadd_f32(vadd_f32(w[0][j], w[1][j]), w[2][j]), 0.5f);
+
+        // Ww[2][j] = 0.5*(w[0][j] - w[1][j] + w[2][j]);
+        Ww[2][j] = vmul_n_f32(vadd_f32(vsub_f32(w[0][j], w[1][j]), w[2][j]), 0.5f);
+
+        Ww[3][j] = w[2][j];
+      }
+
+      // Compute V = W w WT
+      for (int i = 0; i < inner_tile_i; i++)
+      {
+        V[i][0] = Ww[i][0];
+
+        // V[i][1] = 0.5*(Ww[i][0] + Ww[i][1] + Ww[i][2]);
+        V[i][1] = vmul_n_f32(vadd_f32(vadd_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f);
+
+        // V[i][2] = 0.5*(Ww[i][0] - Ww[i][1] + Ww[i][2]);
+        V[i][2] = vmul_n_f32(vadd_f32(vsub_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f);
+
+        V[i][3] = Ww[i][2];
+      }
+
+      // Store the transformed weights
+      for (int i = 0, m = 0; i < inner_tile_i; i++)
+      {
+        for (int j = 0; j < inner_tile_j; j++, m++)
+        {
+          vst1_f32(outptr + m*matrix_stride, V[i][j]);
+        }
+      }
+      outptr += 2;
+    }
+#endif  // __arm_any__
+    for (; channels_remaining; channels_remaining--)
+    {
+      // Matrices used and computed in this kernel
+      float w[3][3], Ww[inner_tile_i][3], V[inner_tile_i][inner_tile_j];
+
+      // Read weights
+      for (int i = 0; i < 3; i++)
+      {
+        for (int j = 0; j < 3; j++)
+        {
+          w[i][j] = *(inptrs[i][j]++);
+        }
+      }
+
+      // Compute the matrix W w
+      for (int j = 0; j < 3; j++)
+      {
+        Ww[0][j] = w[0][j];
+        Ww[1][j] = 0.5f*(w[0][j] + w[1][j] + w[2][j]);
+        Ww[2][j] = 0.5f*(w[0][j] - w[1][j] + w[2][j]);
+        Ww[3][j] = w[2][j];
+      }
+
+      // Compute V = W w WT
+      for (int i = 0; i < inner_tile_i; i++)
+      {
+        V[i][0] = Ww[i][0];
+        V[i][1] = 0.5f*(Ww[i][0] + Ww[i][1] + Ww[i][2]);
+        V[i][2] = 0.5f*(Ww[i][0] - Ww[i][1] + Ww[i][2]);
+        V[i][3] = Ww[i][2];
+      }
+
+      // Store the transformed weights
+      for (int i = 0, m = 0; i < inner_tile_i; i++)
+      {
+        for (int j = 0; j < inner_tile_j; j++, m++)
+        {
+          *(outptr + m*matrix_stride) = V[i][j];
+        }
+      }
+      outptr++;
+    }
+  }
+}
+
+template class WeightTransform<3, 3, 4, 4, float, float, WinogradRoots::Integers>;
+
+}  // namespace winograd
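
The unrolled Ww/V arithmetic above is the classic F(2x2, 3x3) weight transform V = G g G^T with G = [[1,0,0],[1/2,1/2,1/2],[1/2,-1/2,1/2],[0,0,1]], split into a row pass followed by a column pass. A plain-matrix sketch of the same computation for a single kernel:

// G for F(2x2, 3x3): identical to the unrolled Ww/V arithmetic above.
static const float G[4][3] = {
    { 1.0f,  0.0f, 0.0f },
    { 0.5f,  0.5f, 0.5f },
    { 0.5f, -0.5f, 0.5f },
    { 0.0f,  0.0f, 1.0f },
};

// V = G g G^T for a single 3x3 kernel g.
void weight_transform_2x2_3x3(const float g[3][3], float V[4][4])
{
    float Gg[4][3];  // Row pass: matches "Compute the matrix W w".
    for (int i = 0; i < 4; i++)
        for (int j = 0; j < 3; j++)
        {
            Gg[i][j] = 0.0f;
            for (int k = 0; k < 3; k++)
                Gg[i][j] += G[i][k] * g[k][j];
        }

    for (int i = 0; i < 4; i++)  // Column pass: matches "Compute V = W w WT".
        for (int j = 0; j < 4; j++)
        {
            V[i][j] = 0.0f;
            for (int k = 0; k < 3; k++)
                V[i][j] += Gg[i][k] * G[j][k];
        }
}
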
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_5x5_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_5x5_fp32_fp32_integers.cpp
new file mode 100644
index 0000000..79f4fa3
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_5x5_fp32_fp32_integers.cpp
@@ -0,0 +1,401 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm.hpp"
+#include "kernel.hpp"
+
+namespace winograd
+{
+
+template <>
+void WeightTransform<5, 5, 6, 6, float, float, WinogradRoots::Integers>::execute(
+  const int n_output_channels,
+  const int n_input_channels,
+  const float* const input,
+  float* const output,
+  const int matrix_stride,
+  const int matrix_row_stride
+)
+{
+  // Get pointers to each cell of the weight tensor
+  const auto weight_col_stride = n_input_channels * n_output_channels;
+  const auto weight_row_stride = 5 * weight_col_stride;
+  const float *inptrs[5][5];
+  for (int i = 0; i < 5; i++)
+  {
+    for (int j = 0; j < 5; j++)
+    {
+      inptrs[i][j] = input + i*weight_row_stride + j*weight_col_stride;
+    }
+  }
+
+  // For each input channel
+  for (int ic = 0; ic < n_input_channels; ic++)
+  {
+    float *outptr = output + ic * matrix_row_stride;
+
+    // For each output channel
+    int channels_remaining = n_output_channels;
+#ifdef __aarch64__
+    for (; channels_remaining >= 4; channels_remaining -= 4)
+    {
+      // Matrices used and computed in this kernel
+      float32x4_t w[5][5], Ww[6][5], V[6][6];
+
+      // Read weights
+      for (int i = 0; i < 5; i++)
+      {
+        for (int j = 0; j < 5; j++)
+        {
+          w[i][j] = vld1q_f32(inptrs[i][j]);
+          inptrs[i][j] += 4;
+        }
+      }
+
+      // Compute the matrix W w
+      for (int j = 0; j < 5; j++)
+      {
+        // Ww[0][j] = w[0][j]/4.0f;
+        Ww[0][j] = vmulq_n_f32(w[0][j], 1.0f/4.0f);
+
+        // Ww[1][j] = -( w[0][j] + w[1][j] + w[2][j] + w[3][j] + w[4][j])/6.0f;
+        Ww[1][j] = vmulq_n_f32(
+          vaddq_f32(
+            vaddq_f32(
+              vaddq_f32(w[1][j], w[0][j]),
+              vaddq_f32(w[3][j], w[2][j])
+            ),
+            w[4][j]
+          ),
+          -1.0f/6.0f
+        );
+
+        // Ww[2][j] = +(-w[0][j] + w[1][j] - w[2][j] + w[3][j] - w[4][j])/6.0f;
+        // Ww[2][j] = ((w[1][j] - w[0][j]) + (w[3][j] - w[2][j]) - w[4][j])/6.0f;
+        Ww[2][j] = vmulq_n_f32(
+          vsubq_f32(
+            vaddq_f32(
+              vsubq_f32(w[1][j], w[0][j]),
+              vsubq_f32(w[3][j], w[2][j])
+            ),
+            w[4][j]
+          ),
+          1.0f/6.0f
+        );
+
+        // Ww[3][j] = (w[0][j]/8.0f + w[1][j]/4.0f + w[2][j]/2.0f + w[3][j] + 2*w[4][j])/3.0f;
+        Ww[3][j] = vmulq_n_f32(
+          vmlaq_n_f32(
+            vaddq_f32(
+              vaddq_f32(vmulq_n_f32(w[0][j], 1.0f/8.0f), vmulq_n_f32(w[1][j], 1.0f/4.0f)),
+              vaddq_f32(vmulq_n_f32(w[2][j], 1.0f/2.0f), w[3][j])
+            ),
+            w[4][j], 2.0f
+          ),
+          1.0f/3.0f
+        );
+
+        // Ww[4][j] = (w[0][j]/8.0f - w[1][j]/4.0f + w[2][j]/2.0f - w[3][j] + 2*w[4][j])/3.0f;
+        Ww[4][j] = vmulq_n_f32(
+          vmlaq_n_f32(
+            vaddq_f32(
+              vsubq_f32(vmulq_n_f32(w[0][j], 1.0f/8.0f), vmulq_n_f32(w[1][j], 1.0f/4.0f)),
+              vsubq_f32(vmulq_n_f32(w[2][j], 1.0f/2.0f), w[3][j])
+            ),
+            w[4][j], 2.0f
+          ),
+          1.0f/3.0f
+        );
+
+        // Ww[5][j] = w[4][j];
+        Ww[5][j] = w[4][j];
+      }
+
+      // Compute V = W w WT
+      for (int i = 0; i < 6; i++)
+      {
+        // V[i][0] = Ww[i][0]/4.0f;
+        V[i][0] = vmulq_n_f32(Ww[i][0], 1.0f/4.0f);
+
+        // V[i][1] = -( Ww[i][0] + Ww[i][1] + Ww[i][2] + Ww[i][3] + Ww[i][4])/6.0f;
+        V[i][1] = vmulq_n_f32(
+          vaddq_f32(
+            vaddq_f32(
+              vaddq_f32(Ww[i][1], Ww[i][0]),
+              vaddq_f32(Ww[i][3], Ww[i][2])
+            ),
+            Ww[i][4]
+          ),
+          -1.0f/6.0f
+        );
+
+        // V[i][2] = +(-Ww[i][0] + Ww[i][1] - Ww[i][2] + Ww[i][3] - Ww[i][4])/6.0f;
+        // V[i][2] = ((Ww[i][1] - Ww[i][0]) + (Ww[i][3] - Ww[i][2]) - Ww[i][4])/6.0f;
+        V[i][2] = vmulq_n_f32(
+          vsubq_f32(
+            vaddq_f32(
+              vsubq_f32(Ww[i][1], Ww[i][0]),
+              vsubq_f32(Ww[i][3], Ww[i][2])
+            ),
+            Ww[i][4]
+          ),
+          1.0f/6.0f
+        );
+
+        // V[i][3] = (Ww[i][0]/8.0f + Ww[i][1]/4.0f + Ww[i][2]/2.0f + Ww[i][3] + 2*Ww[i][4])/3.0f;
+        V[i][3] = vmulq_n_f32(
+          vmlaq_n_f32(
+            vaddq_f32(
+              vaddq_f32(vmulq_n_f32(Ww[i][0], 1.0f/8.0f), vmulq_n_f32(Ww[i][1], 1.0f/4.0f)),
+              vaddq_f32(vmulq_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3])
+            ),
+            Ww[i][4], 2.0f
+          ),
+          1.0f/3.0f
+        );
+
+        // V[i][4] = (Ww[i][0]/8.0f - Ww[i][1]/4.0f + Ww[i][2]/2.0f - Ww[i][3] + 2*Ww[i][4])/3.0f;
+        V[i][4] = vmulq_n_f32(
+          vmlaq_n_f32(
+            vaddq_f32(
+              vsubq_f32(vmulq_n_f32(Ww[i][0], 1.0f/8.0f), vmulq_n_f32(Ww[i][1], 1.0f/4.0f)),
+              vsubq_f32(vmulq_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3])
+            ),
+            Ww[i][4], 2.0f
+          ),
+          1.0f/3.0f
+        );
+
+        // V[i][5] = Ww[i][4];
+        V[i][5] = Ww[i][4];
+      }
+
+      // Store the transformed weights
+      for (int i = 0, m = 0; i < 6; i++)
+      {
+        for (int j = 0; j < 6; j++, m++)
+        {
+          vst1q_f32(outptr + m*matrix_stride, V[i][j]);
+        }
+      }
+      outptr += 4;
+    }
+#endif  // __aarch64__
+#ifdef __arm_any__
+    for (; channels_remaining >= 2; channels_remaining -= 2)
+    {
+      // Matrices used and computed in this kernel
+      float32x2_t w[5][5], Ww[6][5], V[6][6];
+
+      // Read weights
+      for (int i = 0; i < 5; i++)
+      {
+        for (int j = 0; j < 5; j++)
+        {
+          w[i][j] = vld1_f32(inptrs[i][j]);
+          inptrs[i][j] += 2;
+        }
+      }
+
+      // Compute the matrix W w
+      for (int j = 0; j < 5; j++)
+      {
+        // Ww[0][j] = w[0][j]/4.0f;
+        Ww[0][j] = vmul_n_f32(w[0][j], 1.0f/4.0f);
+
+        // Ww[1][j] = -( w[0][j] + w[1][j] + w[2][j] + w[3][j] + w[4][j])/6.0f;
+        Ww[1][j] = vmul_n_f32(
+          vadd_f32(
+            vadd_f32(
+              vadd_f32(w[1][j], w[0][j]),
+              vadd_f32(w[3][j], w[2][j])
+            ),
+            w[4][j]
+          ),
+          -1.0f/6.0f
+        );
+
+        // Ww[2][j] = +(-w[0][j] + w[1][j] - w[2][j] + w[3][j] - w[4][j])/6.0f;
+        // Ww[2][j] = ((w[1][j] - w[0][j]) + (w[3][j] - w[2][j]) - w[4][j])/6.0f;
+        Ww[2][j] = vmul_n_f32(
+          vsub_f32(
+            vadd_f32(
+              vsub_f32(w[1][j], w[0][j]),
+              vsub_f32(w[3][j], w[2][j])
+            ),
+            w[4][j]
+          ),
+          1.0f/6.0f
+        );
+
+        // Ww[3][j] = (w[0][j]/8.0f + w[1][j]/4.0f + w[2][j]/2.0f + w[3][j] + 2*w[4][j])/3.0f;
+        Ww[3][j] = vmul_n_f32(
+          vmla_n_f32(
+            vadd_f32(
+              vadd_f32(vmul_n_f32(w[0][j], 1.0f/8.0f), vmul_n_f32(w[1][j], 1.0f/4.0f)),
+              vadd_f32(vmul_n_f32(w[2][j], 1.0f/2.0f), w[3][j])
+            ),
+            w[4][j], 2.0f
+          ),
+          1.0f/3.0f
+        );
+
+        // Ww[4][j] = (w[0][j]/8.0f - w[1][j]/4.0f + w[2][j]/2.0f - w[3][j] + 2*w[4][j])/3.0f;
+        Ww[4][j] = vmul_n_f32(
+          vmla_n_f32(
+            vadd_f32(
+              vsub_f32(vmul_n_f32(w[0][j], 1.0f/8.0f), vmul_n_f32(w[1][j], 1.0f/4.0f)),
+              vsub_f32(vmul_n_f32(w[2][j], 1.0f/2.0f), w[3][j])
+            ),
+            w[4][j], 2.0f
+          ),
+          1.0f/3.0f
+        );
+
+        // Ww[5][j] = w[4][j];
+        Ww[5][j] = w[4][j];
+      }
+
+      // Compute V = W w WT
+      for (int i = 0; i < 6; i++)
+      {
+        // V[i][0] = Ww[i][0]/4.0f;
+        V[i][0] = vmul_n_f32(Ww[i][0], 1.0f/4.0f);
+
+        // V[i][1] = -( Ww[i][0] + Ww[i][1] + Ww[i][2] + Ww[i][3] + Ww[i][4])/6.0f;
+        V[i][1] = vmul_n_f32(
+          vadd_f32(
+            vadd_f32(
+              vadd_f32(Ww[i][1], Ww[i][0]),
+              vadd_f32(Ww[i][3], Ww[i][2])
+            ),
+            Ww[i][4]
+          ),
+          -1.0f/6.0f
+        );
+
+        // V[i][2] = +(-Ww[i][0] + Ww[i][1] - Ww[i][2] + Ww[i][3] - Ww[i][4])/6.0f;
+        // V[i][2] = ((Ww[i][1] - Ww[i][0]) + (Ww[i][3] - Ww[i][2]) - Ww[i][4])/6.0f;
+        V[i][2] = vmul_n_f32(
+          vsub_f32(
+            vadd_f32(
+              vsub_f32(Ww[i][1], Ww[i][0]),
+              vsub_f32(Ww[i][3], Ww[i][2])
+            ),
+            Ww[i][4]
+          ),
+          1.0f/6.0f
+        );
+
+        // V[i][3] = (Ww[i][0]/8.0f + Ww[i][1]/4.0f + Ww[i][2]/2.0f + Ww[i][3] + 2*Ww[i][4])/3.0f;
+        V[i][3] = vmul_n_f32(
+          vmla_n_f32(
+            vadd_f32(
+              vadd_f32(vmul_n_f32(Ww[i][0], 1.0f/8.0f), vmul_n_f32(Ww[i][1], 1.0f/4.0f)),
+              vadd_f32(vmul_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3])
+            ),
+            Ww[i][4], 2.0f
+          ),
+          1.0f/3.0f
+        );
+
+        // V[i][4] = (Ww[i][0]/8.0f - Ww[i][1]/4.0f + Ww[i][2]/2.0f - Ww[i][3] + 2*Ww[i][4])/3.0f;
+        V[i][4] = vmul_n_f32(
+          vmla_n_f32(
+            vadd_f32(
+              vsub_f32(vmul_n_f32(Ww[i][0], 1.0f/8.0f), vmul_n_f32(Ww[i][1], 1.0f/4.0f)),
+              vsub_f32(vmul_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3])
+            ),
+            Ww[i][4], 2.0f
+          ),
+          1.0f/3.0f
+        );
+
+        // V[i][5] = Ww[i][4];
+        V[i][5] = Ww[i][4];
+      }
+
+      // Store the transformed weights
+      for (int i = 0, m = 0; i < 6; i++)
+      {
+        for (int j = 0; j < 6; j++, m++)
+        {
+          vst1_f32(outptr + m*matrix_stride, V[i][j]);
+        }
+      }
+      outptr += 2;
+    }
+#endif  // __arm_any__
+    for (; channels_remaining; channels_remaining--)
+    {
+      // Matrices used and computed in this kernel
+      float w[5][5], Ww[6][5], V[6][6];
+
+      // Read weights
+      for (int i = 0; i < 5; i++)
+      {
+        for (int j = 0; j < 5; j++)
+        {
+          w[i][j] = *(inptrs[i][j]++);
+        }
+      }
+
+      // Compute the matrix W w
+      for (int j = 0; j < 5; j++)
+      {
+        Ww[0][j] = w[0][j]/4.0f;
+        Ww[1][j] = -( w[0][j] + w[1][j] + w[2][j] + w[3][j] + w[4][j])/6.0f;
+        Ww[2][j] = +(-w[0][j] + w[1][j] - w[2][j] + w[3][j] - w[4][j])/6.0f;
+        Ww[3][j] = (w[0][j]/8.0f + w[1][j]/4.0f + w[2][j]/2.0f + w[3][j] + 2*w[4][j])/3.0f;
+        Ww[4][j] = (w[0][j]/8.0f - w[1][j]/4.0f + w[2][j]/2.0f - w[3][j] + 2*w[4][j])/3.0f;
+        Ww[5][j] = w[4][j];
+      }
+
+      // Compute V = W w WT
+      for (int i = 0; i < 6; i++)
+      {
+        V[i][0] = Ww[i][0]/4.0f;
+        V[i][1] = -( Ww[i][0] + Ww[i][1] + Ww[i][2] + Ww[i][3] + Ww[i][4])/6.0f;
+        V[i][2] = +(-Ww[i][0] + Ww[i][1] - Ww[i][2] + Ww[i][3] - Ww[i][4])/6.0f;
+        V[i][3] = (Ww[i][0]/8.0f + Ww[i][1]/4.0f + Ww[i][2]/2.0f + Ww[i][3] + 2*Ww[i][4])/3.0f;
+        V[i][4] = (Ww[i][0]/8.0f - Ww[i][1]/4.0f + Ww[i][2]/2.0f - Ww[i][3] + 2*Ww[i][4])/3.0f;
+        V[i][5] = Ww[i][4];
+      }
+
+      // Store the transformed weights
+      for (int i = 0, m = 0; i < 6; i++)
+      {
+        for (int j = 0; j < 6; j++, m++)
+        {
+          *(outptr + m*matrix_stride) = V[i][j];
+        }
+      }
+      outptr++;
+    }
+  }
+}
+
+template class WeightTransform<5, 5, 6, 6, float, float, WinogradRoots::Integers>;
+
+}  // namespace winograd
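
This kernel, like its siblings in this directory, walks the output channels in three tiers: four at a time with 128-bit NEON on AArch64, two at a time with 64-bit NEON where available, then a scalar remainder. The three blocks repeat the same arithmetic at different vector widths, which is why they differ only in intrinsic suffixes (vmulq_n_f32 vs vmul_n_f32 vs plain C). A stripped-down sketch of that driver shape, with hypothetical functor parameters standing in for the transform bodies:

// Sketch of the shared channel-tiling pattern; Body4/Body2/Body1 are
// hypothetical stand-ins for the vectorised and scalar transform bodies.
template <typename Body4, typename Body2, typename Body1>
void for_each_output_channel(int n_channels, Body4 body4, Body2 body2, Body1 body1)
{
    (void)body4; (void)body2;  // Unused when the corresponding path is compiled out.
    int remaining = n_channels;
#ifdef __aarch64__
    for (; remaining >= 4; remaining -= 4)
    {
        body4();  // float32x4_t path: 4 channels per iteration
    }
#endif  // __aarch64__
#ifdef __arm_any__
    for (; remaining >= 2; remaining -= 2)
    {
        body2();  // float32x2_t path: 2 channels per iteration
    }
#endif  // __arm_any__
    for (; remaining; remaining--)
    {
        body1();  // scalar tail
    }
}
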
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4_5_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4_5_fp32_fp32_integers.cpp
new file mode 100644
index 0000000..fb3d712
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4_5_fp32_fp32_integers.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm.hpp"
+#include "kernel.hpp"
+
+namespace winograd
+{
+
+template <>
+void WeightTransform<1, 5, 1, 8, float, float, WinogradRoots::Integers>::execute(
+  const int n_output_channels,
+  const int n_input_channels,
+  const float* const input,  // NOTE: Data in HWIO order
+  float* const output,
+  const int matrix_stride,
+  const int matrix_row_stride
+)
+{
+  // Get pointers to each cell of the weight tensor
+  const auto weight_col_stride = n_input_channels * n_output_channels;
+  const float *inptrs[kernel_cols];
+  for (int j = 0; j < kernel_cols; j++)
+  {
+    inptrs[j] = input + j*weight_col_stride;
+  }
+
+  // For each input channel
+  for (int ic = 0; ic < n_input_channels; ic++)
+  {
+    float *outptr = output + ic * matrix_row_stride;
+
+    // For each output channel
+    int channels_remaining = n_output_channels;
+    for (; channels_remaining; channels_remaining--)
+    {
+      // Matrices used and computed in this kernel
+      float w[kernel_cols], V[inner_tile_cols];
+
+      // Read weights
+      for (int j = 0; j < kernel_cols; j++)
+      {
+        w[j] = *(inptrs[j]++);
+      }
+
+      // Compute V = w WT
+      V[0] = (w[0]*-1) / 36.0f;
+      V[1] = (w[1]*-1 + w[3]*-1 + w[0]*1 + w[2]*1 + w[4]*1) / 48.0f;
+      V[2] = (w[0]*1 + w[1]*1 + w[2]*1 + w[3]*1 + w[4]*1) / 48.0f;
+      V[3] = (w[0]*-1 + w[4]*-16 + w[2]*-4 + w[1]*2 + w[3]*8) / 120.0f;
+      V[4] = (w[0]*-1 + w[4]*-16 + w[3]*-8 + w[2]*-4 + w[1]*-2) / 120.0f;
+      V[5] = (w[3]*-27 + w[1]*-3 + w[2]*9 + w[4]*81 + w[0]*1) / 720.0f;
+      V[6] = (w[1]*3 + w[2]*9 + w[3]*27 + w[4]*81 + w[0]*1) / 720.0f;
+      V[7] = (w[4]*1) / 1.0f;
+
+      // Store the transformed weights
+      for (int j = 0; j < inner_tile_cols; j++)
+      {
+        *(outptr + j*matrix_stride) = V[j];
+      }
+      outptr++;
+    }
+  }
+}
+
+template class WeightTransform<1, 5, 1, 8, float, float, WinogradRoots::Integers>;
+template class WeightTransform<5, 1, 8, 1, float, float, WinogradRoots::Integers>;
+
+}  // namespace winograd
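
Since V = G w is linear, pushing one-hot weight vectors through the unrolled expressions recovers, column by column, the 8x5 matrix G that this generated code encodes — a handy way to inspect or regression-test these kernels. A small sketch doing exactly that:

#include <cstdio>

// Recover the 8x5 transform matrix encoded by the unrolled expressions
// above by pushing one-hot weight vectors through them.
static void transform_1x5(const float w[5], float V[8])
{
    V[0] = (w[0]*-1) / 36.0f;
    V[1] = (w[1]*-1 + w[3]*-1 + w[0] + w[2] + w[4]) / 48.0f;
    V[2] = (w[0] + w[1] + w[2] + w[3] + w[4]) / 48.0f;
    V[3] = (w[0]*-1 + w[4]*-16 + w[2]*-4 + w[1]*2 + w[3]*8) / 120.0f;
    V[4] = (w[0]*-1 + w[4]*-16 + w[3]*-8 + w[2]*-4 + w[1]*-2) / 120.0f;
    V[5] = (w[3]*-27 + w[1]*-3 + w[2]*9 + w[4]*81 + w[0]) / 720.0f;
    V[6] = (w[1]*3 + w[2]*9 + w[3]*27 + w[4]*81 + w[0]) / 720.0f;
    V[7] = w[4];
}

int main()
{
    for (int k = 0; k < 5; k++)
    {
        float w[5] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        w[k] = 1.0f;  // w = e_k, so transform_1x5 returns column k of G.
        float col[8];
        transform_1x5(w, col);
        for (int i = 0; i < 8; i++) { std::printf("% .6f ", col[i]); }
        std::printf("  <- column %d of G\n", k);
    }
    return 0;
}
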
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp32_fp32_integers.cpp
new file mode 100644
index 0000000..9e7040b
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp32_fp32_integers.cpp
@@ -0,0 +1,257 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm.hpp"
+#include "kernel.hpp"
+
+namespace winograd
+{
+
+template <>
+void WeightTransform<3, 3, 6, 6, float, float, WinogradRoots::Integers>::execute(
+  const int n_output_channels,
+  const int n_input_channels,
+  const float* const input,  // NOTE: Data in HWIO order
+  float* const output,
+  const int matrix_stride,
+  const int matrix_row_stride
+)
+{
+  // Get pointers to each cell of the weight tensor
+  const auto weight_col_stride = n_input_channels * n_output_channels;
+  const auto weight_row_stride = 3 * weight_col_stride;
+  const float *inptrs[3][3];
+  for (int i = 0; i < 3; i++)
+  {
+    for (int j = 0; j < 3; j++)
+    {
+      inptrs[i][j] = input + i*weight_row_stride + j*weight_col_stride;
+    }
+  }
+
+  // For each input channel
+  for (int ic = 0; ic < n_input_channels; ic++)
+  {
+    float *outptr = output + ic * matrix_row_stride;
+
+    // For each output channel
+    int channels_remaining = n_output_channels;
+#ifdef __aarch64__
+    for (; channels_remaining >= 4; channels_remaining -= 4)
+    {
+      // Matrices used and computed in this kernel
+      float32x4_t w[3][3], Ww[6][3], V[6][6];
+
+      // Read weights
+      for (int i = 0; i < 3; i++)
+      {
+        for (int j = 0; j < 3; j++)
+        {
+          w[i][j] = vld1q_f32(inptrs[i][j]);
+          inptrs[i][j] += 4;
+        }
+      }
+
+      // Compute the matrix W w
+      for (int j = 0; j < 3; j++)
+      {
+        // Ww[0][j] =  6*w[0][j];
+        Ww[0][j] = vmulq_n_f32(w[0][j], 6.0f);
+
+        // Ww[1][j] = -4*w[0][j] + -4*w[1][j] + -4*w[2][j];
+        Ww[1][j] = vmulq_n_f32(vaddq_f32(vaddq_f32(w[0][j], w[1][j]), w[2][j]), -4.0f);
+
+        // Ww[2][j] = -4*w[0][j] +  4*w[1][j] + -4*w[2][j];
+        Ww[2][j] = vmulq_n_f32(vsubq_f32(vsubq_f32(w[1][j], w[0][j]), w[2][j]), 4.0f);
+
+        // Ww[3][j] =  1*w[0][j] +  2*w[1][j] +  4*w[2][j];
+        Ww[3][j] = vmlaq_n_f32(vmlaq_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f);
+
+        // Ww[4][j] =  1*w[0][j] + -2*w[1][j] +  4*w[2][j];
+        Ww[4][j] = vmlaq_n_f32(vmlsq_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f);
+
+        // Ww[5][j] = 24*w[2][j];
+        Ww[5][j] = vmulq_n_f32(w[2][j], 24.0f);
+      }
+
+      // Compute V = W w WT
+      for (int i = 0; i < 6; i++)
+      {
+        const float recip576 = 1.0f / 576.0f;
+
+        // V[i][0] =  6*Ww[i][0];
+        V[i][0] = vmulq_n_f32(vmulq_n_f32(Ww[i][0], 6.0f), recip576);
+
+        // V[i][1] = -4*Ww[i][0] + -4*Ww[i][1] + -4*Ww[i][2];
+        V[i][1] = vmulq_n_f32(vmulq_n_f32(vaddq_f32(vaddq_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), -4.0f), recip576);
+
+        // V[i][2] = -4*Ww[i][0] +  4*Ww[i][1] + -4*Ww[i][2];
+        V[i][2] = vmulq_n_f32(vmulq_n_f32(vsubq_f32(vsubq_f32(Ww[i][1], Ww[i][0]), Ww[i][2]), 4.0f), recip576);
+
+        // V[i][3] =  1*Ww[i][0] +  2*Ww[i][1] +  4*Ww[i][2];
+        V[i][3] = vmulq_n_f32(vmlaq_n_f32(vmlaq_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576);
+
+        // V[i][4] =  1*Ww[i][0] + -2*Ww[i][1] +  4*Ww[i][2];
+        V[i][4] = vmulq_n_f32(vmlaq_n_f32(vmlsq_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576);
+
+        // V[i][5] = 24*Ww[i][2];
+        V[i][5] = vmulq_n_f32(vmulq_n_f32(Ww[i][2], 24.0f), recip576);
+      }
+
+      // Store the transformed weights
+      for (int i = 0, m = 0; i < 6; i++)
+      {
+        for (int j = 0; j < 6; j++, m++)
+        {
+          vst1q_f32(outptr + m*matrix_stride, V[i][j]);
+        }
+      }
+      outptr += 4;
+    }
+#endif  // __aarch64__
+#ifdef __arm_any__
+    for (; channels_remaining >= 2; channels_remaining -= 2)
+    {
+      // Matrices used and computed in this kernel
+      float32x2_t w[3][3], Ww[6][3], V[6][6];
+
+      // Read weights
+      for (int i = 0; i < 3; i++)
+      {
+        for (int j = 0; j < 3; j++)
+        {
+          w[i][j] = vld1_f32(inptrs[i][j]);
+          inptrs[i][j] += 2;
+        }
+      }
+
+      // Compute the matrix W w
+      for (int j = 0; j < 3; j++)
+      {
+        // Ww[0][j] =  6*w[0][j];
+        Ww[0][j] = vmul_n_f32(w[0][j], 6.0f);
+
+        // Ww[1][j] = -4*w[0][j] + -4*w[1][j] + -4*w[2][j];
+        Ww[1][j] = vmul_n_f32(vadd_f32(vadd_f32(w[0][j], w[1][j]), w[2][j]), -4.0f);
+
+        // Ww[2][j] = -4*w[0][j] +  4*w[1][j] + -4*w[2][j];
+        Ww[2][j] = vmul_n_f32(vsub_f32(vsub_f32(w[1][j], w[0][j]), w[2][j]), 4.0f);
+
+        // Ww[3][j] =  1*w[0][j] +  2*w[1][j] +  4*w[2][j];
+        Ww[3][j] = vmla_n_f32(vmla_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f);
+
+        // Ww[4][j] =  1*w[0][j] + -2*w[1][j] +  4*w[2][j];
+        Ww[4][j] = vmla_n_f32(vmls_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f);
+
+        // Ww[5][j] = 24*w[2][j];
+        Ww[5][j] = vmul_n_f32(w[2][j], 24.0f);
+      }
+
+      // Compute V = W w WT
+      for (int i = 0; i < 6; i++)
+      {
+        const float recip576 = 1.0f / 576.0f;
+
+        // V[i][0] =  6*Ww[i][0];
+        V[i][0] = vmul_n_f32(vmul_n_f32(Ww[i][0], 6.0f), recip576);
+
+        // V[i][1] = -4*Ww[i][0] + -4*Ww[i][1] + -4*Ww[i][2];
+        V[i][1] = vmul_n_f32(vmul_n_f32(vadd_f32(vadd_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), -4.0f), recip576);
+
+        // V[i][2] = -4*Ww[i][0] +  4*Ww[i][1] + -4*Ww[i][2];
+        V[i][2] = vmul_n_f32(vmul_n_f32(vsub_f32(vsub_f32(Ww[i][1], Ww[i][0]), Ww[i][2]), 4.0f), recip576);
+
+        // V[i][3] =  1*Ww[i][0] +  2*Ww[i][1] +  4*Ww[i][2];
+        V[i][3] = vmul_n_f32(vmla_n_f32(vmla_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576);
+
+        // V[i][4] =  1*Ww[i][0] + -2*Ww[i][1] +  4*Ww[i][2];
+        V[i][4] = vmul_n_f32(vmla_n_f32(vmls_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576);
+
+        // V[i][5] = 24*Ww[i][2];
+        V[i][5] = vmul_n_f32(vmul_n_f32(Ww[i][2], 24.0f), recip576);
+      }
+
+      // Store the transformed weights
+      for (int i = 0, m = 0; i < 6; i++)
+      {
+        for (int j = 0; j < 6; j++, m++)
+        {
+          vst1_f32(outptr + m*matrix_stride, V[i][j]);
+        }
+      }
+      outptr += 2;
+    }
+#endif  // __arm_any__
+    for (; channels_remaining; channels_remaining--)
+    {
+      // Matrices used and computed in this kernel
+      float w[3][3], Ww[6][3], V[6][6];
+
+      // Read weights
+      for (int i = 0; i < 3; i++)
+      {
+        for (int j = 0; j < 3; j++)
+        {
+          w[i][j] = *(inptrs[i][j]++);
+        }
+      }
+
+      // Compute the matrix W w
+      for (int j = 0; j < 3; j++)
+      {
+        Ww[0][j] =  6*w[0][j];
+        Ww[1][j] = -4*w[0][j] + -4*w[1][j] + -4*w[2][j];
+        Ww[2][j] = -4*w[0][j] +  4*w[1][j] + -4*w[2][j];
+        Ww[3][j] =  1*w[0][j] +  2*w[1][j] +  4*w[2][j];
+        Ww[4][j] =  1*w[0][j] + -2*w[1][j] +  4*w[2][j];
+        Ww[5][j] = 24*w[2][j];
+      }
+
+      // Compute V = W w WT
+      for (int i = 0; i < 6; i++)
+      {
+        V[i][0] = ( 6*Ww[i][0]) / 576.0f;
+        V[i][1] = (-4*Ww[i][0] + -4*Ww[i][1] + -4*Ww[i][2]) / 576.0f;
+        V[i][2] = (-4*Ww[i][0] +  4*Ww[i][1] + -4*Ww[i][2]) / 576.0f;
+        V[i][3] = ( 1*Ww[i][0] +  2*Ww[i][1] +  4*Ww[i][2]) / 576.0f;
+        V[i][4] = ( 1*Ww[i][0] + -2*Ww[i][1] +  4*Ww[i][2]) / 576.0f;
+        V[i][5] = (24*Ww[i][2]) / 576.0f;
+      }
+
+      // Store the transformed weights
+      for (int i = 0, m = 0; i < 6; i++)
+      {
+        for (int j = 0; j < 6; j++, m++)
+        {
+          *(outptr + m*matrix_stride) = V[i][j];
+        }
+      }
+      outptr++;
+    }
+  }
+}
+
+template class WeightTransform<3, 3, 6, 6, float, float, WinogradRoots::Integers>;
+
+}  // namespace winograd
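
The commented coefficient rows above (6; -4,-4,-4; -4,4,-4; 1,2,4; 1,-2,4; 24) are a scaled form of the F(4x4, 3x3) weight transform: keeping both 1-D passes in small integers and dividing once by 576 = 24*24 is equivalent to using the normalised matrix G = G'/24 throughout, which is what the recip576 factor folds in. A short numerical check of that identity:

#include <cassert>
#include <cmath>

// Scaled transform rows used above; the true G for F(4x4, 3x3) is G' / 24,
// so G g G^T == (G' g G'^T) / 576.
static const float Gp[6][3] = {
    {  6.0f,  0.0f,  0.0f },
    { -4.0f, -4.0f, -4.0f },
    { -4.0f,  4.0f, -4.0f },
    {  1.0f,  2.0f,  4.0f },
    {  1.0f, -2.0f,  4.0f },
    {  0.0f,  0.0f, 24.0f },
};

int main()
{
    const float g[3][3] = { { 1, 2, 3 }, { 4, 5, 6 }, { 7, 8, 9 } };

    float Gg[6][3] = {};  // Row pass with the scaled matrix.
    for (int i = 0; i < 6; i++)
        for (int j = 0; j < 3; j++)
            for (int k = 0; k < 3; k++)
                Gg[i][j] += Gp[i][k] * g[k][j];

    for (int i = 0; i < 6; i++)
        for (int j = 0; j < 6; j++)
        {
            float scaled = 0.0f, normalised = 0.0f;
            for (int k = 0; k < 3; k++)
            {
                scaled     += Gg[i][k] * Gp[j][k];
                normalised += (Gg[i][k] / 24.0f) * (Gp[j][k] / 24.0f);
            }
            assert(std::fabs(scaled / 576.0f - normalised) < 1e-3f);
        }
    return 0;
}
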
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_6_3_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_6_3_fp32_fp32_integers.cpp
new file mode 100644
index 0000000..4572348
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_6_3_fp32_fp32_integers.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm.hpp"
+#include "kernel.hpp"
+
+namespace winograd
+{
+
+template <>
+void WeightTransform<1, 3, 1, 8, float, float, WinogradRoots::Integers>::execute(
+  const int n_output_channels,
+  const int n_input_channels,
+  const float* const input,  // NOTE: Data in HWIO order
+  float* const output,
+  const int matrix_stride,
+  const int matrix_row_stride
+)
+{
+  // Get pointers to each cell of the weight tensor
+  const auto weight_col_stride = n_input_channels * n_output_channels;
+  const float *inptrs[3];
+  for (int j = 0; j < 3; j++)
+  {
+    inptrs[j] = input + j*weight_col_stride;
+  }
+
+  // For each input channel
+  for (int ic = 0; ic < n_input_channels; ic++)
+  {
+    float *outptr = output + ic * matrix_row_stride;
+
+    // For each output channel
+    int channels_remaining = n_output_channels;
+    for (; channels_remaining; channels_remaining--)
+    {
+      // Matrices used and computed in this kernel
+      float w[3], V[inner_tile_cols];
+
+      // Read weights
+      for (int j = 0; j < 3; j++)
+      {
+        w[j] = *(inptrs[j]++);
+      }
+
+      // Compute V = w WT
+      V[0] = (w[0]*-1) / 36.0f;
+      V[1] = (w[1]*-1 + w[0]*1 + w[2]*1) / 48.0f;
+      V[2] = (w[0]*1 + w[1]*1 + w[2]*1) / 48.0f;
+      V[3] = (w[0]*-1 + w[2]*-4 + w[1]*2) / 120.0f;
+      V[4] = (w[0]*-1 + w[2]*-4 + w[1]*-2) / 120.0f;
+      V[5] = (w[1]*-3 + w[2]*9 + w[0]*1) / 720.0f;
+      V[6] = (w[1]*3 + w[2]*9 + w[0]*1) / 720.0f;
+      V[7] = (w[2]*1) / 1.0f;
+
+      // Store the transformed weights
+      for (int j = 0; j < inner_tile_cols; j++)
+      {
+        *(outptr + j*matrix_stride) = V[j];
+      }
+      outptr++;
+    }
+  }
+}
+
+template class WeightTransform<1, 3, 1, 8, float, float, WinogradRoots::Integers>;
+template class WeightTransform<3, 1, 8, 1, float, float, WinogradRoots::Integers>;
+
+}  // namespace winograd
diff --git a/src/core/Rounding.cpp b/src/core/Rounding.cpp
index fea635b..da6e5f6 100644
--- a/src/core/Rounding.cpp
+++ b/src/core/Rounding.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,7 +50,13 @@
         }
         case RoundingPolicy::TO_NEAREST_EVEN:
         {
+#ifdef __aarch64__
+            asm("fcvtns %x[res], %s[value]"
+                : [res] "=r"(rounded)
+                : [value] "w"(x));
+#else  // __aarch64__
             ARM_COMPUTE_ERROR("TO_NEAREST_EVEN rounding policy is not supported.");
+#endif // __aarch64__
             break;
         }
         default:
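
fcvtns converts with the round-to-nearest, ties-to-even policy, which is what TO_NEAREST_EVEN promises and what std::round (round half away from zero) does not provide. A portable sketch of the same policy, assuming the floating-point environment is left in its default FE_TONEAREST mode:

#include <cassert>
#include <cmath>

// Round to nearest, ties to even -- a portable sketch of what the fcvtns
// instruction above does, assuming the default FE_TONEAREST rounding mode.
int round_to_nearest_even(float x)
{
    return static_cast<int>(std::nearbyintf(x));
}

int main()
{
    assert(round_to_nearest_even(2.5f)  == 2);    // tie rounds to even
    assert(round_to_nearest_even(3.5f)  == 4);    // tie rounds to even
    assert(round_to_nearest_even(-2.5f) == -2);
    assert(round_to_nearest_even(2.6f)  == 3);    // non-ties round normally
    return 0;
}
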
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index 73eaf64..589b737 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -326,24 +326,30 @@
     return res;
 }
 
-PadStrideInfo arm_compute::calculate_same_pad(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info, DataLayout data_layout)
+PadStrideInfo arm_compute::calculate_same_pad(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info, DataLayout data_layout, const Size2D &dilation)
 {
     const unsigned int width_idx       = arm_compute::get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
     const unsigned int height_idx      = arm_compute::get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
     const auto        &strides         = conv_info.stride();
     const int          out_width       = std::ceil(float(input_shape[width_idx]) / float(strides.first));
     const int          out_height      = std::ceil(float(input_shape[height_idx]) / float(strides.second));
-    const int          pad_width       = ((out_width - 1) * strides.first + weights_shape[width_idx] - input_shape[width_idx]);
-    const int          pad_height      = ((out_height - 1) * strides.second + weights_shape[height_idx] - input_shape[height_idx]);
+    const int          pad_width       = (out_width - 1) * strides.first + (weights_shape[width_idx] + (dilation.x() - 1) * (weights_shape[width_idx] - 1) - input_shape[width_idx]);
+    const int          pad_height      = (out_height - 1) * strides.second + (weights_shape[height_idx] + (dilation.y() - 1) * (weights_shape[height_idx] - 1) - input_shape[height_idx]);
     const int          same_pad_left   = pad_width / 2;
     const int          same_pad_top    = pad_height / 2;
     const int          same_pad_right  = pad_width - same_pad_left;
     const int          same_pad_bottom = pad_height - same_pad_top;
 
-    return PadStrideInfo(strides.first, strides.second, same_pad_left, same_pad_right, same_pad_top, same_pad_bottom, DimensionRoundingType::CEIL);
+    return { static_cast<unsigned int>(strides.first),
+             static_cast<unsigned int>(strides.second),
+             static_cast<unsigned int>(same_pad_left),
+             static_cast<unsigned int>(same_pad_right),
+             static_cast<unsigned int>(same_pad_top),
+             static_cast<unsigned int>(same_pad_bottom),
+             DimensionRoundingType::CEIL };
 }
 
-const std::pair<unsigned int, unsigned int> arm_compute::deconvolution_output_dimensions(
+std::pair<unsigned int, unsigned int> arm_compute::deconvolution_output_dimensions(
     unsigned int in_width, unsigned int in_height, unsigned int kernel_width, unsigned int kernel_height, unsigned int padx, unsigned int pady,
     unsigned int stride_x, unsigned int stride_y)
 {
@@ -356,10 +362,10 @@
     return std::make_pair<unsigned int, unsigned int>(w, h);
 }
 
-const std::pair<unsigned int, unsigned int> arm_compute::scaled_dimensions(unsigned int width, unsigned int height,
-                                                                           unsigned int kernel_width, unsigned int kernel_height,
-                                                                           const PadStrideInfo &pad_stride_info,
-                                                                           const Size2D        &dilation)
+std::pair<unsigned int, unsigned int> arm_compute::scaled_dimensions(unsigned int width, unsigned int height,
+                                                                     unsigned int kernel_width, unsigned int kernel_height,
+                                                                     const PadStrideInfo &pad_stride_info,
+                                                                     const Size2D        &dilation)
 {
     const unsigned int pad_left   = pad_stride_info.pad_left();
     const unsigned int pad_top    = pad_stride_info.pad_top();
@@ -383,18 +389,6 @@
             ARM_COMPUTE_ERROR("Unsupported rounding type");
     }
 
-    // Make sure that border operations will start from inside the input and not the padded area
-    if(((w - 1) * stride_x) >= (width + pad_left))
-    {
-        --w;
-    }
-    if(((h - 1) * stride_y) >= (height + pad_top))
-    {
-        --h;
-    }
-    ARM_COMPUTE_ERROR_ON(((w - 1) * stride_x) >= (width + pad_left));
-    ARM_COMPUTE_ERROR_ON(((h - 1) * stride_y) >= (height + pad_top));
-
     return std::make_pair(w, h);
 }
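
The new dilation term in calculate_same_pad enlarges the kernel to its effective extent, k_eff = k + (d - 1) * (k - 1), before computing the total padding; the left/top sides then take pad / 2 and the right/bottom the remainder, as before. A worked sketch of the per-axis arithmetic:

#include <cmath>
#include <cstdio>

// Total SAME padding along one axis, mirroring the updated formula above:
// pad = (ceil(in / stride) - 1) * stride + k_eff - in, with
// k_eff = k + (d - 1) * (k - 1).
int same_pad_total(int in, int k, int stride, int dilation)
{
    const int out   = static_cast<int>(std::ceil(float(in) / float(stride)));
    const int k_eff = k + (dilation - 1) * (k - 1);
    return (out - 1) * stride + k_eff - in;
}

int main()
{
    // 3x3 kernel, stride 1: dilation 1 needs 2 pixels of padding in total,
    // dilation 2 needs 4 (effective kernel extent 5).
    std::printf("%d %d\n", same_pad_total(224, 3, 1, 1),   // prints 2
                           same_pad_total(224, 3, 1, 2));  // prints 4
    return 0;
}
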
 
diff --git a/src/core/utils/helpers/fft.cpp b/src/core/utils/helpers/fft.cpp
new file mode 100644
index 0000000..7ff2fdf
--- /dev/null
+++ b/src/core/utils/helpers/fft.cpp
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/utils/helpers/fft.h"
+
+#include <numeric>
+
+namespace arm_compute
+{
+namespace helpers
+{
+namespace fft
+{
+std::vector<unsigned int> decompose_stages(unsigned int N, const std::set<unsigned int> &supported_factors)
+{
+    std::vector<unsigned int> stages;
+    unsigned int              res = N;
+
+    // Early exit if no supported factors are provided
+    if(supported_factors.empty())
+    {
+        return stages;
+    }
+
+    // Create a reverse iterator (start decomposing from the largest supported factor)
+    auto rfactor_it = supported_factors.rbegin();
+
+    // Decomposition step
+    while(res != 0)
+    {
+        const unsigned int factor = *rfactor_it;
+        if(0 == (res % factor) && res >= factor)
+        {
+            stages.push_back(factor);
+            res /= factor;
+        }
+        else
+        {
+            ++rfactor_it;
+            if(rfactor_it == supported_factors.rend())
+            {
+                if(res > 1)
+                {
+                    // Couldn't decompose with given factors
+                    stages.clear();
+                    return stages;
+                }
+                else
+                {
+                    res = 0;
+                }
+            }
+        }
+    }
+
+    return stages;
+}
+
+std::vector<unsigned int> digit_reverse_indices(unsigned int N, const std::vector<unsigned int> &fft_stages)
+{
+    std::vector<unsigned int> idx_digit_reverse;
+
+    // Early exit if the product of the FFT stages does not match N
+    const unsigned int stages_prod = std::accumulate(std::begin(fft_stages), std::end(fft_stages), 1U, std::multiplies<unsigned int>());
+    if(stages_prod != N)
+    {
+        return idx_digit_reverse;
+    }
+
+    // Resize digit reverse vector
+    idx_digit_reverse.resize(N);
+
+    // Get number of radix stages
+    unsigned int n_stages = fft_stages.size();
+
+    // Scan elements
+    for(unsigned int n = 0; n < N; ++n)
+    {
+        unsigned int k  = n;
+        unsigned int Nx = fft_stages[0];
+
+        // Scan stages
+        for(unsigned int s = 1; s < n_stages; ++s)
+        {
+            // Radix of the s-th stage
+            unsigned int Ny = fft_stages[s];
+            unsigned int Ni = Ny * Nx;
+
+            // Update k index
+            k = (k * Ny) % Ni + (k / Nx) % Ny + Ni * (k / Ni);
+
+            // Update Nx
+            Nx *= Ny;
+        }
+
+        // k is the digit-reversed index of n
+        idx_digit_reverse[n] = k;
+    }
+
+    return idx_digit_reverse;
+}
+} // namespace fft
+} // namespace helpers
+} // namespace arm_compute
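
decompose_stages() factorises N greedily from the largest supported radix downwards and returns an empty vector when N cannot be expressed in the given factors; digit_reverse_indices() then yields the mixed-radix input permutation for that stage list. A small usage sketch against the header included above:

#include "arm_compute/core/utils/helpers/fft.h"

#include <cstdio>
#include <set>

int main()
{
    using namespace arm_compute::helpers::fft;

    // 12 = 4 * 3 when decomposing from the largest factor down.
    const std::set<unsigned int> factors{ 2, 3, 4, 5 };
    const auto stages = decompose_stages(12, factors);
    for (auto s : stages) { std::printf("%u ", s); }   // prints "4 3"
    std::printf("\n");

    // Mixed-radix digit-reversal permutation for those stages.
    const auto idx = digit_reverse_indices(12, stages);
    for (auto i : idx) { std::printf("%u ", i); }
    std::printf("\n");
    return 0;
}
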
diff --git a/src/core/utils/helpers/tensor_transform.cpp b/src/core/utils/helpers/tensor_transform.cpp
index 08803c7..f6a54a5 100644
--- a/src/core/utils/helpers/tensor_transform.cpp
+++ b/src/core/utils/helpers/tensor_transform.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -114,7 +114,10 @@
                                                                                  Coordinates starts, Coordinates ends, Coordinates strides,
                                                                                  int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
 {
-    Coordinates starts_abs, ends_abs, final_strides;
+    Coordinates starts_abs{};
+    Coordinates ends_abs{};
+    Coordinates final_strides{};
+
     for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
     {
         const int start_i = calculate_start_on_index(input_shape, i, starts, strides, begin_mask);
diff --git a/src/core/utils/logging/LoggerRegistry.cpp b/src/core/utils/logging/LoggerRegistry.cpp
index 99236d2..055e770 100644
--- a/src/core/utils/logging/LoggerRegistry.cpp
+++ b/src/core/utils/logging/LoggerRegistry.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,12 +42,12 @@
     return _instance;
 }
 
-void LoggerRegistry::create_logger(const std::string &name, LogLevel log_level, std::vector<std::shared_ptr<Printer>> printers)
+void LoggerRegistry::create_logger(const std::string &name, LogLevel log_level, const std::vector<std::shared_ptr<Printer>> &printers)
 {
     std::lock_guard<arm_compute::Mutex> lock(_mtx);
     if((_loggers.find(name) == _loggers.end()) && (_reserved_loggers.find(name) == _reserved_loggers.end()))
     {
-        _loggers[name] = std::make_shared<Logger>(name, log_level, std::move(printers));
+        _loggers[name] = std::make_shared<Logger>(name, log_level, printers);
     }
 }
 
@@ -66,7 +66,7 @@
     return (_loggers.find(name) != _loggers.end()) ? _loggers[name] : nullptr;
 }
 
-void LoggerRegistry::create_reserved_loggers(LogLevel log_level, std::vector<std::shared_ptr<Printer>> printers)
+void LoggerRegistry::create_reserved_loggers(LogLevel log_level, const std::vector<std::shared_ptr<Printer>> &printers)
 {
     std::lock_guard<arm_compute::Mutex> lock(_mtx);
     for(const auto &r : _reserved_loggers)
diff --git a/src/core/utils/quantization/AsymmHelpers.cpp b/src/core/utils/quantization/AsymmHelpers.cpp
index ea9ba77..d606adb 100644
--- a/src/core/utils/quantization/AsymmHelpers.cpp
+++ b/src/core/utils/quantization/AsymmHelpers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,12 +29,12 @@
 
 using namespace arm_compute::quantization;
 
-constexpr int64_t fixed_point_one_Q0 = (1ll << 31);
+constexpr int64_t fixed_point_one_Q0 = (1LL << 31);
 constexpr float   epsilon            = 0.00001f;
 
 arm_compute::Status arm_compute::quantization::calculate_quantized_multiplier_less_than_one(float multiplier,
-                                                                                            int   *quant_multiplier,
-                                                                                            int   *right_shift)
+                                                                                            int *quant_multiplier,
+                                                                                            int *right_shift)
 {
     ARM_COMPUTE_RETURN_ERROR_ON(quant_multiplier == nullptr);
     ARM_COMPUTE_RETURN_ERROR_ON(right_shift == nullptr);
@@ -71,8 +71,8 @@
 }
 
 arm_compute::Status arm_compute::quantization::calculate_quantized_multiplier_greater_than_one(float multiplier,
-                                                                                               int   *quantized_multiplier,
-                                                                                               int   *left_shift)
+                                                                                               int *quantized_multiplier,
+                                                                                               int *left_shift)
 {
     ARM_COMPUTE_RETURN_ERROR_ON(quantized_multiplier == nullptr);
     ARM_COMPUTE_RETURN_ERROR_ON(left_shift == nullptr);
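
For context, calculate_quantized_multiplier_less_than_one() splits a real multiplier M in (0, 1) into a Q31 integer mantissa q and a right shift s such that M ~= q * 2^-31 * 2^-s, with fixed_point_one_Q0 = 2^31 as the fixed-point one. A minimal sketch of that frexp-based decomposition — not the library function itself, and without its epsilon handling and saturation checks:

#include <cassert>
#include <cmath>
#include <cstdint>

// Minimal sketch: decompose m in (0, 1) as m ~= q * 2^-31 * 2^-s with q a
// Q31 fixed-point value in [2^30, 2^31).
void quantize_multiplier_sketch(float m, int32_t *q, int *right_shift)
{
    int exponent = 0;
    const double mantissa = std::frexp(m, &exponent);  // m = mantissa * 2^exponent, mantissa in [0.5, 1)
    *right_shift = -exponent;
    int64_t q64 = static_cast<int64_t>(std::round(mantissa * (1LL << 31)));
    if (q64 == (1LL << 31))  // Rounding can carry up to 2^31; renormalise.
    {
        q64 /= 2;
        --*right_shift;
    }
    *q = static_cast<int32_t>(q64);
}

int main()
{
    int32_t q = 0;
    int     s = 0;
    quantize_multiplier_sketch(0.3f, &q, &s);
    const double back = double(q) / double(1LL << 31) * std::ldexp(1.0, -s);
    assert(std::fabs(back - 0.3) < 1e-6);  // Round-trips to the original multiplier.
    return 0;
}
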
diff --git a/src/graph/Graph.cpp b/src/graph/Graph.cpp
index 88e2682..9d437b1 100644
--- a/src/graph/Graph.cpp
+++ b/src/graph/Graph.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -152,7 +152,7 @@
     return true;
 }
 
-TensorID Graph::create_tensor(TensorDescriptor desc)
+TensorID Graph::create_tensor(const TensorDescriptor &desc)
 {
     TensorID tid    = _tensors.size();
     auto     tensor = support::cpp14::make_unique<Tensor>(tid, desc);
diff --git a/src/graph/GraphBuilder.cpp b/src/graph/GraphBuilder.cpp
index a944d2c..5db9540 100644
--- a/src/graph/GraphBuilder.cpp
+++ b/src/graph/GraphBuilder.cpp
@@ -30,15 +30,19 @@
 
 #include "support/ToolchainSupport.h"
 
-#define CHECK_NODEIDX_PAIR(pair, g) \
-    ARM_COMPUTE_ERROR_ON(((pair).node_id >= (g).nodes().size()) || ((g).node((pair).node_id) == nullptr) || ((pair).index >= (g).node((pair).node_id)->num_outputs()));
-
 namespace arm_compute
 {
 namespace graph
 {
 namespace
 {
+inline void check_nodeidx_pair(const NodeIdxPair &pair, const Graph &g)
+{
+    ARM_COMPUTE_UNUSED(pair);
+    ARM_COMPUTE_UNUSED(g);
+    ARM_COMPUTE_ERROR_ON((pair.node_id >= g.nodes().size()) || (g.node(pair.node_id) == nullptr) || (pair.index >= g.node(pair.node_id)->num_outputs()));
+}
+
 Status set_node_params(Graph &g, NodeID nid, NodeParams &params)
 {
     INode *node = g.node(nid);
@@ -62,10 +66,10 @@
     return Status{};
 }
 
-NodeID add_const_node_with_name(Graph &g, NodeParams params, const std::string &name, TensorDescriptor desc, ITensorAccessorUPtr accessor)
+NodeID add_const_node_with_name(Graph &g, NodeParams params, const std::string &name, const TensorDescriptor &desc, ITensorAccessorUPtr accessor)
 {
     params.name = params.name.empty() ? "" : params.name + name;
-    auto nid    = GraphBuilder::add_const_node(g, params, std::move(desc), std::move(accessor));
+    auto nid    = GraphBuilder::add_const_node(g, params, desc, std::move(accessor));
     set_node_params(g, nid, params);
     return nid;
 }
@@ -73,7 +77,7 @@
 template <typename NT, typename... Args>
 NodeID create_simple_single_input_output_node(Graph &g, NodeParams &params, NodeIdxPair input, Args &&... args)
 {
-    CHECK_NODEIDX_PAIR(input, g);
+    check_nodeidx_pair(input, g);
 
     NodeID nid = g.add_node<NT>(std::forward<Args>(args)...);
     g.add_connection(input.node_id, input.index, nid, 0);
@@ -81,9 +85,27 @@
 
     return nid;
 }
+
+template <typename NT, typename... Args>
+NodeID create_simple_multiple_input_single_output_node(Graph &g, NodeParams &params, const std::vector<NodeIdxPair> &inputs, Args &&... args)
+{
+    ARM_COMPUTE_ERROR_ON(inputs.size() == 0);
+
+    NodeID nid = g.add_node<NT>(std::forward<Args>(args)...);
+
+    unsigned int i = 0;
+    for(const auto &input : inputs)
+    {
+        check_nodeidx_pair(input, g);
+        g.add_connection(input.node_id, input.index, nid, i++);
+    }
+    set_node_params(g, nid, params);
+
+    return nid;
+}
 } // namespace
 
-NodeID GraphBuilder::add_const_node(Graph &g, NodeParams params, TensorDescriptor desc, ITensorAccessorUPtr accessor)
+NodeID GraphBuilder::add_const_node(Graph &g, NodeParams params, const TensorDescriptor &desc, ITensorAccessorUPtr accessor)
 {
     auto nid = g.add_node<ConstNode>(desc);
     set_node_params(g, nid, params);
@@ -91,7 +113,7 @@
     return nid;
 }
 
-NodeID GraphBuilder::add_input_node(Graph &g, NodeParams params, TensorDescriptor desc, ITensorAccessorUPtr accessor)
+NodeID GraphBuilder::add_input_node(Graph &g, NodeParams params, const TensorDescriptor &desc, ITensorAccessorUPtr accessor)
 {
     auto nid = g.add_node<InputNode>(desc);
     set_node_params(g, nid, params);
@@ -101,7 +123,7 @@
 
 NodeID GraphBuilder::add_output_node(Graph &g, NodeParams params, NodeIdxPair input, ITensorAccessorUPtr accessor)
 {
-    CHECK_NODEIDX_PAIR(input, g);
+    check_nodeidx_pair(input, g);
 
     NodeID nid = g.add_node<OutputNode>();
     g.add_connection(input.node_id, input.index, nid, 0);
@@ -111,16 +133,17 @@
     return nid;
 }
 
-NodeID GraphBuilder::add_activation_node(Graph &g, NodeParams params, NodeIdxPair input, ActivationLayerInfo act_info)
+NodeID GraphBuilder::add_activation_node(Graph &g, NodeParams params, NodeIdxPair input, ActivationLayerInfo act_info,
+                                         const QuantizationInfo out_quant_info)
 {
-    return create_simple_single_input_output_node<ActivationLayerNode>(g, params, input, act_info);
+    return create_simple_single_input_output_node<ActivationLayerNode>(g, params, input, act_info, out_quant_info);
 }
 
 NodeID GraphBuilder::add_batch_normalization_node(Graph &g, NodeParams params, NodeIdxPair input, float epsilon,
                                                   ITensorAccessorUPtr mean_accessor, ITensorAccessorUPtr var_accessor,
                                                   ITensorAccessorUPtr beta_accessor, ITensorAccessorUPtr gamma_accessor)
 {
-    CHECK_NODEIDX_PAIR(input, g);
+    check_nodeidx_pair(input, g);
 
     bool has_beta  = (beta_accessor != nullptr);
     bool has_gamma = (gamma_accessor != nullptr);
@@ -170,8 +193,8 @@
 
 NodeID GraphBuilder::add_bounding_box_transform_node(Graph &g, NodeParams params, NodeIdxPair input, NodeIdxPair deltas, BoundingBoxTransformInfo info)
 {
-    CHECK_NODEIDX_PAIR(input, g);
-    CHECK_NODEIDX_PAIR(deltas, g);
+    check_nodeidx_pair(input, g);
+    check_nodeidx_pair(deltas, g);
 
     NodeID nid = g.add_node<BoundingBoxTransformLayerNode>(info);
 
@@ -194,7 +217,7 @@
                                           const QuantizationInfo weights_quant_info,
                                           const QuantizationInfo out_quant_info)
 {
-    CHECK_NODEIDX_PAIR(input, g);
+    check_nodeidx_pair(input, g);
     ARM_COMPUTE_ERROR_ON(depth == 0);
     ARM_COMPUTE_ERROR_ON((kernel_spatial_extend.width == 0) || (kernel_spatial_extend.height == 0));
 
@@ -202,14 +225,15 @@
 
     // Get input tensor descriptor
     const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]);
+    const DataLayout       input_data_layout = input_tensor_desc.layout;
 
     // Create weights node
     TensorDescriptor w_desc = input_tensor_desc;
-    w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::WIDTH), kernel_spatial_extend.width);
-    w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::HEIGHT), kernel_spatial_extend.height);
-    w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::CHANNEL),
+    w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::WIDTH), kernel_spatial_extend.width);
+    w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::HEIGHT), kernel_spatial_extend.height);
+    w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::CHANNEL),
                      get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL) / num_groups);
-    w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::BATCHES), depth);
+    w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::BATCHES), depth);
     if(!weights_quant_info.empty())
     {
         w_desc.quant_info = weights_quant_info;
@@ -248,7 +272,7 @@
                                             Size2D inner_border, ITensorAccessorUPtr weights_accessor,
                                             ITensorAccessorUPtr bias_accessor)
 {
-    CHECK_NODEIDX_PAIR(input, g);
+    check_nodeidx_pair(input, g);
     ARM_COMPUTE_ERROR_ON(depth == 0);
     ARM_COMPUTE_ERROR_ON((kernel_spatial_extend.width == 0) || (kernel_spatial_extend.height == 0));
 
@@ -256,14 +280,15 @@
 
     // Get input tensor descriptor
     const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]);
+    const DataLayout       input_data_layout = input_tensor_desc.layout;
 
     // Create weights node
     TensorDescriptor w_desc = input_tensor_desc;
-    w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::WIDTH), kernel_spatial_extend.width);
-    w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::HEIGHT), kernel_spatial_extend.height);
-    w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::CHANNEL),
+    w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::WIDTH), kernel_spatial_extend.width);
+    w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::HEIGHT), kernel_spatial_extend.height);
+    w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::CHANNEL),
                      get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL));
-    w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::BATCHES), depth);
+    w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::BATCHES), depth);
 
     NodeID w_nid = add_const_node_with_name(g, params, "Weights", w_desc, std::move(weights_accessor));
 
@@ -293,40 +318,29 @@
     return deconv_nid;
 }
 
-NodeID GraphBuilder::add_concatenate_node(Graph &g, NodeParams params, std::vector<NodeIdxPair> inputs, DataLayoutDimension axis)
+NodeID GraphBuilder::add_concatenate_node(Graph &g, NodeParams params, const std::vector<NodeIdxPair> &inputs, descriptors::ConcatLayerDescriptor concat_descriptor)
 {
-    ARM_COMPUTE_ERROR_ON(inputs.size() == 0);
-
-    NodeID nid = g.add_node<ConcatenateLayerNode>(inputs.size(), axis);
-
-    unsigned int i = 0;
-    for(const auto &input : inputs)
-    {
-        CHECK_NODEIDX_PAIR(input, g);
-        g.add_connection(input.node_id, input.index, nid, i++);
-    }
-    set_node_params(g, nid, params);
-
-    return nid;
+    return create_simple_multiple_input_single_output_node<ConcatenateLayerNode>(g, params, inputs, inputs.size(), concat_descriptor);
 }
 
 NodeID GraphBuilder::add_depthwise_convolution_node(Graph &g, NodeParams params, NodeIdxPair input, Size2D kernel_spatial_extend,
                                                     PadStrideInfo conv_info, int depth_multiplier, DepthwiseConvolutionMethod method,
-                                                    ITensorAccessorUPtr weights_accessor, ITensorAccessorUPtr bias_accessor, const QuantizationInfo quant_info)
+                                                    ITensorAccessorUPtr weights_accessor, ITensorAccessorUPtr bias_accessor, const QuantizationInfo quant_info, const QuantizationInfo out_quant_info)
 {
-    CHECK_NODEIDX_PAIR(input, g);
+    check_nodeidx_pair(input, g);
     ARM_COMPUTE_ERROR_ON((kernel_spatial_extend.width == 0) || (kernel_spatial_extend.height == 0));
 
     bool has_bias = (bias_accessor != nullptr);
 
     // Get input tensor descriptor
     const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]);
+    const DataLayout       input_data_layout = input_tensor_desc.layout;
 
     // Create weights node
     TensorDescriptor w_desc = input_tensor_desc;
-    w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::WIDTH), kernel_spatial_extend.width);
-    w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::HEIGHT), kernel_spatial_extend.height);
-    w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::CHANNEL),
+    w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::WIDTH), kernel_spatial_extend.width);
+    w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::HEIGHT), kernel_spatial_extend.height);
+    w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::CHANNEL),
                      get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL) * depth_multiplier);
     if(!quant_info.empty())
     {
@@ -351,7 +365,7 @@
     }
 
     // Create convolution node and connect
-    NodeID conv_nid = g.add_node<DepthwiseConvolutionLayerNode>(conv_info, depth_multiplier, method);
+    NodeID conv_nid = g.add_node<DepthwiseConvolutionLayerNode>(conv_info, depth_multiplier, method, out_quant_info);
     g.add_connection(input.node_id, input.index, conv_nid, 0);
     g.add_connection(w_nid, 0, conv_nid, 1);
     if(has_bias)
@@ -362,11 +376,11 @@
 
     return conv_nid;
 }
-NodeID GraphBuilder::add_detection_output_node(Graph &g, NodeParams params, NodeIdxPair input_loc, NodeIdxPair input_conf, NodeIdxPair input_priorbox, DetectionOutputLayerInfo detect_info)
+NodeID GraphBuilder::add_detection_output_node(Graph &g, NodeParams params, NodeIdxPair input_loc, NodeIdxPair input_conf, NodeIdxPair input_priorbox, const DetectionOutputLayerInfo &detect_info)
 {
-    CHECK_NODEIDX_PAIR(input_loc, g);
-    CHECK_NODEIDX_PAIR(input_conf, g);
-    CHECK_NODEIDX_PAIR(input_priorbox, g);
+    check_nodeidx_pair(input_loc, g);
+    check_nodeidx_pair(input_conf, g);
+    check_nodeidx_pair(input_priorbox, g);
 
     // Create detection_output node and connect
     NodeID detect_nid = g.add_node<DetectionOutputLayerNode>(detect_info);
@@ -386,8 +400,8 @@
 
 NodeID GraphBuilder::add_elementwise_node(Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, EltwiseOperation operation)
 {
-    CHECK_NODEIDX_PAIR(input0, g);
-    CHECK_NODEIDX_PAIR(input1, g);
+    check_nodeidx_pair(input0, g);
+    check_nodeidx_pair(input1, g);
 
     NodeID nid = g.add_node<EltwiseLayerNode>(operation);
 
@@ -405,11 +419,38 @@
 }
 
 NodeID GraphBuilder::add_fully_connected_layer(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_outputs,
+                                               NodeID weights_nid, NodeID bias_nid,
+                                               const FullyConnectedLayerInfo fc_info, const QuantizationInfo out_quant_info)
+{
+    check_nodeidx_pair(input, g);
+    ARM_COMPUTE_ERROR_ON(num_outputs == 0);
+    ARM_COMPUTE_ERROR_ON(weights_nid == EmptyNodeID);
+
+    const bool has_bias = (bias_nid != EmptyNodeID);
+
+    // Get input tensor descriptor
+    const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]);
+
+    // Create fully connected node and connect
+    NodeID fc_nid = g.add_node<FullyConnectedLayerNode>(num_outputs, out_quant_info, fc_info);
+    g.add_connection(input.node_id, input.index, fc_nid, 0);
+    g.add_connection(weights_nid, 0, fc_nid, 1);
+    if(has_bias)
+    {
+        g.add_connection(bias_nid, 0, fc_nid, 2);
+    }
+
+    set_node_params(g, fc_nid, params);
+
+    return fc_nid;
+}
+
+NodeID GraphBuilder::add_fully_connected_layer(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_outputs,
                                                ITensorAccessorUPtr weights_accessor, ITensorAccessorUPtr bias_accessor,
                                                const FullyConnectedLayerInfo fc_info,
                                                const QuantizationInfo weights_quant_info, const QuantizationInfo out_quant_info)
 {
-    CHECK_NODEIDX_PAIR(input, g);
+    check_nodeidx_pair(input, g);
     ARM_COMPUTE_ERROR_ON(num_outputs == 0);
 
     bool has_bias = (bias_accessor != nullptr);
@@ -450,9 +491,9 @@
 
 NodeID GraphBuilder::add_generate_proposals_node(Graph &g, NodeParams params, NodeIdxPair scores, NodeIdxPair deltas, NodeIdxPair anchors, GenerateProposalsInfo info)
 {
-    CHECK_NODEIDX_PAIR(scores, g);
-    CHECK_NODEIDX_PAIR(deltas, g);
-    CHECK_NODEIDX_PAIR(anchors, g);
+    check_nodeidx_pair(scores, g);
+    check_nodeidx_pair(deltas, g);
+    check_nodeidx_pair(anchors, g);
 
     NodeID nid = g.add_node<GenerateProposalsLayerNode>(info);
 
@@ -472,7 +513,7 @@
 NodeID GraphBuilder::add_normalize_planar_yuv_node(Graph &g, NodeParams params, NodeIdxPair input,
                                                    ITensorAccessorUPtr mean_accessor, ITensorAccessorUPtr std_accessor)
 {
-    CHECK_NODEIDX_PAIR(input, g);
+    check_nodeidx_pair(input, g);
 
     // Get input tensor descriptor
     const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]);
@@ -510,10 +551,10 @@
     return create_simple_single_input_output_node<PoolingLayerNode>(g, params, input, pool_info);
 }
 
-NodeID GraphBuilder::add_priorbox_node(Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, PriorBoxLayerInfo prior_info)
+NodeID GraphBuilder::add_priorbox_node(Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, const PriorBoxLayerInfo &prior_info)
 {
-    CHECK_NODEIDX_PAIR(input0, g);
-    CHECK_NODEIDX_PAIR(input1, g);
+    check_nodeidx_pair(input0, g);
+    check_nodeidx_pair(input1, g);
 
     // Create priorbox node and connect
     NodeID prior_nid = g.add_node<PriorBoxLayerNode>(prior_info);
@@ -543,8 +584,8 @@
 
 NodeID GraphBuilder::add_roi_align_node(Graph &g, NodeParams params, NodeIdxPair input, NodeIdxPair rois, ROIPoolingLayerInfo pool_info)
 {
-    CHECK_NODEIDX_PAIR(input, g);
-    CHECK_NODEIDX_PAIR(rois, g);
+    check_nodeidx_pair(input, g);
+    check_nodeidx_pair(rois, g);
 
     NodeID nid = g.add_node<ROIAlignLayerNode>(pool_info);
 
@@ -557,17 +598,18 @@
 
 NodeID GraphBuilder::add_scale_layer(Graph &g, const NodeParams &params, NodeIdxPair input, ITensorAccessorUPtr mul_accessor, ITensorAccessorUPtr add_accessor)
 {
-    CHECK_NODEIDX_PAIR(input, g);
+    check_nodeidx_pair(input, g);
 
     // Get input tensor descriptor
     const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]);
+    const DataLayout       input_data_layout = input_tensor_desc.layout;
 
     // Create mul node
     TensorDescriptor mul_desc = input_tensor_desc;
-    const size_t     C        = input_tensor_desc.shape[get_dimension_idx(mul_desc, DataLayoutDimension::CHANNEL)];
-    mul_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::WIDTH), 1);
-    mul_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::HEIGHT), 1);
-    mul_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::CHANNEL), C);
+    const size_t     C        = input_tensor_desc.shape[get_dimension_idx(input_data_layout, DataLayoutDimension::CHANNEL)];
+    mul_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::WIDTH), 1);
+    mul_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::HEIGHT), 1);
+    mul_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::CHANNEL), C);
     NodeID      mul_const_nid   = add_const_node_with_name(g, params, "Mul", mul_desc, std::move(mul_accessor));
     NodeIdxPair mul_const_nidxp = { mul_const_nid, 0 };
 
@@ -599,6 +641,11 @@
     return create_simple_single_input_output_node<SplitLayerNode>(g, params, input, num_splits, axis);
 }
 
+NodeID GraphBuilder::add_stack_node(Graph &g, NodeParams params, const std::vector<NodeIdxPair> &inputs, int axis)
+{
+    return create_simple_multiple_input_single_output_node<StackLayerNode>(g, params, inputs, inputs.size(), axis);
+}
+
 NodeID GraphBuilder::add_upsample_node(Graph &g, NodeParams params, NodeIdxPair input, Size2D info, InterpolationPolicy upsampling_policy)
 {
     return create_simple_single_input_output_node<UpsampleLayerNode>(g, params, input, info, upsampling_policy);
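Note: a hedged usage sketch of the reworked multi-input builders; g, params and the producer node IDs (branch0_nid, branch1_nid) are assumed to exist, and the single-axis ConcatLayerDescriptor constructor is assumed from the descriptors header:

    std::vector<NodeIdxPair> branches = { { branch0_nid, 0 }, { branch1_nid, 0 } };
    // Concatenate along the channel axis; the descriptor can also carry an output QuantizationInfo
    NodeID concat_nid = GraphBuilder::add_concatenate_node(g, params, branches,
                                                           descriptors::ConcatLayerDescriptor(DataLayoutDimension::CHANNEL));
    // Stack the same inputs along a new axis
    NodeID stack_nid = GraphBuilder::add_stack_node(g, params, branches, 3);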
diff --git a/src/graph/GraphManager.cpp b/src/graph/GraphManager.cpp
index 57c5f9d..4f942b9 100644
--- a/src/graph/GraphManager.cpp
+++ b/src/graph/GraphManager.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,9 +45,6 @@
 
 void GraphManager::finalize_graph(Graph &graph, GraphContext &ctx, PassManager &pm, Target target)
 {
-    // Setup graph context if not done manually
-    setup_default_graph_context(ctx);
-
     // Check if graph has been registered
     if(_workloads.find(graph.id()) != std::end(_workloads))
     {
@@ -55,7 +52,7 @@
     }
 
     // Force target to all graph construct
-    // TODO (geopin01) : Support heterogeneous execution
+    // TODO (COMPMID-2014) : Support heterogeneous execution
     Target forced_target = target;
     if(!is_target_supported(target))
     {
@@ -64,6 +61,10 @@
     }
     force_target_to_graph(graph, forced_target);
 
+    // Setup backend context
+    // TODO (COMPMID-2014) : Setup all backends needed by the graph
+    setup_requested_backend_context(ctx, forced_target);
+
     // Configure all tensors
     detail::configure_all_tensors(graph);
 
diff --git a/src/graph/Tensor.cpp b/src/graph/Tensor.cpp
index 9850128..205ef11 100644
--- a/src/graph/Tensor.cpp
+++ b/src/graph/Tensor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -108,7 +108,7 @@
     _bound_edges.erase(eid);
 }
 
-const std::set<EdgeID> Tensor::bound_edges() const
+std::set<EdgeID> Tensor::bound_edges() const
 {
     return _bound_edges;
 }
diff --git a/src/graph/TypeLoader.cpp b/src/graph/TypeLoader.cpp
index e0ba7e2..b63672b 100644
--- a/src/graph/TypeLoader.cpp
+++ b/src/graph/TypeLoader.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -17,7 +17,7 @@
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWNISE, ARISING FROM,
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
@@ -100,5 +100,55 @@
     }
 #endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
 }
+
+ConvolutionMethod Convolution_method_from_name(const std::string &name)
+{
+    static const std::map<std::string, ConvolutionMethod> methods =
+    {
+        { "default", ConvolutionMethod::Default },
+        { "direct", ConvolutionMethod::Direct },
+        { "gemm", ConvolutionMethod::GEMM },
+        { "winograd", ConvolutionMethod::Winograd },
+    };
+
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
+    try
+    {
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
+        return methods.at(arm_compute::utility::tolower(name));
+
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
+    }
+    catch(const std::out_of_range &)
+    {
+        throw std::invalid_argument(name);
+    }
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
+}
+
+DepthwiseConvolutionMethod depthwise_convolution_method_from_name(const std::string &name)
+{
+    static const std::map<std::string, DepthwiseConvolutionMethod> methods =
+    {
+        { "default", DepthwiseConvolutionMethod::Default },
+        { "gemv", DepthwiseConvolutionMethod::GEMV },
+        { "optimized3x3", DepthwiseConvolutionMethod::Optimized3x3 },
+    };
+
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
+    try
+    {
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
+        return methods.at(arm_compute::utility::tolower(name));
+
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
+    }
+    catch(const std::out_of_range &)
+    {
+        throw std::invalid_argument(name);
+    }
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
+}
+
 } // namespace graph
 } // namespace arm_compute
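Note: both loaders lower-case the input before the lookup, so matching is case-insensitive; unknown names surface as std::invalid_argument when exceptions are enabled. For instance:

    ConvolutionMethod          cm  = Convolution_method_from_name("Winograd");       // ConvolutionMethod::Winograd
    DepthwiseConvolutionMethod dcm = depthwise_convolution_method_from_name("gemv"); // DepthwiseConvolutionMethod::GEMV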
diff --git a/src/graph/Utils.cpp b/src/graph/Utils.cpp
index 71ec548..4c34dd8 100644
--- a/src/graph/Utils.cpp
+++ b/src/graph/Utils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -104,13 +104,14 @@
     }
 }
 
-void setup_default_graph_context(GraphContext &ctx)
+void setup_requested_backend_context(GraphContext &ctx, Target target)
 {
-    for(const auto &backend : backends::BackendRegistry::get().backends())
+    if(backends::BackendRegistry::get().contains(target))
     {
-        if(backend.second->is_backend_supported())
+        const auto &backend = backends::BackendRegistry::get().find_backend(target);
+        if(backend->is_backend_supported())
         {
-            backend.second->setup_backend_context(ctx);
+            backend->setup_backend_context(ctx);
         }
     }
 }
@@ -118,12 +119,12 @@
 size_t get_dimension_size(const TensorDescriptor &descriptor, const DataLayoutDimension data_layout_dimension)
 {
     ARM_COMPUTE_ERROR_ON_MSG(descriptor.layout == DataLayout::UNKNOWN, "Cannot retrieve the dimension index for an unknown layout!");
-    return descriptor.shape[get_dimension_idx(descriptor, data_layout_dimension)];
+    return descriptor.shape[get_dimension_idx(descriptor.layout, data_layout_dimension)];
 }
 
-size_t get_dimension_idx(const TensorDescriptor &descriptor, const DataLayoutDimension data_layout_dimension)
+size_t get_dimension_idx(DataLayout data_layout, const DataLayoutDimension data_layout_dimension)
 {
-    ARM_COMPUTE_ERROR_ON_MSG(descriptor.layout == DataLayout::UNKNOWN, "Cannot retrieve the dimension index for an unknown layout!");
+    ARM_COMPUTE_ERROR_ON_MSG(data_layout == DataLayout::UNKNOWN, "Cannot retrieve the dimension index for an unknown layout!");
 
     /* Return the index based on the data layout
      * [N C H W]
@@ -133,13 +134,13 @@
     switch(data_layout_dimension)
     {
         case DataLayoutDimension::CHANNEL:
-            return (descriptor.layout == DataLayout::NCHW) ? 2 : 0;
+            return (data_layout == DataLayout::NCHW) ? 2 : 0;
             break;
         case DataLayoutDimension::HEIGHT:
-            return (descriptor.layout == DataLayout::NCHW) ? 1 : 2;
+            return (data_layout == DataLayout::NCHW) ? 1 : 2;
             break;
         case DataLayoutDimension::WIDTH:
-            return (descriptor.layout == DataLayout::NCHW) ? 0 : 1;
+            return (data_layout == DataLayout::NCHW) ? 0 : 1;
             break;
         case DataLayoutDimension::BATCHES:
             return 3;
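Note: with the layout passed directly, callers no longer need to build a full descriptor. Following the switch above:

    const size_t w_nchw = get_dimension_idx(DataLayout::NCHW, DataLayoutDimension::WIDTH);   // 0
    const size_t c_nhwc = get_dimension_idx(DataLayout::NHWC, DataLayoutDimension::CHANNEL); // 0
    const size_t h_nhwc = get_dimension_idx(DataLayout::NHWC, DataLayoutDimension::HEIGHT);  // 2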
diff --git a/src/graph/backends/CL/CLDeviceBackend.cpp b/src/graph/backends/CL/CLDeviceBackend.cpp
index ae7f0a5..0666ec0 100644
--- a/src/graph/backends/CL/CLDeviceBackend.cpp
+++ b/src/graph/backends/CL/CLDeviceBackend.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -81,6 +81,11 @@
     _tuner.set_tune_new_kernels(enable_tuning);
 }
 
+void CLDeviceBackend::set_kernel_tuning_mode(CLTunerMode tuning_mode)
+{
+    _tuner.set_tuner_mode(tuning_mode);
+}
+
 void CLDeviceBackend::initialize_backend()
 {
     // Setup Scheduler
@@ -118,6 +123,7 @@
     }
 
     set_kernel_tuning(ctx.config().use_tuner);
+    set_kernel_tuning_mode(ctx.config().tuner_mode);
 
     // Setup a management backend
     if(ctx.memory_management_ctx(Target::CL) == nullptr)
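Note: a sketch of driving the new tuning-mode plumbing from the graph configuration; it assumes only the two GraphConfig fields referenced above (use_tuner, tuner_mode) and the CLTunerMode enumerators:

    GraphConfig config{};
    config.use_tuner  = true;
    config.tuner_mode = CLTunerMode::RAPID; // EXHAUSTIVE / NORMAL / RAPID
    // setup_backend_context() then forwards both values to the CL tuner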
diff --git a/src/graph/backends/CL/CLFunctionsFactory.cpp b/src/graph/backends/CL/CLFunctionsFactory.cpp
index b9e3ddc..90c1613 100644
--- a/src/graph/backends/CL/CLFunctionsFactory.cpp
+++ b/src/graph/backends/CL/CLFunctionsFactory.cpp
@@ -40,7 +40,8 @@
 /** Target specific information structure used to pass information to the layer templates */
 struct CLTargetInfo
 {
-    using TensorType = arm_compute::ICLTensor;
+    using TensorType         = arm_compute::ICLTensor;
+    using TensorConcreteType = CLTensor;
     static Target TargetType;
 };
 
@@ -69,6 +70,14 @@
     using Subtraction    = CLArithmeticSubtraction;
     using Multiplication = CLPixelWiseMultiplication;
 };
+
+/** Function and tensor types to be used inside a CL fused convolution/batch normalization layer */
+struct CLFusedLayerTypes
+{
+    using ConvolutionLayer       = CLConvolutionLayer;
+    using FuseBatchNormalization = CLFuseBatchNormalization;
+};
+
 // TODO (isagot01): Remove once we support heterogeneous scheduling at function level
 /** Wrapper for the CPP Function in the OpenCL backend **/
 class CPPWrapperFunction : public IFunction
@@ -192,6 +201,8 @@
             return detail::create_flatten_layer<CLFlattenLayer, CLTargetInfo>(*polymorphic_downcast<FlattenLayerNode *>(node));
         case NodeType::FullyConnectedLayer:
             return detail::create_fully_connected_layer<CLFullyConnectedLayer, CLTargetInfo>(*polymorphic_downcast<FullyConnectedLayerNode *>(node), ctx);
+        case NodeType::FusedConvolutionBatchNormalizationLayer:
+            return detail::create_fused_convolution_batch_normalization_layer<CLFusedLayerTypes, CLTargetInfo>(*polymorphic_downcast<FusedConvolutionBatchNormalizationNode *>(node));
         case NodeType::GenerateProposalsLayer:
             return detail::create_generate_proposals_layer<CLGenerateProposalsLayer, CLTargetInfo>(*polymorphic_downcast<GenerateProposalsLayerNode *>(node), ctx);
         case NodeType::NormalizationLayer:
@@ -218,6 +229,8 @@
             return detail::create_slice_layer<CLSlice, CLTargetInfo>(*polymorphic_downcast<SliceLayerNode *>(node));
         case NodeType::SoftmaxLayer:
             return detail::create_softmax_layer<CLSoftmaxLayer, CLTargetInfo>(*polymorphic_downcast<SoftmaxLayerNode *>(node), ctx);
+        case NodeType::StackLayer:
+            return detail::create_stack_layer<CLStackLayer, CLTargetInfo>(*polymorphic_downcast<StackLayerNode *>(node));
         case NodeType::UpsampleLayer:
             return detail::create_upsample_layer<CLUpsampleLayer, CLTargetInfo>(*polymorphic_downcast<UpsampleLayerNode *>(node), ctx);
         case NodeType::YOLOLayer:
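Note: TensorConcreteType exists so templated creators (such as the fused conv/BN one) can own backend tensors directly rather than going through the interface type; illustratively:

    CLTargetInfo::TensorConcreteType scratch;          // concretely a CLTensor
    CLTargetInfo::TensorType        *iface = &scratch; // still usable as an ICLTensor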
diff --git a/src/graph/backends/CL/CLNodeValidator.cpp b/src/graph/backends/CL/CLNodeValidator.cpp
index 4b71837..cb8dc0a 100644
--- a/src/graph/backends/CL/CLNodeValidator.cpp
+++ b/src/graph/backends/CL/CLNodeValidator.cpp
@@ -74,6 +74,8 @@
             return detail::validate_priorbox_layer<CLPriorBoxLayer>(*polymorphic_downcast<PriorBoxLayerNode *>(node));
         case NodeType::ReorgLayer:
             return detail::validate_reorg_layer<CLReorgLayer>(*polymorphic_downcast<ReorgLayerNode *>(node));
+        case NodeType::ReshapeLayer:
+            return detail::validate_reshape_layer<CLReshapeLayer>(*polymorphic_downcast<ReshapeLayerNode *>(node));
         case NodeType::ROIAlignLayer:
             return detail::validate_roi_align_layer<CLROIAlignLayer>(*polymorphic_downcast<ROIAlignLayerNode *>(node));
         case NodeType::SliceLayer:
diff --git a/src/graph/backends/NEON/NEFunctionFactory.cpp b/src/graph/backends/NEON/NEFunctionFactory.cpp
index dc987dd..690a311 100644
--- a/src/graph/backends/NEON/NEFunctionFactory.cpp
+++ b/src/graph/backends/NEON/NEFunctionFactory.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,7 +46,8 @@
 /** Target specific information structure used to pass information to the layer templates */
 struct NETargetInfo
 {
-    using TensorType = arm_compute::ITensor;
+    using TensorType         = arm_compute::ITensor;
+    using TensorConcreteType = arm_compute::Tensor;
     static Target TargetType;
 };
 
@@ -76,6 +77,13 @@
     using Multiplication = NEPixelWiseMultiplication;
 };
 
+/** Function and tensor types to be used inside a NEON fused convolution/batch normalization layer */
+struct NEFusedLayerTypes
+{
+    using ConvolutionLayer       = NEConvolutionLayer;
+    using FuseBatchNormalization = NEFuseBatchNormalization;
+};
+
 namespace detail
 {
 // Specialized functions
@@ -135,8 +143,10 @@
             << " Weights QuantInfo: " << weights->info()->quantization_info()
             << " Output QuantInfo: " << output->info()->quantization_info();
     }
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << func_name
-                               << " Target " << NETargetInfo::TargetType
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
+                               << node.name()
+                               << " Type: " << func_name
+                               << " Target: " << NETargetInfo::TargetType
                                << " Data Type: " << input->info()->data_type()
                                << qss.str()
                                << " Input shape: " << input->info()->tensor_shape()
@@ -210,6 +220,8 @@
             return detail::create_flatten_layer<NEFlattenLayer, NETargetInfo>(*polymorphic_downcast<FlattenLayerNode *>(node));
         case NodeType::FullyConnectedLayer:
             return detail::create_fully_connected_layer<NEFullyConnectedLayer, NETargetInfo>(*polymorphic_downcast<FullyConnectedLayerNode *>(node), ctx);
+        case NodeType::FusedConvolutionBatchNormalizationLayer:
+            return detail::create_fused_convolution_batch_normalization_layer<NEFusedLayerTypes, NETargetInfo>(*polymorphic_downcast<FusedConvolutionBatchNormalizationNode *>(node));
         case NodeType::NormalizationLayer:
             return detail::create_normalization_layer<NENormalizationLayer, NETargetInfo>(*polymorphic_downcast<NormalizationLayerNode *>(node), ctx);
         case NodeType::PermuteLayer:
@@ -226,6 +238,8 @@
             return detail::create_resize_layer<NEScale, NETargetInfo>(*polymorphic_downcast<ResizeLayerNode *>(node));
         case NodeType::SoftmaxLayer:
             return detail::create_softmax_layer<NESoftmaxLayer, NETargetInfo>(*polymorphic_downcast<SoftmaxLayerNode *>(node), ctx);
+        case NodeType::StackLayer:
+            return detail::create_stack_layer<NEStackLayer, NETargetInfo>(*polymorphic_downcast<StackLayerNode *>(node));
         case NodeType::UpsampleLayer:
             return detail::create_upsample_layer<NEUpsampleLayer, NETargetInfo>(*polymorphic_downcast<UpsampleLayerNode *>(node), ctx);
         case NodeType::YOLOLayer:
diff --git a/src/graph/backends/NEON/NENodeValidator.cpp b/src/graph/backends/NEON/NENodeValidator.cpp
index b0feec5..77f2e7f 100644
--- a/src/graph/backends/NEON/NENodeValidator.cpp
+++ b/src/graph/backends/NEON/NENodeValidator.cpp
@@ -74,6 +74,8 @@
             return detail::validate_priorbox_layer<NEPriorBoxLayer>(*polymorphic_downcast<PriorBoxLayerNode *>(node));
         case NodeType::ReorgLayer:
             return detail::validate_reorg_layer<NEReorgLayer>(*polymorphic_downcast<ReorgLayerNode *>(node));
+        case NodeType::ReshapeLayer:
+            return detail::validate_reshape_layer<NEReshapeLayer>(*polymorphic_downcast<ReshapeLayerNode *>(node));
         case NodeType::ROIAlignLayer:
             return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : ROIAlignLayer");
         case NodeType::SliceLayer:
diff --git a/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp b/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp
index 7fc5ca0..5e31309 100644
--- a/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp
+++ b/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -136,7 +136,7 @@
             // Then add it to the list of transition buffers
             ITensorHandle *tensor_handle = input_edge->tensor()->handle()->parent_handle();
             IMemoryGroup *mm_group      = get_memory_group_from_handle(ctx, tensor_handle);
-            transition_handles.input_handles.push_back(std::make_pair(tensor_handle, mm_group));
+            transition_handles.input_handles.emplace_back(tensor_handle, mm_group);
         }
     }
 
@@ -149,7 +149,7 @@
         {
             ITensorHandle *tensor_handle = output_tensor->handle()->parent_handle();
             IMemoryGroup *mm_group      = get_memory_group_from_handle(ctx, tensor_handle);
-            transition_handles.output_handles.push_back(std::make_pair(tensor_handle, mm_group));
+            transition_handles.output_handles.emplace_back(tensor_handle, mm_group);
         }
     }
 
diff --git a/src/graph/detail/ExecutionHelpers.cpp b/src/graph/detail/ExecutionHelpers.cpp
index 767154b..900be42 100644
--- a/src/graph/detail/ExecutionHelpers.cpp
+++ b/src/graph/detail/ExecutionHelpers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -204,10 +204,13 @@
 
 bool call_all_input_node_accessors(ExecutionWorkload &workload)
 {
-    return !std::any_of(std::begin(workload.inputs), std::end(workload.inputs), [](Tensor * input_tensor)
+    bool is_valid = true;
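+    // Visit every input deliberately: unlike std::any_of, std::for_each does not short-circuit, so each accessor still runs after an earlier failure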
+    std::for_each(std::begin(workload.inputs), std::end(workload.inputs), [&](Tensor * input_tensor)
     {
-        return (input_tensor == nullptr) || !input_tensor->call_accessor();
+        bool valid_input = (input_tensor != nullptr) && input_tensor->call_accessor();
+        is_valid         = is_valid && valid_input;
     });
+    return is_valid;
 }
 
 void prepare_all_tasks(ExecutionWorkload &workload)
diff --git a/src/graph/mutators/DepthConcatSubTensorMutator.cpp b/src/graph/mutators/DepthConcatSubTensorMutator.cpp
index a170c4d..7994541 100644
--- a/src/graph/mutators/DepthConcatSubTensorMutator.cpp
+++ b/src/graph/mutators/DepthConcatSubTensorMutator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -62,18 +62,19 @@
             // Get output tensor
             auto output_tensor = node->output(0);
 
-            // Check concatenation axis (Sub-tensor optimization is support for concatenation axis >=2)
+            // Check concatenation axis (Sub-tensor optimization is supported for concatenation axis >=2)
             auto *concat_node = arm_compute::utils::cast::polymorphic_downcast<ConcatenateLayerNode *>(node);
-            if(output_tensor == nullptr || get_dimension_idx(output_tensor->desc(), concat_node->concatenation_axis()) < 2)
+            if(output_tensor == nullptr || get_dimension_idx(output_tensor->desc().layout, concat_node->concatenation_axis()) < 2)
             {
                 continue;
             }
 
-            // Check that all tensor have the same target and valid inputs
+            // Check that all tensors have the same target, valid inputs and matching quantization info
             bool is_valid = std::all_of(node->input_edges().cbegin(), node->input_edges().cend(),
                                         [&](const EdgeID & eid)
             {
-                return (g.edge(eid) != nullptr) && (g.edge(eid)->tensor() != nullptr) && (g.edge(eid)->tensor()->desc().target == output_tensor->desc().target);
+                return (g.edge(eid) != nullptr) && (g.edge(eid)->tensor() != nullptr) && (g.edge(eid)->tensor()->desc().target == output_tensor->desc().target)
+                       && (g.edge(eid)->tensor()->desc().quant_info == output_tensor->desc().quant_info);
             });
 
             // Create subtensors
diff --git a/src/graph/mutators/GroupedConvolutionMutator.cpp b/src/graph/mutators/GroupedConvolutionMutator.cpp
index d69d2cd..3d53f49 100644
--- a/src/graph/mutators/GroupedConvolutionMutator.cpp
+++ b/src/graph/mutators/GroupedConvolutionMutator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -47,12 +47,12 @@
 
     // Split input
     const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]);
-    const unsigned int     input_idx         = get_dimension_idx(input_tensor_desc, DataLayoutDimension::CHANNEL);
+    const unsigned int     input_idx         = get_dimension_idx(input_tensor_desc.layout, DataLayoutDimension::CHANNEL);
     NodeID                 input_split       = GraphBuilder::add_split_node(g, params, input, num_groups, input_idx);
 
     // Split weights
     const TensorDescriptor weights_tensor_desc = get_tensor_descriptor(g, g.node(weights)->outputs()[0]);
-    const unsigned int     batch_idx           = get_dimension_idx(weights_tensor_desc, DataLayoutDimension::BATCHES);
+    const unsigned int     batch_idx           = get_dimension_idx(weights_tensor_desc.layout, DataLayoutDimension::BATCHES);
     NodeID                 weights_split       = GraphBuilder::add_split_node(g, params, { weights, 0 }, num_groups, batch_idx);
 
     // Split bias
diff --git a/src/graph/mutators/InPlaceOperationMutator.cpp b/src/graph/mutators/InPlaceOperationMutator.cpp
index 31921b3..1c2985d 100644
--- a/src/graph/mutators/InPlaceOperationMutator.cpp
+++ b/src/graph/mutators/InPlaceOperationMutator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -56,8 +56,8 @@
 
                 ARM_COMPUTE_ERROR_ON(current_output_tensor == nullptr || new_output_tensor == nullptr);
 
-                // Prevent in-place operation if there is an accessor bound to the in-place tensor
-                if(new_output_tensor->accessor() == nullptr)
+                // Prevent in-place operation if there is an accessor bound to the in-place tensor or if the quantization info differs
+                if(new_output_tensor->accessor() == nullptr && current_output_tensor->desc().quant_info == new_output_tensor->desc().quant_info)
                 {
                     ARM_COMPUTE_LOG_GRAPH_VERBOSE("Switching to in-place computation for the node with ID : "
                                                   << node->id() << " and name : " << node->name() << std::endl);
diff --git a/src/graph/mutators/NodeFusionMutator.cpp b/src/graph/mutators/NodeFusionMutator.cpp
index 9dc02d1..427d7b5 100644
--- a/src/graph/mutators/NodeFusionMutator.cpp
+++ b/src/graph/mutators/NodeFusionMutator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,9 +23,11 @@
  */
 #include "arm_compute/graph/mutators/NodeFusionMutator.h"
 
-#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/GraphBuilder.h"
 #include "arm_compute/graph/Logger.h"
 #include "arm_compute/graph/Utils.h"
+#include "arm_compute/graph/backends/BackendRegistry.h"
+#include "arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h"
 #include "arm_compute/graph/nodes/Nodes.h"
 
 #include "arm_compute/core/utils/misc/Cast.h"
@@ -38,69 +40,156 @@
 {
 namespace detail
 {
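+/** Fuses a convolution node with the batch normalization node fed by its single output, replacing the pair with a FusedConvolutionBatchNormalizationNode */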
+void fuse_convolution_with_batch_normalization(Graph &g, const Edge *output_edge)
+{
+    ARM_COMPUTE_ERROR_ON(output_edge == nullptr);
+
+    auto *conv_node = arm_compute::utils::cast::polymorphic_downcast<ConvolutionLayerNode *>(output_edge->producer());
+    auto *bn_node   = arm_compute::utils::cast::polymorphic_downcast<BatchNormalizationLayerNode *>(output_edge->consumer());
+
+    // Do not fuse if the number of groups is greater than 1
+    if(conv_node->num_groups() > 1)
+    {
+        return;
+    }
+
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing convolution node with ID : " << output_edge->producer_id()
+                                  << " with BatchNormalization Layer node with ID : " << output_edge->consumer_id() << std::endl);
+
+    // Prevent fusion if fused node has an output accessor
+    if(conv_node->output(0)->accessor() == nullptr)
+    {
+        const Target assigned_target = conv_node->assigned_target();
+
+        // Extract conv inputs
+        const auto   conv_input_id   = conv_node->input_edge(0)->producer_id();
+        const auto   conv_weights_id = conv_node->input_edge(1)->producer_id();
+        const auto   out_quant_info  = conv_node->output(0)->desc().quant_info;
+        const auto   conv_info       = conv_node->convolution_info();
+        const auto   conv_method     = conv_node->convolution_method();
+        const auto   num_groups      = conv_node->num_groups();
+        const auto   act_info        = bn_node->fused_activation();
+        FastMathHint fast_math_hint  = conv_node->fast_math_hint();
+
+        // Extract bn inputs
+        const auto bn_mean_id  = bn_node->input_edge(1)->producer_id();
+        const auto bn_var_id   = bn_node->input_edge(2)->producer_id();
+        const auto bn_beta_id  = bn_node->input_edge(3)->producer_id();
+        const auto bn_gamma_id = bn_node->input_edge(4)->producer_id();
+        const auto epsilon     = bn_node->epsilon();
+
+        // Create the fused node
+        const NodeID fused_id = g.add_node<FusedConvolutionBatchNormalizationNode>(epsilon, conv_info, num_groups, conv_method, fast_math_hint, out_quant_info, act_info);
+
+        if(conv_node->input_edge(2) != nullptr)
+        {
+            auto conv_bias_id = conv_node->input_edge(2)->producer_id();
+            g.add_connection(conv_bias_id, 0, fused_id, 2);
+        }
+
+        // Add connections from the conv/batch_norm inputs to the fused node
+        g.add_connection(conv_input_id, 0, fused_id, 0);
+        g.add_connection(conv_weights_id, 0, fused_id, 1);
+        g.add_connection(bn_mean_id, 0, fused_id, 3);
+        g.add_connection(bn_var_id, 0, fused_id, 4);
+        g.add_connection(bn_beta_id, 0, fused_id, 5);
+        g.add_connection(bn_gamma_id, 0, fused_id, 6);
+
+        auto                     fused_node       = g.node(fused_id);
+        std::vector<NodeIdxPair> bn_driving_nodes = get_driving_nodes(*bn_node);
+
+        // Extract batch normalization node accessor if any
+        auto bn_node_accessor = bn_node->output(0)->extract_accessor();
+        auto bn_node_name     = bn_node->name();
+
+        // Remove batch normalization node
+        g.remove_node(bn_node->id());
+
+        // Reconnect the driving nodes of the batch normalization node to the fused node
+        for(auto &driving_node : bn_driving_nodes)
+        {
+            g.add_connection(fused_id, 0, driving_node.node_id, driving_node.index);
+            configure_tensor(fused_node->output(0));
+        }
+        // Update fused node outputs
+        fused_node->output(0)->set_accessor(std::move(bn_node_accessor));
+        fused_node->set_assigned_target(assigned_target);
+        fused_node->set_common_node_parameters(NodeParams{ conv_node->name() + "+" + bn_node_name, assigned_target });
+
+        // Remove convolution node
+        g.remove_node(conv_node->id());
+    }
+    else
+    {
+        ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution with batch normalization due to the presence of an output accessor\n");
+    }
+}
+
 template <typename N>
-void fuse_node_with_activation(Graph                              &g,
-                               const std::set<Activation>         &supported_fused_activations,
-                               std::function<bool(INode &)> const &prec)
+void fuse_node_with_activation(Graph &g, const Edge *output_edge, const std::set<Activation> &supported_fused_activations)
+{
+    ARM_COMPUTE_ERROR_ON(output_edge == nullptr);
+
+    auto *n_node   = arm_compute::utils::cast::polymorphic_downcast<N *>(output_edge->producer());
+    auto *act_node = arm_compute::utils::cast::polymorphic_downcast<ActivationLayerNode *>(output_edge->consumer());
+
+    ARM_COMPUTE_ERROR_ON(act_node->output(0) == nullptr || n_node->output(0) == nullptr);
+
+    // Check if activation is supported for fusion
+    if(supported_fused_activations.count(act_node->activation_info().activation()) == 0)
+    {
+        return;
+    }
+
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing node with ID : " << output_edge->producer_id()
+                                  << " with Activation Layer node with ID : " << output_edge->consumer_id() << std::endl);
+
+    // Prevent fusion if fused node has an output accessor
+    if(n_node->output(0)->accessor() == nullptr)
+    {
+        // Get driving nodes of activation node
+        std::vector<NodeIdxPair> act_driving_nodes = get_driving_nodes(*act_node);
+
+        // Set activation info to fused node
+        n_node->set_fused_activation(act_node->activation_info());
+
+        // Extract activation node accessor if any
+        auto act_node_accessor = act_node->output(0)->extract_accessor();
+
+        // Remove activation node
+        g.remove_node(act_node->id());
+
+        // Update fused node outputs
+        for(auto &driving_node : act_driving_nodes)
+        {
+            g.add_connection(n_node->id(), 0, driving_node.node_id, driving_node.index);
+        }
+
+        // Update accessor to fused node
+        n_node->output(0)->set_accessor(std::move(act_node_accessor));
+    }
+    else
+    {
+        ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of node with activation due to the presence of an output accessor\n");
+    }
+}
+
+template <typename N1, typename N2, typename F, typename... Args>
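+// Generic fusion driver: for every non-branching node of type N1 whose single consumer is of type N2, and which satisfies prec, invoke fuse_fcn on the connecting edge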
+void fuse_layer(Graph &g, std::function<bool(INode &)> const &prec, const F fuse_fcn, Args &&... optional_arguments)
 {
     // Not interested in the order of nodes
     for(auto &node : g.nodes())
     {
         // Check if the node is of type N and not a branching node
-        if(node && node->type() == N::node_type && node->output_edges().size() == 1)
+        if(node && node->type() == N1::node_type && node->output_edges().size() == 1)
         {
-            auto output_edge_id = *node->output_edges().begin();
-            auto output_edge    = g.edge(output_edge_id);
+            const auto output_edge_id = *node->output_edges().begin();
+            const auto output_edge    = g.edge(output_edge_id);
+
             // Check if following node is an activation layer node
-            if((output_edge != nullptr) && (output_edge->consumer() != nullptr) && (output_edge->consumer()->type() == NodeType::ActivationLayer))
+            if((output_edge != nullptr) && (output_edge->consumer() != nullptr) && (output_edge->consumer()->type() == N2::node_type) && prec(*output_edge->producer()))
             {
-                auto *n_node   = arm_compute::utils::cast::polymorphic_downcast<N *>(output_edge->producer());
-                auto *act_node = arm_compute::utils::cast::polymorphic_downcast<ActivationLayerNode *>(output_edge->consumer());
-
-                ARM_COMPUTE_ERROR_ON(act_node->output(0) == nullptr || n_node->output(0) == nullptr);
-
-                // Check given precondition
-                if(!prec(*n_node))
-                {
-                    continue;
-                }
-                // Check if activation is supported for fusion
-                if(supported_fused_activations.count(act_node->activation_info().activation()) == 0)
-                {
-                    continue;
-                }
-
-                ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing node with ID : " << output_edge->producer_id()
-                                              << " with Activation Layer node with ID : " << output_edge->consumer_id() << std::endl);
-
-                // Prevent fusion if fused node has an output accessor
-                if(n_node->output(0)->accessor() == nullptr)
-                {
-                    // Get driving nodes of activation node
-                    std::vector<NodeIdxPair> act_driving_nodes = get_driving_nodes(*act_node);
-
-                    // Set activation info to fused node
-                    n_node->set_fused_activation(act_node->activation_info());
-
-                    // Extract activation node accessor if any
-                    auto act_node_accessor = act_node->output(0)->extract_accessor();
-
-                    // Remove activation node
-                    g.remove_node(act_node->id());
-
-                    // Update fused node outputs
-                    for(auto &driving_node : act_driving_nodes)
-                    {
-                        g.add_connection(n_node->id(), 0, driving_node.node_id, driving_node.index);
-                    }
-
-                    // Update accessor to fused node
-                    n_node->output(0)->set_accessor(std::move(act_node_accessor));
-                }
-                else
-                {
-                    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of node with activation due to the presence of an output accessor\n");
-                }
+                fuse_fcn(g, output_edge, optional_arguments...);
             }
         }
     }
@@ -118,20 +207,30 @@
     const std::set<Activation> supported_fused_activations = { Activation::RELU, Activation::BOUNDED_RELU, Activation::LU_BOUNDED_RELU };
 
     // Preconditions
-    auto empty_prec = [](INode & n)
+    auto empty_prec = [](INode &)
     {
         return true;
     };
-    auto qs8_prec = [](INode & n)
+    auto qs8_prec = [&g](INode & n)
     {
         ARM_COMPUTE_ERROR_ON(n.output(0) == nullptr);
-        return n.output(0)->desc().data_type == DataType::QASYMM8;
+
+        const auto output_edge_id = *n.output_edges().begin();
+        const auto output_edge    = g.edge(output_edge_id);
+        // To perform fusion the two nodes must have the same output quantization information
+        const bool same_qinfo     = n.output(0)->desc().quant_info == output_edge->consumer()->output(0)->desc().quant_info;
+        const bool output_qasymm8 = n.output(0)->desc().data_type == DataType::QASYMM8;
+
+        return (output_qasymm8 && same_qinfo) || !output_qasymm8;
     };
 
     // Fusion mutations
-    detail::fuse_node_with_activation<BatchNormalizationLayerNode>(g, supported_fused_activations, empty_prec);
-    detail::fuse_node_with_activation<ConvolutionLayerNode>(g, supported_fused_activations, empty_prec);
-    detail::fuse_node_with_activation<DepthwiseConvolutionLayerNode>(g, supported_fused_activations, qs8_prec);
+    detail::fuse_layer<BatchNormalizationLayerNode, ActivationLayerNode>(g, empty_prec, detail::fuse_node_with_activation<BatchNormalizationLayerNode>, supported_fused_activations);
+    detail::fuse_layer<ConvolutionLayerNode, ActivationLayerNode>(g, empty_prec, detail::fuse_node_with_activation<ConvolutionLayerNode>, supported_fused_activations);
+    detail::fuse_layer<DepthwiseConvolutionLayerNode, ActivationLayerNode>(g, qs8_prec, detail::fuse_node_with_activation<DepthwiseConvolutionLayerNode>, supported_fused_activations);
+
+    // TODO (COMPMID-2055): re-enable once we fuse bias and activations to convolution
+    // detail::fuse_layer<ConvolutionLayerNode, BatchNormalizationLayerNode>(g, empty_prec, detail::fuse_convolution_with_batch_normalization);
 }
 } // namespace graph
 } // namespace arm_compute
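Note: for reference, the algebra the fused node ultimately realizes is the standard batch-normalization folding (not code from this patch). With s = gamma / sqrt(var + epsilon):

    w_fused = w * s
    b_fused = (b - mean) * s + beta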
diff --git a/src/graph/nodes/ActivationLayerNode.cpp b/src/graph/nodes/ActivationLayerNode.cpp
index 414684c..ada6cf9 100644
--- a/src/graph/nodes/ActivationLayerNode.cpp
+++ b/src/graph/nodes/ActivationLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,8 +30,8 @@
 {
 namespace graph
 {
-ActivationLayerNode::ActivationLayerNode(ActivationLayerInfo info)
-    : _info(info)
+ActivationLayerNode::ActivationLayerNode(ActivationLayerInfo info, QuantizationInfo out_quant_info)
+    : _info(info), _out_quant_info(out_quant_info)
 {
     _input_edges.resize(1, EmptyEdgeID);
     _outputs.resize(1, NullTensorID);
@@ -62,12 +62,18 @@
     const Tensor *src = input(0);
     ARM_COMPUTE_ERROR_ON(src == nullptr);
 
-    return src->desc();
+    TensorDescriptor output_info = src->desc();
+    if(!_out_quant_info.empty())
+    {
+        output_info.quant_info = _out_quant_info;
+    }
+
+    return output_info;
 }
 
 NodeType ActivationLayerNode::type() const
 {
-    return NodeType::ActivationLayer;
+    return ActivationLayerNode::node_type;
 }
 
 void ActivationLayerNode::accept(INodeVisitor &v)
@@ -75,4 +81,4 @@
     v.visit(*this);
 }
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/nodes/ConcatenateLayerNode.cpp b/src/graph/nodes/ConcatenateLayerNode.cpp
index ade3f6e..5f13b90 100644
--- a/src/graph/nodes/ConcatenateLayerNode.cpp
+++ b/src/graph/nodes/ConcatenateLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,8 +34,8 @@
 {
 namespace graph
 {
-ConcatenateLayerNode::ConcatenateLayerNode(unsigned int total_nodes, DataLayoutDimension axis)
-    : _total_nodes(total_nodes), _axis(axis), _is_enabled(true)
+ConcatenateLayerNode::ConcatenateLayerNode(unsigned int total_nodes, descriptors::ConcatLayerDescriptor concat_descriptor)
+    : _total_nodes(total_nodes), _concat_descriptor(std::move(concat_descriptor)), _is_enabled(true)
 {
     _input_edges.resize(_total_nodes, EmptyEdgeID);
     _outputs.resize(1, NullTensorID);
@@ -53,7 +53,12 @@
 
 DataLayoutDimension ConcatenateLayerNode::concatenation_axis() const
 {
-    return _axis;
+    return _concat_descriptor.axis;
+}
+
+QuantizationInfo ConcatenateLayerNode::output_quantization_info() const
+{
+    return _concat_descriptor.output_qinfo;
 }
 
 TensorDescriptor ConcatenateLayerNode::compute_output_descriptor(const std::vector<TensorDescriptor> &input_descriptors,
@@ -62,28 +67,18 @@
     ARM_COMPUTE_ERROR_ON(input_descriptors.size() == 0);
 
     TensorDescriptor output_descriptor = input_descriptors[0];
-    const int        axis_idx          = get_dimension_idx(output_descriptor, axis);
+    const int        axis_idx          = get_dimension_idx(output_descriptor.layout, axis);
+    ARM_COMPUTE_ERROR_ON_MSG(axis_idx > 2, "Unsupported concatenation axis!");
 
     // Extract shapes
     std::vector<const TensorShape *> shapes;
+    shapes.reserve(input_descriptors.size());
     for(auto &input_descriptor : input_descriptors)
     {
         shapes.emplace_back(&input_descriptor.shape);
     }
 
-    // Calculate output shape
-    if(axis_idx == 0)
-    {
-        output_descriptor.shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(shapes);
-    }
-    else if(axis_idx == 2)
-    {
-        output_descriptor.shape = arm_compute::misc::shape_calculator::calculate_depth_concatenate_shape(shapes);
-    }
-    else
-    {
-        ARM_COMPUTE_ERROR("Unsupported concatenation axis!");
-    }
+    output_descriptor.shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(shapes, axis_idx);
 
     return output_descriptor;
 }
@@ -122,7 +117,11 @@
             ARM_COMPUTE_ERROR_ON(t == nullptr);
             inputs_descriptors.push_back(t->desc());
         }
-        output_info = compute_output_descriptor(inputs_descriptors, _axis);
+        output_info = compute_output_descriptor(inputs_descriptors, _concat_descriptor.axis);
+        if(!_concat_descriptor.output_qinfo.empty())
+        {
+            output_info.quant_info = _concat_descriptor.output_qinfo;
+        }
     }
 
     return output_info;
@@ -138,4 +137,4 @@
     v.visit(*this);
 }
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
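Note: a sketch of the generalized shape computation, assuming desc0 and desc1 are NHWC descriptors of shape [C=64, W=56, H=56, N=1]:

    std::vector<TensorDescriptor> in  = { desc0, desc1 };
    TensorDescriptor              out = ConcatenateLayerNode::compute_output_descriptor(in, DataLayoutDimension::CHANNEL);
    // out.shape == [128, 56, 56, 1]; channel maps to index 0 in NHWC, and batch
    // concatenation (axis_idx 3) is now rejected up front by the new assert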
diff --git a/src/graph/nodes/ConvolutionLayerNode.cpp b/src/graph/nodes/ConvolutionLayerNode.cpp
index 15c7ff6..1c8dcae 100644
--- a/src/graph/nodes/ConvolutionLayerNode.cpp
+++ b/src/graph/nodes/ConvolutionLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -97,10 +97,11 @@
 
     std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info);
 
+    const DataLayout data_layout       = input_descriptor.layout;
     TensorDescriptor output_descriptor = input_descriptor;
-    output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::WIDTH), output_width);
-    output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::HEIGHT), output_height);
-    output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::CHANNEL), weights_descriptor.shape[3]);
+    output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), output_width);
+    output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), output_height);
+    output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), weights_descriptor.shape[3]);
 
     return output_descriptor;
 }
diff --git a/src/graph/nodes/DeconvolutionLayerNode.cpp b/src/graph/nodes/DeconvolutionLayerNode.cpp
index e7ccffd..b1a6db7 100644
--- a/src/graph/nodes/DeconvolutionLayerNode.cpp
+++ b/src/graph/nodes/DeconvolutionLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -66,10 +66,11 @@
                                                                             info.pad().first, info.pad().second,
                                                                             info.stride().first, info.stride().second);
 
+    const DataLayout data_layout       = input_descriptor.layout;
     TensorDescriptor output_descriptor = input_descriptor;
-    output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::WIDTH), output_width);
-    output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::HEIGHT), output_height);
-    output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::CHANNEL), weights_descriptor.shape[3]);
+    output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), output_width);
+    output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), output_height);
+    output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), weights_descriptor.shape[3]);
 
     return output_descriptor;
 }
diff --git a/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp b/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp
index 75ca5f4..cdd9e7b 100644
--- a/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp
+++ b/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,8 +32,9 @@
 {
 namespace graph
 {
-DepthwiseConvolutionLayerNode::DepthwiseConvolutionLayerNode(PadStrideInfo info, int depth_multiplier, DepthwiseConvolutionMethod method)
-    : _info(std::move(info)), _depth_multiplier(depth_multiplier), _method(method), _fused_activation()
+DepthwiseConvolutionLayerNode::DepthwiseConvolutionLayerNode(PadStrideInfo info, int depth_multiplier, DepthwiseConvolutionMethod method,
+                                                             QuantizationInfo out_quant_info)
+    : _info(std::move(info)), _depth_multiplier(depth_multiplier), _method(method), _out_quant_info(out_quant_info), _fused_activation()
 {
     _input_edges.resize(3, EmptyEdgeID);
     _outputs.resize(1, NullTensorID);
@@ -85,10 +86,11 @@
 
     std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info);
 
+    const DataLayout data_layout       = input_descriptor.layout;
     TensorDescriptor output_descriptor = input_descriptor;
-    output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::WIDTH), output_width);
-    output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::HEIGHT), output_height);
-    output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::CHANNEL), input_channels * depth_multiplier);
+    output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), output_width);
+    output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), output_height);
+    output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), input_channels * depth_multiplier);
 
     return output_descriptor;
 }
@@ -113,7 +115,13 @@
 
     ARM_COMPUTE_ERROR_ON(src == nullptr || weights == nullptr);
 
-    return compute_output_descriptor(src->desc(), weights->desc(), _info, _depth_multiplier);
+    TensorDescriptor output_info = compute_output_descriptor(src->desc(), weights->desc(), _info, _depth_multiplier);
+    if(!_out_quant_info.empty())
+    {
+        output_info.quant_info = _out_quant_info;
+    }
+
+    return output_info;
 }
 
 NodeType DepthwiseConvolutionLayerNode::type() const
diff --git a/src/graph/nodes/FusedConvolutionBatchNormalizationNode.cpp b/src/graph/nodes/FusedConvolutionBatchNormalizationNode.cpp
new file mode 100644
index 0000000..c304a6c
--- /dev/null
+++ b/src/graph/nodes/FusedConvolutionBatchNormalizationNode.cpp
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+#include "arm_compute/graph/Utils.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+FusedConvolutionBatchNormalizationNode::FusedConvolutionBatchNormalizationNode(float epsilon, PadStrideInfo info,
+                                                                               unsigned int      num_groups,
+                                                                               ConvolutionMethod method,
+                                                                               FastMathHint      fast_math_hint,
+                                                                               QuantizationInfo out_quant_info, ActivationLayerInfo fused_activation)
+    : _epsilon(epsilon), _info(std::move(info)), _num_groups(num_groups), _method(method), _fast_math_hint(fast_math_hint), _out_quant_info(out_quant_info), _fused_activation(fused_activation)
+{
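+    // Seven input edges: input, weights, biases, plus the batch normalization mean, variance, beta and gamma.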
+    _input_edges.resize(7, EmptyEdgeID);
+    _outputs.resize(1, NullTensorID);
+}
+
+void FusedConvolutionBatchNormalizationNode::set_convolution_method(ConvolutionMethod method)
+{
+    _method = method;
+}
+
+float FusedConvolutionBatchNormalizationNode::epsilon() const
+{
+    return _epsilon;
+}
+
+ConvolutionMethod FusedConvolutionBatchNormalizationNode::convolution_method() const
+{
+    return _method;
+}
+
+void FusedConvolutionBatchNormalizationNode::set_fast_math_hint(FastMathHint hint)
+{
+    _fast_math_hint = hint;
+}
+
+FastMathHint FusedConvolutionBatchNormalizationNode::fast_math_hint() const
+{
+    return _fast_math_hint;
+}
+
+PadStrideInfo FusedConvolutionBatchNormalizationNode::convolution_info() const
+{
+    return _info;
+}
+
+unsigned int FusedConvolutionBatchNormalizationNode::num_groups() const
+{
+    return _num_groups;
+}
+
+ActivationLayerInfo FusedConvolutionBatchNormalizationNode::fused_activation() const
+{
+    return _fused_activation;
+}
+
+void FusedConvolutionBatchNormalizationNode::set_fused_activation(ActivationLayerInfo fused_activation)
+{
+    _fused_activation = fused_activation;
+}
+
+TensorDescriptor FusedConvolutionBatchNormalizationNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
+                                                                                   const TensorDescriptor &weights_descriptor,
+                                                                                   const PadStrideInfo    &info)
+{
+    unsigned int output_width  = 0;
+    unsigned int output_height = 0;
+
+    const unsigned int input_width   = get_dimension_size(input_descriptor, DataLayoutDimension::WIDTH);
+    const unsigned int input_height  = get_dimension_size(input_descriptor, DataLayoutDimension::HEIGHT);
+    const unsigned int kernel_width  = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH);
+    const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT);
+
+    std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info);
+
+    const DataLayout data_layout       = input_descriptor.layout;
+    TensorDescriptor output_descriptor = input_descriptor;
+    output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), output_width);
+    output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), output_height);
+    output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), weights_descriptor.shape[3]);
+
+    return output_descriptor;
+}
+
+bool FusedConvolutionBatchNormalizationNode::forward_descriptors()
+{
+    if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
+    {
+        Tensor *dst = output(0);
+        ARM_COMPUTE_ERROR_ON(dst == nullptr);
+        dst->desc() = configure_output(0);
+        return true;
+    }
+    return false;
+}
+
+TensorDescriptor FusedConvolutionBatchNormalizationNode::configure_output(size_t idx) const
+{
+    ARM_COMPUTE_UNUSED(idx);
+    const Tensor *src     = input(0);
+    const Tensor *weights = input(1);
+
+    ARM_COMPUTE_ERROR_ON(src == nullptr || weights == nullptr);
+
+    TensorDescriptor output_info = compute_output_descriptor(src->desc(), weights->desc(), _info);
+    if(!_out_quant_info.empty())
+    {
+        output_info.quant_info = _out_quant_info;
+    }
+
+    return output_info;
+}
+
+NodeType FusedConvolutionBatchNormalizationNode::type() const
+{
+    return FusedConvolutionBatchNormalizationNode::node_type;
+}
+
+void FusedConvolutionBatchNormalizationNode::accept(INodeVisitor &v)
+{
+    v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/nodes/PoolingLayerNode.cpp b/src/graph/nodes/PoolingLayerNode.cpp
index 26c145a..48b93c9 100644
--- a/src/graph/nodes/PoolingLayerNode.cpp
+++ b/src/graph/nodes/PoolingLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -57,9 +57,10 @@
 
     std::tie(pooled_width, pooled_height) = scaled_dimensions(input_width, input_height, pool_size_x, pool_size_y, info.pad_stride_info());
 
+    const DataLayout data_layout       = input_descriptor.layout;
     TensorDescriptor output_descriptor = input_descriptor;
-    output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::WIDTH), pooled_width);
-    output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::HEIGHT), pooled_height);
+    output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), pooled_width);
+    output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), pooled_height);
 
     return output_descriptor;
 }
diff --git a/src/graph/nodes/ReorgLayerNode.cpp b/src/graph/nodes/ReorgLayerNode.cpp
index 6b83f6b..21ad451 100644
--- a/src/graph/nodes/ReorgLayerNode.cpp
+++ b/src/graph/nodes/ReorgLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -53,10 +53,11 @@
     ARM_COMPUTE_ERROR_ON_MSG((input_width % stride != 0), "The width of the input tensor must be a multiple of stride");
     ARM_COMPUTE_ERROR_ON_MSG((input_height % stride != 0), "The height of the input tensor must be a multiple of stride");
 
+    const DataLayout data_layout       = input_descriptor.layout;
     TensorDescriptor output_descriptor = input_descriptor;
-    output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::WIDTH), input_width / stride);
-    output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::HEIGHT), input_height / stride);
-    output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::CHANNEL), input_channel * stride * stride);
+    output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), input_width / stride);
+    output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), input_height / stride);
+    output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), input_channel * stride * stride);
 
     return output_descriptor;
 }
diff --git a/src/graph/nodes/ResizeLayerNode.cpp b/src/graph/nodes/ResizeLayerNode.cpp
index a6aa7bf..a399229 100644
--- a/src/graph/nodes/ResizeLayerNode.cpp
+++ b/src/graph/nodes/ResizeLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -68,9 +68,10 @@
     const Tensor *src = input(0);
     ARM_COMPUTE_ERROR_ON(src == nullptr);
 
+    const DataLayout data_layout = src->desc().layout;
     TensorDescriptor output_desc = src->desc();
-    size_t           width_idx   = get_dimension_idx(output_desc, DataLayoutDimension::WIDTH);
-    size_t           height_idx  = get_dimension_idx(output_desc, DataLayoutDimension::HEIGHT);
+    size_t           width_idx   = get_dimension_idx(data_layout, DataLayoutDimension::WIDTH);
+    size_t           height_idx  = get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT);
     output_desc.shape.set(width_idx, static_cast<int>(output_desc.shape[width_idx] * _scale_width));
     output_desc.shape.set(height_idx, static_cast<int>(output_desc.shape[height_idx] * _scale_height));
 
diff --git a/src/graph/nodes/StackLayerNode.cpp b/src/graph/nodes/StackLayerNode.cpp
new file mode 100644
index 0000000..d26498a
--- /dev/null
+++ b/src/graph/nodes/StackLayerNode.cpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/StackLayerNode.h"
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+#include "arm_compute/graph/Utils.h"
+
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+StackLayerNode::StackLayerNode(unsigned int total_nodes, int axis)
+    : _total_nodes(total_nodes), _axis(axis)
+{
+    _input_edges.resize(_total_nodes, EmptyEdgeID);
+    _outputs.resize(1, NullTensorID);
+}
+
+int StackLayerNode::axis() const
+{
+    return _axis;
+}
+
+TensorDescriptor StackLayerNode::compute_output_descriptor(const std::vector<TensorDescriptor> &input_descriptors,
+                                                           int                                  axis)
+{
+    ARM_COMPUTE_ERROR_ON(input_descriptors.size() == 0);
+
+    TensorDescriptor output_descriptor = input_descriptors[0];
+
+    const TensorInfo   input_info(input_descriptors[0].shape, 1, input_descriptors[0].data_type);
+    const unsigned int num_tensors = input_descriptors.size();
+
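+    // Stacking adds a new dimension of size num_tensors at the given axis.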
+    output_descriptor.shape = arm_compute::misc::shape_calculator::compute_stack_shape(input_info, axis, num_tensors);
+
+    return output_descriptor;
+}
+
+bool StackLayerNode::forward_descriptors()
+{
+    if(_outputs[0] != NullTensorID)
+    {
+        Tensor *dst = output(0);
+        ARM_COMPUTE_ERROR_ON(dst == nullptr);
+        dst->desc() = configure_output(0);
+        return true;
+    }
+    return false;
+}
+
+TensorDescriptor StackLayerNode::configure_output(size_t idx) const
+{
+    ARM_COMPUTE_UNUSED(idx);
+    ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+    // Check if all input tensors are set
+    bool are_all_inputs_set = std::all_of(std::begin(_input_edges), std::end(_input_edges), [](const EdgeID & eid)
+    {
+        return eid != EmptyEdgeID;
+    });
+
+    TensorDescriptor output_info = {};
+
+    if(are_all_inputs_set)
+    {
+        std::vector<TensorDescriptor> inputs_descriptors;
+        for(unsigned int i = 0; i < _input_edges.size(); ++i)
+        {
+            const Tensor *t = _graph->tensor(input_id(i));
+            ARM_COMPUTE_ERROR_ON(t == nullptr);
+            inputs_descriptors.push_back(t->desc());
+        }
+        output_info = compute_output_descriptor(inputs_descriptors, _axis);
+    }
+
+    return output_info;
+}
+
+NodeType StackLayerNode::type() const
+{
+    return NodeType::StackLayer;
+}
+
+void StackLayerNode::accept(INodeVisitor &v)
+{
+    v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/nodes/UpsampleLayerNode.cpp b/src/graph/nodes/UpsampleLayerNode.cpp
index bdd39e8..88af122 100644
--- a/src/graph/nodes/UpsampleLayerNode.cpp
+++ b/src/graph/nodes/UpsampleLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -54,9 +54,10 @@
     const unsigned int input_width  = get_dimension_size(input_descriptor, DataLayoutDimension::WIDTH);
     const unsigned int input_height = get_dimension_size(input_descriptor, DataLayoutDimension::HEIGHT);
 
+    const DataLayout data_layout       = input_descriptor.layout;
     TensorDescriptor output_descriptor = input_descriptor;
-    output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::WIDTH), input_width * info.x());
-    output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::HEIGHT), input_height * info.y());
+    output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), input_width * info.x());
+    output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), input_height * info.y());
 
     return output_descriptor;
 }
diff --git a/src/graph/printers/DotGraphPrinter.cpp b/src/graph/printers/DotGraphPrinter.cpp
index ef156ea..c939de1 100644
--- a/src/graph/printers/DotGraphPrinter.cpp
+++ b/src/graph/printers/DotGraphPrinter.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -77,6 +77,14 @@
     _info = ss.str();
 }
 
+void DotGraphVisitor::visit(FusedConvolutionBatchNormalizationNode &n)
+{
+    ARM_COMPUTE_UNUSED(n);
+    std::stringstream ss;
+    ss << "FusedConvolutionBatchNormalizationNode";
+    _info = ss.str();
+}
+
 void DotGraphVisitor::visit(NormalizationLayerNode &n)
 {
     std::stringstream ss;
diff --git a/src/runtime/BlobLifetimeManager.cpp b/src/runtime/BlobLifetimeManager.cpp
index c5d42b1..1323bb3 100644
--- a/src/runtime/BlobLifetimeManager.cpp
+++ b/src/runtime/BlobLifetimeManager.cpp
@@ -66,7 +66,7 @@
     std::vector<BlobInfo> group_sizes;
     std::transform(std::begin(_free_blobs), std::end(_free_blobs), std::back_inserter(group_sizes), [](const Blob & b)
     {
-        return BlobInfo(b.max_size, b.max_alignment);
+        return BlobInfo{ b.max_size, b.max_alignment };
     });
 
     // Update blob sizes
@@ -75,7 +75,7 @@
     group_sizes.resize(max_size);
     std::transform(std::begin(_blobs), std::end(_blobs), std::begin(group_sizes), std::begin(_blobs), [](BlobInfo lhs, BlobInfo rhs)
     {
-        return BlobInfo(std::max(lhs.size, rhs.size), std::max(lhs.alignment, rhs.alignment));
+        return BlobInfo{ std::max(lhs.size, rhs.size), std::max(lhs.alignment, rhs.alignment) };
     });
 
     // Calculate group mappings
diff --git a/src/runtime/CL/CLHelpers.cpp b/src/runtime/CL/CLHelpers.cpp
index 533e6fa..8bc7b8e 100644
--- a/src/runtime/CL/CLHelpers.cpp
+++ b/src/runtime/CL/CLHelpers.cpp
@@ -47,7 +47,7 @@
  * @return A pointer to the context properties which can be used to create an opencl context
  */
 
-void initialise_context_properties(const cl::Platform &platform, const cl::Device &device, cl_context_properties prop[7])
+void initialise_context_properties(const cl::Platform &platform, const cl::Device &device, std::array<cl_context_properties, 7> &prop)
 {
     ARM_COMPUTE_UNUSED(device);
 #if defined(ARM_COMPUTE_ASSERTS_ENABLED)
@@ -55,7 +55,7 @@
     if(arm_compute::device_supports_extension(device, "cl_arm_printf"))
     {
         // Create a cl_context with a printf_callback and user specified buffer size.
-        cl_context_properties properties_printf[] =
+        std::array<cl_context_properties, 7> properties_printf =
         {
             CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platform()),
             // Enable a printf callback function for this context.
@@ -65,17 +65,17 @@
             CL_PRINTF_BUFFERSIZE_ARM, 0x1000,
             0
         };
-        std::copy_n(properties_printf, 7, prop);
+        prop = properties_printf;
     }
     else
 #endif // defined(ARM_COMPUTE_ASSERTS_ENABLED)
     {
-        cl_context_properties properties[] =
+        std::array<cl_context_properties, 3> properties =
         {
             CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platform()),
             0
         };
-        std::copy_n(properties, 3, prop);
+        std::copy(properties.begin(), properties.end(), prop.begin());
     };
 }
 } //namespace
@@ -94,11 +94,11 @@
     std::vector<cl::Device> platform_devices;
     p.getDevices(CL_DEVICE_TYPE_DEFAULT, &platform_devices);
     ARM_COMPUTE_ERROR_ON_MSG(platform_devices.size() == 0, "Couldn't find any OpenCL device");
-    device                              = platform_devices[0];
-    cl_int                err           = CL_SUCCESS;
-    cl_context_properties properties[7] = { 0, 0, 0, 0, 0, 0, 0 };
+    device     = platform_devices[0];
+    cl_int err = CL_SUCCESS;
+    std::array<cl_context_properties, 7> properties = { 0, 0, 0, 0, 0, 0, 0 };
     initialise_context_properties(p, device, properties);
-    cl::Context cl_context = cl::Context(device, properties, nullptr, nullptr, &err);
+    cl::Context cl_context = cl::Context(device, properties.data(), nullptr, nullptr, &err);
     ARM_COMPUTE_ERROR_ON_MSG(err != CL_SUCCESS, "Failed to create OpenCL context");
     return std::make_tuple(cl_context, device, err);
 }
diff --git a/src/runtime/CL/CLMemory.cpp b/src/runtime/CL/CLMemory.cpp
index 5bea85c..557378b 100644
--- a/src/runtime/CL/CLMemory.cpp
+++ b/src/runtime/CL/CLMemory.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,8 +33,8 @@
 {
 }
 
-CLMemory::CLMemory(std::shared_ptr<ICLMemoryRegion> memory)
-    : _region(nullptr), _region_owned(std::move(memory))
+CLMemory::CLMemory(const std::shared_ptr<ICLMemoryRegion> &memory)
+    : _region(nullptr), _region_owned(memory)
 {
     _region_owned = memory;
     _region       = _region_owned.get();
diff --git a/src/runtime/CL/CLMultiHOG.cpp b/src/runtime/CL/CLMultiHOG.cpp
index 88d45ac..2577ec0 100644
--- a/src/runtime/CL/CLMultiHOG.cpp
+++ b/src/runtime/CL/CLMultiHOG.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,8 +30,9 @@
 using namespace arm_compute;
 
 CLMultiHOG::CLMultiHOG(size_t num_models)
-    : _num_models(num_models), _model(arm_compute::support::cpp14::make_unique<CLHOG[]>(_num_models))
+    : _num_models(num_models), _model()
 {
+    _model.resize(_num_models);
 }
 
 size_t CLMultiHOG::num_models() const
@@ -42,11 +43,11 @@
 ICLHOG *CLMultiHOG::cl_model(size_t index)
 {
     ARM_COMPUTE_ERROR_ON(index >= _num_models);
-    return (_model.get() + index);
+    return (&_model[index]);
 }
 
 const ICLHOG *CLMultiHOG::cl_model(size_t index) const
 {
     ARM_COMPUTE_ERROR_ON(index >= _num_models);
-    return (_model.get() + index);
+    return (&_model[index]);
 }
\ No newline at end of file
diff --git a/src/runtime/CL/CLPyramid.cpp b/src/runtime/CL/CLPyramid.cpp
index 865f389..6d5dba0 100644
--- a/src/runtime/CL/CLPyramid.cpp
+++ b/src/runtime/CL/CLPyramid.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,7 +35,7 @@
 using namespace arm_compute;
 
 CLPyramid::CLPyramid()
-    : _info(), _pyramid(nullptr)
+    : _info(), _pyramid()
 {
 }
 
@@ -51,8 +51,8 @@
 
 void CLPyramid::internal_init(const PyramidInfo &info, bool auto_padding)
 {
-    _info    = info;
-    _pyramid = arm_compute::support::cpp14::make_unique<CLTensor[]>(_info.num_levels());
+    _info = info;
+    _pyramid.resize(_info.num_levels());
 
     size_t      w            = _info.width();
     size_t      h            = _info.height();
@@ -109,11 +109,9 @@
 
 void CLPyramid::allocate()
 {
-    ARM_COMPUTE_ERROR_ON(_pyramid == nullptr);
-
     for(size_t i = 0; i < _info.num_levels(); ++i)
     {
-        (_pyramid.get() + i)->allocator()->allocate();
+        _pyramid[i].allocator()->allocate();
     }
 }
 
@@ -126,5 +124,5 @@
 {
     ARM_COMPUTE_ERROR_ON(index >= _info.num_levels());
 
-    return (_pyramid.get() + index);
+    return &_pyramid[index];
 }
diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp
index 0307498..101e4f1 100644
--- a/src/runtime/CL/CLTensorAllocator.cpp
+++ b/src/runtime/CL/CLTensorAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,7 +34,7 @@
 
 namespace
 {
-std::unique_ptr<ICLMemoryRegion> allocate_region(cl::Context context, size_t size, cl_uint alignment)
+std::unique_ptr<ICLMemoryRegion> allocate_region(const cl::Context &context, size_t size, cl_uint alignment)
 {
     // Try fine-grain SVM
     std::unique_ptr<ICLMemoryRegion> region = support::cpp14::make_unique<CLFineSVMMemoryRegion>(context,
@@ -101,10 +101,10 @@
     info().set_is_resizable(true);
 }
 
-arm_compute::Status CLTensorAllocator::import_memory(cl::Buffer buffer)
+Status CLTensorAllocator::import_memory(cl::Buffer buffer)
 {
     ARM_COMPUTE_RETURN_ERROR_ON(buffer.get() == nullptr);
-    ARM_COMPUTE_RETURN_ERROR_ON(buffer.getInfo<CL_MEM_SIZE>() == 0);
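+    // The imported buffer must be at least as large as the tensor it is meant to back.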
+    ARM_COMPUTE_RETURN_ERROR_ON(buffer.getInfo<CL_MEM_SIZE>() < info().total_size());
     ARM_COMPUTE_RETURN_ERROR_ON(buffer.getInfo<CL_MEM_CONTEXT>().get() != CLScheduler::get().context().get());
     ARM_COMPUTE_RETURN_ERROR_ON(_associated_memory_group != nullptr);
 
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
index a262d6b..2c3f9ce 100644
--- a/src/runtime/CL/CLTuner.cpp
+++ b/src/runtime/CL/CLTuner.cpp
@@ -22,6 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/CL/CLTuner.h"
+#include "arm_compute/runtime/CL/tuners/CLLWSList.h"
 
 #include "arm_compute/core/CL/ICLKernel.h"
 #include "arm_compute/core/Error.h"
@@ -31,42 +32,13 @@
 #include <fstream>
 #include <iostream>
 #include <limits>
+#include <memory>
 #include <string>
 
 namespace arm_compute
 {
-namespace
-{
-/** Utility function used to initialize the LWS values to test.
- *  Only the LWS values which are power of 2 or satisfy the modulo conditions with GWS are taken into account by the CLTuner
- *
- * @param[in, out] lws         Vector of LWS to test for a specific dimension
- * @param[in]      gws         Size of the GWS
- * @param[in]      lws_max     Max LKWS value allowed to be tested
- * @param[in]      mod_let_one True if the results of the modulo operation between gws and the lws can be less than one.
- */
-void initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one)
-{
-    lws.push_back(1);
-
-    for(unsigned int i = 2; i <= lws_max; ++i)
-    {
-        // Power of two condition
-        const bool is_power_of_two = (i & (i - 1)) == 0;
-
-        // Condition for the module accordingly with the mod_let_one flag
-        const bool mod_cond = mod_let_one ? (gws % i) <= 1 : (gws % i) == 0;
-
-        if(mod_cond || is_power_of_two)
-        {
-            lws.push_back(i);
-        }
-    }
-}
-} // namespace
-
 CLTuner::CLTuner(bool tune_new_kernels)
-    : real_clEnqueueNDRangeKernel(nullptr), _lws_table(), _kernel_event(), _tune_new_kernels(tune_new_kernels)
+    : real_clEnqueueNDRangeKernel(nullptr), _lws_table(), _kernel_event(), _tune_new_kernels(tune_new_kernels), _tuner_mode(CLTunerMode::NORMAL)
 {
 }
 
@@ -88,6 +60,15 @@
     return _tune_new_kernels;
 }
 
+void CLTuner::set_tuner_mode(CLTunerMode mode)
+{
+    _tuner_mode = mode;
+}
+CLTunerMode CLTuner::get_tuner_mode() const
+{
+    return _tuner_mode;
+}
+
 void CLTuner::tune_kernel_static(ICLKernel &kernel)
 {
     ARM_COMPUTE_UNUSED(kernel);
@@ -182,61 +163,53 @@
     };
     CLSymbols::get().clEnqueueNDRangeKernel_ptr = interceptor;
 
-    cl_ulong min_exec_time = std::numeric_limits<cl_ulong>::max();
+    cl::NDRange gws = ICLKernel::gws_from_window(kernel.window());
 
-    cl::NDRange gws     = ICLKernel::gws_from_window(kernel.window());
+    // Run the kernel with the default LWS to obtain a baseline execution time
+    kernel.run(kernel.window(), queue_profiler);
+
+    queue_profiler.finish();
+
+    const cl_ulong start         = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
+    const cl_ulong end           = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
+    cl_ulong       min_exec_time = end - start;
+    _kernel_event                = nullptr;
+
     cl::NDRange opt_lws = cl::NullRange;
 
-    const unsigned int lws_x_max = std::min(static_cast<unsigned int>(gws[0]), 64u);
-    const unsigned int lws_y_max = std::min(static_cast<unsigned int>(gws[1]), 32u);
-    const unsigned int lws_z_max = std::min(static_cast<unsigned int>(gws[2]), 32u);
-
-    std::vector<unsigned int> lws_x;
-    std::vector<unsigned int> lws_y;
-    std::vector<unsigned int> lws_z;
-
-    // Initialize the LWS values to test
-    initialize_lws_values(lws_x, gws[0], lws_x_max, gws[2] > 16);
-    initialize_lws_values(lws_y, gws[1], lws_y_max, gws[2] > 16);
-    initialize_lws_values(lws_z, gws[2], lws_z_max, false);
-
-    for(const auto &z : lws_z)
+    // Construct the list of LWS values to be tested based on the tuner mode.
+    auto lws_list = cl_tuner::CLLWSListFactory::get_lws_list(_tuner_mode, gws);
+    for(size_t i = 0; i < lws_list->size(); ++i)
     {
-        for(const auto &y : lws_y)
+        cl::NDRange lws_test    = (*lws_list)[i];
+        auto        x           = lws_test[0];
+        auto        y           = lws_test[1];
+        auto        z           = lws_test[2];
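+        // Skip LWS values that exceed the kernel's maximum workgroup size, as well as the trivial (1, 1, 1) configuration.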
+        const bool  invalid_lws = (x * y * z > kernel.get_max_workgroup_size()) || (x == 1 && y == 1 && z == 1);
+
+        if(invalid_lws)
         {
-            for(const auto &x : lws_x)
-            {
-                cl::NDRange lws_test = cl::NDRange(x, y, z);
+            continue;
+        }
 
-                bool invalid_lws = (x * y * z > kernel.get_max_workgroup_size()) || (x == 1 && y == 1 && z == 1);
+        // Set the Local-Workgroup-Size
+        kernel.set_lws_hint(lws_test);
 
-                invalid_lws = invalid_lws || (x > gws[0]) || (y > gws[1]) || (z > gws[2]);
+        // Run the kernel
+        kernel.run(kernel.window(), queue_profiler);
 
-                if(invalid_lws)
-                {
-                    continue;
-                }
+        queue_profiler.finish();
 
-                //Set the Local-Workgroup-Size
-                kernel.set_lws_hint(lws_test);
+        const cl_ulong start = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
+        const cl_ulong end   = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
+        const cl_ulong diff  = end - start;
+        _kernel_event        = nullptr;
 
-                // Run the kernel
-                kernel.run(kernel.window(), queue_profiler);
-
-                queue_profiler.finish();
-
-                const cl_ulong start = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
-                const cl_ulong end   = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
-                const cl_ulong diff  = end - start;
-                _kernel_event        = nullptr;
-
-                // Check the execution time
-                if(diff < min_exec_time)
-                {
-                    min_exec_time = diff;
-                    opt_lws       = cl::NDRange(x, y, z);
-                }
-            }
+        // Check the execution time
+        if(diff < min_exec_time)
+        {
+            min_exec_time = diff;
+            opt_lws       = cl::NDRange(x, y, z);
         }
     }
 
@@ -301,7 +274,7 @@
     std::ofstream fs;
     fs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
     fs.open(filename, std::ios::out);
-    for(auto kernel_data : _lws_table)
+    for(auto const &kernel_data : _lws_table)
     {
         fs << kernel_data.first << ";" << kernel_data.second[0] << ";" << kernel_data.second[1] << ";" << kernel_data.second[2] << std::endl;
     }
diff --git a/src/runtime/CL/functions/CLCannyEdge.cpp b/src/runtime/CL/functions/CLCannyEdge.cpp
index 84e8709..4c7458d 100644
--- a/src/runtime/CL/functions/CLCannyEdge.cpp
+++ b/src/runtime/CL/functions/CLCannyEdge.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -177,7 +177,7 @@
 
 void CLCannyEdge::run()
 {
-    _memory_group.acquire();
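+    // Acquire the memory group for the duration of run() via RAII.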
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Run sobel
     _sobel->run();
@@ -199,6 +199,4 @@
     _l1_list_counter.clear(CLScheduler::get().queue());
     _l1_stack.clear(CLScheduler::get().queue());
     CLScheduler::get().enqueue(_edge_trace, true);
-
-    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLConcatenateLayer.cpp b/src/runtime/CL/functions/CLConcatenateLayer.cpp
index 018c674..b8224d2 100644
--- a/src/runtime/CL/functions/CLConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,6 +23,9 @@
  */
 #include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"
 
+#include "arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h"
 #include "arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h"
 
@@ -35,56 +38,168 @@
 namespace arm_compute
 {
 CLConcatenateLayer::CLConcatenateLayer()
-    : _concat_function(nullptr)
+    : _concat_kernels(),
+      _num_inputs(0),
+      _axis(Window::DimX)
 {
 }
 
-void CLConcatenateLayer::configure(const std::vector<ICLTensor *> &inputs_vector, ICLTensor *output, DataLayoutDimension axis)
+void CLConcatenateLayer::configure(const std::vector<ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis)
 {
     ARM_COMPUTE_ERROR_ON(output == nullptr);
+    _axis       = axis;
+    _num_inputs = inputs_vector.size();
 
-    switch(get_data_layout_dimension_index(output->info()->data_layout(), axis))
+    std::vector<ITensorInfo *> inputs_vector_info(inputs_vector.size());
+    std::transform(inputs_vector.begin(), inputs_vector.end(), inputs_vector_info.begin(), [](ICLTensor * t)
     {
-        case 0:
+        ARM_COMPUTE_ERROR_ON_NULLPTR(t);
+        return t->info();
+    });
+    TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, _axis);
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
+    ARM_COMPUTE_ERROR_THROW_ON(CLConcatenateLayer::validate(inputs_vector_info, output->info(), axis));
+
+    unsigned int offset = 0;
+    switch(_axis)
+    {
+        case Window::DimX:
         {
-            auto func = support::cpp14::make_unique<CLWidthConcatenateLayer>();
-            func->configure(inputs_vector, output);
-            _concat_function = std::move(func);
+            switch(_num_inputs)
+            {
+                case 2:
+                {
+                    // Configure WidthConcatenate2Tensors kernel
+                    auto kernel = support::cpp14::make_unique<CLWidthConcatenate2TensorsKernel>();
+                    kernel->configure(inputs_vector.at(0), inputs_vector.at(1), output);
+                    _concat_kernels.emplace_back(std::move(kernel));
+                    break;
+                }
+                case 4:
+                {
+                    // Configure WidthConcatenate4Tensors kernel
+                    auto kernel = support::cpp14::make_unique<CLWidthConcatenate4TensorsKernel>();
+                    kernel->configure(inputs_vector.at(0), inputs_vector.at(1), inputs_vector.at(2), inputs_vector.at(3), output);
+                    _concat_kernels.emplace_back(std::move(kernel));
+                    break;
+                }
+                default:
+                {
+                    // Configure generic WidthConcatenate kernels, one per input
+                    for(unsigned int i = 0; i < _num_inputs; ++i)
+                    {
+                        auto kernel = support::cpp14::make_unique<CLWidthConcatenateLayerKernel>();
+                        kernel->configure(inputs_vector.at(i), offset, output);
+                        offset += inputs_vector.at(i)->info()->dimension(_axis);
+                        _concat_kernels.emplace_back(std::move(kernel));
+                    }
+                    break;
+                }
+            }
             break;
         }
-        case 2:
+        case Window::DimY:
         {
-            auto func = support::cpp14::make_unique<CLDepthConcatenateLayer>();
-            func->configure(inputs_vector, output);
-            _concat_function = std::move(func);
+            for(unsigned int i = 0; i < _num_inputs; ++i)
+            {
+                auto kernel = support::cpp14::make_unique<CLHeightConcatenateLayerKernel>();
+                kernel->configure(inputs_vector.at(i), offset, output);
+                offset += inputs_vector.at(i)->info()->dimension(_axis);
+                _concat_kernels.emplace_back(std::move(kernel));
+            }
+            break;
+        }
+        case Window::DimZ:
+        {
+            for(unsigned int i = 0; i < _num_inputs; ++i)
+            {
+                auto kernel = support::cpp14::make_unique<CLDepthConcatenateLayerKernel>();
+                kernel->configure(inputs_vector.at(i), offset, output);
+                offset += inputs_vector.at(i)->info()->dimension(_axis);
+                _concat_kernels.emplace_back(std::move(kernel));
+            }
             break;
         }
         default:
-            ARM_COMPUTE_ERROR("Concatenation is supported across width and depth only!");
+            ARM_COMPUTE_ERROR("Axis not supported");
     }
 }
 
-Status CLConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, DataLayoutDimension axis)
+Status CLConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
 {
     ARM_COMPUTE_RETURN_ERROR_ON(output == nullptr);
+    const unsigned int num_inputs = inputs_vector.size();
 
-    switch(get_data_layout_dimension_index(output->data_layout(), axis))
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2);
+
+    unsigned int offset = 0;
+    switch(axis)
     {
-        case 0:
-            ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayer::validate(inputs_vector, output));
+        case Window::DimX:
+        {
+            switch(num_inputs)
+            {
+                case 2:
+                    // Validate the WidthConcatenate2Tensors kernel if there are 2 inputs
+                    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(inputs_vector[0], inputs_vector[1]);
+                    ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(inputs_vector[0], inputs_vector[1], output));
+                    break;
+                case 4:
+                    // Validate the WidthConcatenate4Tensors kernel if there are 4 inputs
+                    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(inputs_vector[0], inputs_vector[1], inputs_vector[2], inputs_vector[3]);
+                    ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate4TensorsKernel::validate(inputs_vector[0], inputs_vector[1], inputs_vector[2], inputs_vector[3], output));
+                    break;
+                default:
+                    // Validate the generic WidthConcatenate kernel for any other number of inputs
+                    for(const auto &input : inputs_vector)
+                    {
+                        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+                        ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayerKernel::validate(input, offset, output));
+                        offset += input->dimension(axis);
+                    }
+                    break;
+            }
             break;
-        case 2:
-            ARM_COMPUTE_RETURN_ON_ERROR(CLDepthConcatenateLayer::validate(inputs_vector, output));
+        }
+        case Window::DimY:
+        {
+            for(const auto &input : inputs_vector)
+            {
+                ARM_COMPUTE_RETURN_ON_ERROR(CLHeightConcatenateLayerKernel::validate(input, offset, output));
+                offset += input->dimension(axis);
+            }
             break;
+        }
+        case Window::DimZ:
+        {
+            for(const auto &input : inputs_vector)
+            {
+                ARM_COMPUTE_RETURN_ON_ERROR(CLDepthConcatenateLayerKernel::validate(input, offset, output));
+                offset += input->dimension(axis);
+            }
+            break;
+        }
         default:
-            ARM_COMPUTE_RETURN_ERROR_MSG("Concatenation is supported across width and depth only!");
+            ARM_COMPUTE_ERROR("Axis not supported");
     }
+
+    if(output->total_size() != 0)
+    {
+        TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, axis);
+        ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
+    }
+
     return Status{};
 }
 
 void CLConcatenateLayer::run()
 {
-    ARM_COMPUTE_ERROR_ON(_concat_function == nullptr);
-    _concat_function->run();
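+    // Enqueue every configured concatenation kernel in order.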
+    for(auto &kernel : _concat_kernels)
+    {
+        CLScheduler::get().enqueue(*kernel, true);
+    }
 }
 } // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLConvolution.cpp b/src/runtime/CL/functions/CLConvolution.cpp
index 0131801..f09585e 100644
--- a/src/runtime/CL/functions/CLConvolution.cpp
+++ b/src/runtime/CL/functions/CLConvolution.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,13 +58,13 @@
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON(conv == nullptr);
-    int16_t conv_col[matrix_size];
-    int16_t conv_row[matrix_size];
-    _is_separable = separate_matrix(conv, conv_col, conv_row, matrix_size);
+    std::array<int16_t, matrix_size> conv_col{ 0 };
+    std::array<int16_t, matrix_size> conv_row{ 0 };
+    _is_separable = separate_matrix(conv, conv_col.data(), conv_row.data(), matrix_size);
 
     if(_is_separable)
     {
-        std::pair<DataType, DataType> type_pair = data_type_for_convolution(conv_col, conv_row, matrix_size);
+        std::pair<DataType, DataType> type_pair = data_type_for_convolution(conv_col.data(), conv_row.data(), matrix_size);
         _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, type_pair.first));
 
         // Manage intermediate buffers
@@ -75,8 +75,8 @@
             scale = calculate_matrix_scale(conv, matrix_size);
         }
 
-        _kernel_hor.configure(input, &_tmp, conv_row, border_mode == BorderMode::UNDEFINED);
-        _kernel_vert.configure(&_tmp, output, conv_col, scale, border_mode == BorderMode::UNDEFINED, type_pair.second);
+        _kernel_hor.configure(input, &_tmp, conv_row.data(), border_mode == BorderMode::UNDEFINED);
+        _kernel_vert.configure(&_tmp, output, conv_col.data(), scale, border_mode == BorderMode::UNDEFINED, type_pair.second);
         _border_handler.configure(input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value));
 
         // Allocate intermediate buffer
@@ -96,12 +96,10 @@
 
     if(_is_separable)
     {
-        _memory_group.acquire();
+        MemoryGroupResourceScope scope_mg(_memory_group);
 
         CLScheduler::get().enqueue(_kernel_hor, false);
         CLScheduler::get().enqueue(_kernel_vert);
-
-        _memory_group.release();
     }
     else
     {
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index 0014e71..165d523 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -75,6 +75,13 @@
             _function = std::move(f);
             break;
         }
+        case ConvolutionMethod::FFT:
+        {
+            auto f = arm_compute::support::cpp14::make_unique<CLFFTConvolutionLayer>(_memory_manager);
+            f->configure(input, weights, biases, output, conv_info, act_info);
+            _function = std::move(f);
+            break;
+        }
         default:
             ARM_COMPUTE_ERROR("Not supported.");
             break;
@@ -111,6 +118,12 @@
             ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups));
             break;
         }
+        case ConvolutionMethod::FFT:
+        {
+            // Validate FFT-based convolution layer
+            ARM_COMPUTE_RETURN_ON_ERROR(CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info));
+            break;
+        }
         default:
             ARM_COMPUTE_ERROR("Not supported.");
             break;
@@ -169,12 +182,20 @@
         return (*found).second;
     }
 
-    if(dilation != Size2D(1U, 1U) || (input->dimension(idx_c) < 16))
+    if(dilation != Size2D(1U, 1U))
     {
         return ConvolutionMethod::GEMM;
     }
     else
     {
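+        // Prefer an FFT-based convolution for large kernels when the channel count shrinks and FFT validation succeeds.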
+        if((weights->dimension(idx_h) > 7) && (input->dimension(idx_c) > output->dimension(idx_c)) && (CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info)))
+        {
+            return ConvolutionMethod::FFT;
+        }
+        if(input->dimension(idx_c) < 16)
+        {
+            return ConvolutionMethod::GEMM;
+        }
         return bool(CLWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM;
     }
 }
diff --git a/src/runtime/CL/functions/CLCropResize.cpp b/src/runtime/CL/functions/CLCropResize.cpp
new file mode 100644
index 0000000..b22809e
--- /dev/null
+++ b/src/runtime/CL/functions/CLCropResize.cpp
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CL/CLHelpers.h"
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/functions/CLCropResize.h"
+
+#include <cstddef>
+
+namespace arm_compute
+{
+namespace
+{
+inline void configure_crop(const ICLTensor *input, ICLTensor *crop_boxes, ICLTensor *box_ind, ICLTensor *output, uint32_t crop_box_ind, Coordinates &start, Coordinates &end, uint32_t &batch_index)
+{
+    batch_index = *(reinterpret_cast<int32_t *>(box_ind->ptr_to_element(Coordinates(crop_box_ind))));
+
+    // crop_box_ind is used to index crop_boxes and retrieve the appropriate crop box.
+    // The crop box is specified by normalized coordinates [y0, x0, y1, x1].
+    const float x0 = *reinterpret_cast<const float *>(crop_boxes->ptr_to_element(Coordinates(1, crop_box_ind)));
+    const float y0 = *reinterpret_cast<const float *>(crop_boxes->ptr_to_element(Coordinates(0, crop_box_ind)));
+    const float x1 = *reinterpret_cast<const float *>(crop_boxes->ptr_to_element(Coordinates(3, crop_box_ind)));
+    const float y1 = *reinterpret_cast<const float *>(crop_boxes->ptr_to_element(Coordinates(2, crop_box_ind)));
+    // The normalized coordinates are scaled to the floating-point image coordinates, which are then rounded to integers.
+    start = Coordinates(std::floor(x0 * (input->info()->tensor_shape()[1] - 1) + 0.5f),
+                        std::floor(y0 * (input->info()->tensor_shape()[2] - 1) + 0.5f));
+    end = Coordinates(std::floor(x1 * (input->info()->tensor_shape()[1] - 1) + 0.5f),
+                      std::floor(y1 * (input->info()->tensor_shape()[2] - 1) + 0.5f));
+    const TensorShape out_shape(input->info()->tensor_shape()[0], abs(end[0] - start[0]) + 1, abs(end[1] - start[1]) + 1);
+    output->info()->set_tensor_shape(out_shape);
+}
+
+inline void run_crop(const ICLTensor *input, ICLTensor *output, uint32_t batch_index, Coordinates start, Coordinates end, float extrapolation_value)
+{
+    bool is_width_flipped  = end[0] < start[0];
+    bool is_height_flipped = end[1] < start[1];
+    /** The number of rows out of bounds at the start and end of output. */
+    std::array<int32_t, 2> rows_out_of_bounds{ 0 };
+    /** The number of columns out of bounds at the start and end of output. */
+    std::array<int32_t, 2> cols_out_of_bounds{ 0 };
+    if(is_height_flipped)
+    {
+        rows_out_of_bounds[0] = start[1] >= static_cast<int32_t>(input->info()->dimension(2)) ? std::min(start[1] - input->info()->dimension(2) + 1, output->info()->dimension(2)) : 0;
+        rows_out_of_bounds[1] = end[1] < 0 ? std::min(-end[1], static_cast<int32_t>(output->info()->dimension(2))) : 0;
+    }
+    else
+    {
+        rows_out_of_bounds[0] = start[1] < 0 ? std::min(-start[1], static_cast<int32_t>(output->info()->dimension(2))) : 0;
+        rows_out_of_bounds[1] = end[1] >= static_cast<int32_t>(input->info()->dimension(2)) ? std::min(end[1] - input->info()->dimension(2) + 1, output->info()->dimension(2)) : 0;
+    }
+    if(is_width_flipped)
+    {
+        cols_out_of_bounds[0] = start[0] >= static_cast<int32_t>(input->info()->dimension(1)) ? std::min(start[0] - input->info()->dimension(1) + 1, output->info()->dimension(1)) : 0;
+        cols_out_of_bounds[1] = end[0] < 0 ? std::min(-end[0], static_cast<int32_t>(output->info()->dimension(1))) : 0;
+    }
+    else
+    {
+        cols_out_of_bounds[0] = start[0] < 0 ? std::min(-start[0], static_cast<int32_t>(output->info()->dimension(1))) : 0;
+        cols_out_of_bounds[1] = end[0] >= static_cast<int32_t>(input->info()->dimension(1)) ? std::min(end[0] - input->info()->dimension(1) + 1, output->info()->dimension(1)) : 0;
+    }
+
+    Window full_window = calculate_max_window(*output->info());
+
+    //  Full output window:
+    //  --------------------------------
+    //  |          Out of bounds       |
+    //  |          rows before         |
+    //  |------------------------------|
+    //  | Out of | In         | Out of |
+    //  | bounds | bounds     | bounds |
+    //  | cols   | elements   | cols   |
+    //  | before | copied     | after  |
+    //  |        | from input |        |
+    //  |------------------------------|
+    //  |        Out of bounds         |
+    //  |        rows after            |
+    //  |------------------------------|
+    // Use a separate output window for each section of the full output window.
+    // Fill all output rows that have no elements that are within the input bounds
+    // with the extrapolation value using memset.
+    // First for the rows before the in bounds rows.
+    if(rows_out_of_bounds[0] > 0)
+    {
+        Window slice_fill_rows_before(full_window);
+        slice_fill_rows_before.set(2, Window::Dimension(0, rows_out_of_bounds[0], 1));
+        auto kernel = arm_compute::support::cpp14::make_unique<CLMemsetKernel>();
+        kernel->configure(output, extrapolation_value, &slice_fill_rows_before);
+        CLScheduler::get().enqueue(*kernel);
+    }
+
+    Window slice_in(full_window);
+    slice_in.set(2, Window::Dimension(rows_out_of_bounds[0], output->info()->dimension(2) - rows_out_of_bounds[1], 1));
+    slice_in.set(1, Window::Dimension(cols_out_of_bounds[0], output->info()->dimension(1) - cols_out_of_bounds[1], 1));
+
+    int rows_in_bounds = static_cast<int32_t>(output->info()->dimension(2)) - rows_out_of_bounds[0] - rows_out_of_bounds[1];
+    if(rows_in_bounds > 0)
+    {
+        // Fill all elements that share a row with an in bounds element with the extrapolation value.
+        if(cols_out_of_bounds[0] > 0)
+        {
+            Window slice_fill_cols_before(slice_in);
+            slice_fill_cols_before.set(1, Window::Dimension(0, cols_out_of_bounds[0], 1));
+            auto kernel = arm_compute::support::cpp14::make_unique<CLMemsetKernel>();
+            kernel->configure(output, extrapolation_value, &slice_fill_cols_before);
+            CLScheduler::get().enqueue(*kernel);
+        }
+
+        if(cols_out_of_bounds[1] > 0)
+        {
+            Window slice_fill_cols_after(slice_in);
+            slice_fill_cols_after.set(1, Window::Dimension(output->info()->dimension(1) - cols_out_of_bounds[1], output->info()->dimension(1), 1));
+            auto kernel = arm_compute::support::cpp14::make_unique<CLMemsetKernel>();
+            kernel->configure(output, extrapolation_value, &slice_fill_cols_after);
+            CLScheduler::get().enqueue(*kernel);
+        }
+
+        // Copy all elements within the input bounds from the input tensor.
+        int cols_in_bounds = static_cast<int32_t>(output->info()->dimension(1)) - cols_out_of_bounds[0] - cols_out_of_bounds[1];
+        if(cols_in_bounds > 0)
+        {
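+            // Input-space coordinates of the in-bounds region; the traversal
+            // direction depends on whether the crop box is flipped.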
+            Coordinates2D start_in{ is_width_flipped ? start[0] - cols_out_of_bounds[0] : start[0] + cols_out_of_bounds[0],
+                                    is_height_flipped ? start[1] - rows_out_of_bounds[0] : start[1] + rows_out_of_bounds[0] };
+            Coordinates2D end_in{ is_width_flipped ? start_in.x - cols_in_bounds + 1 : start_in.x + cols_in_bounds - 1,
+                                  is_height_flipped ? start_in.y - rows_in_bounds + 1 : start_in.y + rows_in_bounds - 1 };
+            auto kernel = arm_compute::support::cpp14::make_unique<CLCropKernel>();
+
+            kernel->configure(input, output, start_in, end_in, batch_index, extrapolation_value, &slice_in);
+            CLScheduler::get().enqueue(*kernel);
+        }
+    }
+
+    // Fill all rows after the in-bounds elements with the extrapolation value.
+    if(rows_out_of_bounds[1] > 0)
+    {
+        Window slice_fill_rows_after(full_window);
+        slice_fill_rows_after.set(2, Window::Dimension(output->info()->dimension(2) - rows_out_of_bounds[1], output->info()->dimension(2), 1));
+        auto kernel = arm_compute::support::cpp14::make_unique<CLMemsetKernel>();
+        kernel->configure(output, extrapolation_value, &slice_fill_rows_after);
+        CLScheduler::get().enqueue(*kernel);
+    }
+}
+} // namespace
+
+CLCropResize::CLCropResize()
+    : _input(nullptr), _boxes(nullptr), _box_ind(nullptr), _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _scale(), _copy(), _crop_results(), _scaled_results()
+{
+}
+
+Status CLCropResize::validate(const ITensorInfo *input, ITensorInfo *boxes, ITensorInfo *box_ind, const ITensorInfo *output,
+                              Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(crop_size.x <= 0 || crop_size.y <= 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(method == InterpolationPolicy::AREA);
+    ARM_COMPUTE_RETURN_ERROR_ON(boxes->tensor_shape()[0] != 4);
+    ARM_COMPUTE_RETURN_ERROR_ON(boxes->tensor_shape()[1] != box_ind->tensor_shape()[0]);
+    TensorInfo temp_info;
+    ARM_COMPUTE_RETURN_ON_ERROR(CLCropKernel::validate(input->clone().get(), &temp_info, { 0, 0 }, { 1, 1 }, input->dimension(3) - 1, extrapolation_value));
+    if(output->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+        TensorShape out_shape(input->tensor_shape()[0], crop_size.x, crop_size.y, boxes->tensor_shape()[1]);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), out_shape);
+    }
+    return Status{};
+}
+
+void CLCropResize::configure(const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size,
+                             InterpolationPolicy method, float extrapolation_value)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(CLCropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), crop_size, method, extrapolation_value));
+
+    _num_boxes = boxes->info()->tensor_shape()[1];
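+    // Per-box shape of the scaled 3D image in NHWC: (channels, crop width, crop height).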
+    TensorShape out_shape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y);
+
+    _input               = input;
+    _boxes               = boxes;
+    _box_ind             = box_ind;
+    _output              = output;
+    _method              = method;
+    _extrapolation_value = extrapolation_value;
+
+    // For each crop box:
+    // - The initial cropped image is produced, as specified by boxes[i], from the 3D image
+    //   input[box_ind[i]], possibly using a CLCropKernel and up to four CLMemsetKernels.
+    // - A tensor is required to hold this initial cropped image.
+    // - A scale function is used to resize the cropped image to the size specified by crop_size.
+    // - A tensor is required to hold the final scaled image before it is copied into the 4D output
+    //   that will hold all final cropped and scaled 3D images using CLCopyKernel.
+    for(unsigned int i = 0; i < _num_boxes; ++i)
+    {
+        auto       crop_tensor = support::cpp14::make_unique<CLTensor>();
+        TensorInfo crop_result_info(1, DataType::F32);
+        crop_result_info.set_data_layout(DataLayout::NHWC);
+        crop_tensor->allocator()->init(crop_result_info);
+        _crop_results.emplace_back(std::move(crop_tensor));
+
+        auto       scale_tensor = support::cpp14::make_unique<CLTensor>();
+        TensorInfo scaled_result_info(out_shape, 1, DataType::F32);
+        scaled_result_info.set_data_layout(DataLayout::NHWC);
+        scale_tensor->allocator()->init(scaled_result_info);
+        _scaled_results.emplace_back(std::move(scale_tensor));
+    }
+}
+
+void CLCropResize::run()
+{
+    ARM_COMPUTE_ERROR_ON_MSG(_output == nullptr, "Unconfigured function");
+    // The contents of _boxes and _box_ind are required to calculate the shape
+    // of the initial cropped image and thus are required to configure the
+    // kernels used for cropping and scaling.
+    _boxes->map(CLScheduler::get().queue());
+    _box_ind->map(CLScheduler::get().queue());
+    for(unsigned int i = 0; i < _num_boxes; ++i)
+    {
+        // The size of the crop box in _boxes, and thus the shape of _crop_results[i],
+        // may not be known until run time, so the kernels cannot be configured earlier.
+        uint32_t    batch_index;
+        Coordinates start{};
+        Coordinates end{};
+        configure_crop(_input, _boxes, _box_ind, _crop_results[i].get(), i, start, end, batch_index);
+
+        auto scale_kernel = support::cpp14::make_unique<CLScale>();
+        scale_kernel->configure(_crop_results[i].get(), _scaled_results[i].get(), _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT);
+        _scale.emplace_back(std::move(scale_kernel));
+
+        Window win = calculate_max_window(*_output->info());
+        win.set(3, Window::Dimension(i, i + 1, 1));
+
+        auto copy_kernel = support::cpp14::make_unique<CLCopyKernel>();
+        copy_kernel->configure(_scaled_results[i].get(), _output, PaddingList(), &win);
+        _copy.emplace_back(std::move(copy_kernel));
+
+        _crop_results[i]->allocator()->allocate();
+        _scaled_results[i]->allocator()->allocate();
+
+        run_crop(_input, _crop_results[i].get(), batch_index, start, end, _extrapolation_value);
+    }
+    _boxes->unmap(CLScheduler::get().queue());
+    _box_ind->unmap(CLScheduler::get().queue());
+    CLScheduler::get().sync();
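+    // All crops have completed; run the deferred scale functions, then the copies.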
+    for(auto &kernel : _scale)
+    {
+        kernel->run();
+    }
+    CLScheduler::get().sync();
+    for(auto &kernel : _copy)
+    {
+        CLScheduler::get().enqueue(*kernel, true);
+    }
+    CLScheduler::get().sync();
+}
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index 9da02c1..c6f79d3 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
@@ -23,188 +23,117 @@
  */
 #include "arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h"
 
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CPP/CPPScheduler.h"
 
+#include <cmath>
 #include <memory>
 #include <tuple>
 
 using namespace arm_compute;
 using namespace arm_compute::misc::shape_calculator;
 
-CLDeconvolutionLayer::CLDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
-    : _memory_group(std::move(memory_manager)),
-      _scale_f(),
-      _conv_f(),
-      _flip_weights(),
-      _scaled_output(),
-      _original_weights(nullptr),
-      _weights_flipped(),
-      _is_prepared(false)
+CLDeconvolutionLayer::CLDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_manager(std::move(memory_manager)), _function()
 {
 }
 
-Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
+void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info,
+                                     unsigned int inner_border_right, unsigned int inner_border_top, const WeightsInfo &weights_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_UNUSED(inner_border_right, inner_border_top);
+
+    switch(CLDeconvolutionLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, output->info(), deconv_info, weights_info))
+    {
+        case DeconvolutionMethod::DIRECT:
+        {
+            auto f = arm_compute::support::cpp14::make_unique<CLDirectDeconvolutionLayer>();
+            f->configure(input, weights, bias, output, deconv_info, weights_info);
+            _function = std::move(f);
+            break;
+        }
+        case DeconvolutionMethod::GEMM:
+        {
+            auto f = arm_compute::support::cpp14::make_unique<CLGEMMDeconvolutionLayer>(_memory_manager);
+            f->configure(input, weights, bias, output, deconv_info);
+            _function = std::move(f);
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Not supported.");
+            break;
+    }
+}
+
+Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info,
                                       unsigned int inner_border_right, unsigned int inner_border_top, const WeightsInfo &weights_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+    ARM_COMPUTE_UNUSED(inner_border_right, inner_border_top);
+
+    switch(CLDeconvolutionLayer::get_deconvolution_method(input, weights, bias, output, deconv_info, weights_info))
+    {
+        case DeconvolutionMethod::DIRECT:
+        {
+            // Validate direct deconvolution layer
+            ARM_COMPUTE_RETURN_ON_ERROR(CLDirectDeconvolutionLayer::validate(input, weights, bias, output, deconv_info, weights_info));
+            break;
+        }
+        case DeconvolutionMethod::GEMM:
+        {
+            // Validate GEMM-based deconvolution layer
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMDeconvolutionLayer::validate(input, weights, bias, output, deconv_info));
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Not supported.");
+            break;
+    }
+
+    return Status{};
+}
+
+DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info,
+                                                                   const WeightsInfo &weights_info)
+{
+    ARM_COMPUTE_UNUSED(output, bias, weights_info);
 
     const DataLayout data_layout = input->data_layout();
 
     const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
     const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
 
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h));
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1);
-    ARM_COMPUTE_RETURN_ERROR_ON(!info.padding_is_symmetric());
-
-    const unsigned int stride_x = info.stride().first;
-    const unsigned int stride_y = info.stride().second;
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border_right > stride_x - 1, "inner_border_right must be smaller than stride_x");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border_top > stride_y - 1, "inner_border_top must be smaller than stride_y");
-
-    auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h),
-                                                    info.pad().first, info.pad().second, stride_x, stride_y);
-
-    const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights);
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
-
-    if(bias != nullptr)
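+    // The GEMM-based path is only used when the kernel size matches the stride in
+    // both dimensions; otherwise fall back to the direct (upsample + convolve) path.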
+    if(weights->dimension(idx_w) != deconv_info.stride().first || weights->dimension(idx_h) != deconv_info.stride().second)
     {
-        if(is_data_type_quantized_asymmetric(input->data_type()))
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
-        }
-        else
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
-        }
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias);
+        return DeconvolutionMethod::DIRECT;
     }
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w], "Output's width is invalid.");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h], "Output's height is invalid.");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c], "Output's depth is invalid.");
-
-    unsigned int        padx            = 0;
-    unsigned int        pady            = 0;
-    const TensorShape   scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, inner_border_right, inner_border_top, out_dims, padx, pady);
-    TensorInfo          scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape).set_data_layout(data_layout));
-    const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
-
-    ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, BorderSize(inner_border_right, inner_border_top), info));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info));
-
-    return Status{};
+    return DeconvolutionMethod::GEMM;
 }
 
-void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
-                                     unsigned int inner_border_right, unsigned int inner_border_top, const WeightsInfo &weights_info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-
-    const unsigned int stride_x = info.stride().first;
-    const unsigned int stride_y = info.stride().second;
-
-    const DataLayout data_layout = input->info()->data_layout();
-
-    const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
-    _original_weights = weights;
-    _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
-    _flip_weights.configure(weights, &_weights_flipped);
-
-    auto out_dims = deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h), weights->info()->dimension(idx_w), weights->info()->dimension(idx_h),
-                                                    info.pad().first, info.pad().second, stride_x, stride_y);
-
-    const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info());
-
-    // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
-
-    // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionLayer::validate(input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info, inner_border_right, inner_border_top));
-
-    _is_prepared = weights_info.retain_internal_weights();
-
-    _memory_group.manage(&_scaled_output);
-
-    // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order to match output shape
-    unsigned int      padx            = 0;
-    unsigned int      pady            = 0;
-    const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y, inner_border_right, inner_border_top, out_dims, padx, pady);
-
-    TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info());
-    scale_out_info.set_data_layout(data_layout);
-    _scaled_output.allocator()->init(scale_out_info);
-
-    // configure scale function
-    const PadStrideInfo upsample_info(stride_x, stride_y, padx / 2, pady / 2);
-    _scale_f.configure(input, &_scaled_output, BorderSize(inner_border_top, inner_border_right), upsample_info);
-
-    // setup the function to convolve the upscaled output
-    const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
-    _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info);
-    _scaled_output.allocator()->allocate();
-}
-
-void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
+void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info,
                                      const WeightsInfo &weights_info)
 {
-    configure(input, weights, bias, output, info, 0, 0, weights_info);
+    configure(input, weights, bias, output, deconv_info, 0, 0, weights_info);
 }
 
-Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
+Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info,
                                       const WeightsInfo &weights_info)
 {
-    return CLDeconvolutionLayer::validate(input, weights, bias, output, info, 0, 0, weights_info);
+    return CLDeconvolutionLayer::validate(input, weights, bias, output, deconv_info, 0, 0, weights_info);
 }
 
 void CLDeconvolutionLayer::run()
 {
     prepare();
-
-    _memory_group.acquire();
-
-    _scale_f.run();
-    _conv_f.run();
-
-    _memory_group.release();
+    _function->run();
 }
 
 void CLDeconvolutionLayer::prepare()
 {
-    if(!_is_prepared)
-    {
-        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-        // Run weights flipping and mark original weights tensor as unused
-        _weights_flipped.allocator()->allocate();
-        _weights_flipped.map(true);
-        _original_weights->map(CLScheduler::get().queue(), true);
-        CPPScheduler::get().schedule(&_flip_weights, Window::DimZ);
-        _weights_flipped.unmap();
-        _original_weights->unmap(CLScheduler::get().queue());
-        _original_weights->mark_as_unused();
-
-        // Prepare convolution
-        _conv_f.prepare();
-
-        if(!_weights_flipped.is_used())
-        {
-            _weights_flipped.allocator()->free();
-        }
-
-        _is_prepared = true;
-    }
+    _function->prepare();
 }
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
index ce8667d..c66dff0 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,14 +27,11 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
-#include <cmath>
-#include <memory>
-#include <tuple>
-
-using namespace arm_compute;
-
+namespace arm_compute
+{
 CLDeconvolutionLayerUpsample::CLDeconvolutionLayerUpsample() // NOLINT
     : _upsample(),
+      _memset(),
       _output(nullptr)
 {
 }
@@ -51,22 +48,13 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     _output = output;
+    _memset.configure(_output, PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info()));
     _upsample.configure(input, _output, inner_border, info);
 }
 
 void CLDeconvolutionLayerUpsample::run()
 {
-    _output->map(CLScheduler::get().queue(), true);
-    if(is_data_type_quantized_asymmetric(_output->info()->data_type()))
-    {
-        const uint8_t quantized_zero = _output->info()->quantization_info().offset;
-        std::fill_n(_output->buffer(), _output->info()->total_size(), quantized_zero);
-    }
-    else
-    {
-        memset(_output->buffer(), 0, _output->info()->total_size());
-    }
-    _output->unmap(CLScheduler::get().queue());
-
-    CLScheduler::get().enqueue(_upsample, false);
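+    // Zero the output with a memset kernel on the device instead of mapping the
+    // buffer and filling it on the host.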
+    CLScheduler::get().enqueue(_memset, false);
+    CLScheduler::get().enqueue(_upsample, true);
 }
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp b/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
index e46647a..f687e54 100644
--- a/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
@@ -36,8 +36,7 @@
 using namespace arm_compute;
 
 CLDepthConcatenateLayer::CLDepthConcatenateLayer() // NOLINT
-    : _inputs_vector(),
-      _concat_kernels_vector(),
+    : _concat_kernels_vector(),
       _border_handlers_vector(),
       _num_inputs(0)
 {
@@ -53,10 +52,10 @@
         inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
     }
 
-    _concat_kernels_vector  = arm_compute::support::cpp14::make_unique<CLDepthConcatenateLayerKernel[]>(_num_inputs);
-    _border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_inputs);
+    _concat_kernels_vector.resize(_num_inputs);
+    _border_handlers_vector.resize(_num_inputs);
 
-    TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_depth_concatenate_shape(inputs_vector_info);
+    TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector_info, Window::DimZ);
 
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
@@ -82,7 +81,7 @@
 
     // Output auto initialization if not yet initialized
     TensorInfo  tmp_output_info = *output->clone();
-    TensorShape output_shape    = arm_compute::misc::shape_calculator::calculate_depth_concatenate_shape(inputs_vector);
+    TensorShape output_shape    = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimZ);
     auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
 
     unsigned int depth_offset = 0;
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index 15cbfce..97b0a01 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -45,10 +45,18 @@
 }
 
 void CLDepthwiseConvolutionLayer3x3::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
-                                               ActivationLayerInfo act_info)
+                                               ActivationLayerInfo act_info, const Size2D &dilation)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    // idx_w and idx_h are only used by the validation checks below
+    const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+    ARM_COMPUTE_UNUSED(idx_w);
+    ARM_COMPUTE_UNUSED(idx_h);
+
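+    // The dilated kernel extent, k + (k - 1) * (d - 1), must fit within the padded input.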
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_w) + (weights->info()->dimension(idx_w) - 1) * (dilation.x() - 1) > input->info()->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_h) + (weights->info()->dimension(idx_h) - 1) * (dilation.y() - 1) > input->info()->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
 
     const bool is_nhwc = input->info()->data_layout() == DataLayout::NHWC;
 
@@ -62,11 +70,13 @@
     const ICLTensor *weights_to_use = weights;
     ICLTensor       *output_to_use  = output;
 
-    const bool                      is_stride_1       = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
-    const bool                      is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
+    const bool is_stride_1            = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+    const bool is_dot8_supported      = dot8_supported(CLKernelLibrary::get().get_device());
+    const bool is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1);
+
     DepthwiseConvolutionReshapeInfo info;
     info.c0        = 4;
-    info.transpose = is_stride_1 && is_dot8_supported;
+    info.transpose = is_stride_1_dilation_1 && is_dot8_supported;
 
     if(_needs_permute)
     {
@@ -103,7 +113,7 @@
 
     // Configure kernel
     _kernel->set_target(CLScheduler::get().target());
-    _kernel->configure(input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, act_info);
+    _kernel->configure(input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, act_info, dilation);
 
     // Permute output if needed
     if(_needs_permute)
@@ -126,26 +136,26 @@
 }
 
 Status CLDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                                unsigned int        depth_multiplier,
-                                                ActivationLayerInfo act_info, GPUTarget gpu_target)
+                                                unsigned int depth_multiplier, ActivationLayerInfo act_info, GPUTarget gpu_target, const Size2D &dilation)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
 
-    const bool                      is_nhwc               = input->data_layout() == DataLayout::NHWC;
-    const bool                      needs_permute         = is_nhwc && (depth_multiplier > 1);
-    const bool                      needs_weights_reshape = is_nhwc && (depth_multiplier == 1);
-    const bool                      is_stride_1           = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
-    const bool                      is_dot8_supported     = dot8_supported(CLKernelLibrary::get().get_device());
+    const bool                      is_nhwc                = input->data_layout() == DataLayout::NHWC;
+    const bool                      needs_permute          = is_nhwc && (depth_multiplier > 1);
+    const bool                      needs_weights_reshape  = is_nhwc && (depth_multiplier == 1);
+    const bool                      is_stride_1            = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+    const bool                      is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1);
+    const bool                      is_dot8_supported      = dot8_supported(CLKernelLibrary::get().get_device());
     DepthwiseConvolutionReshapeInfo info;
     info.c0        = 4;
-    info.transpose = is_stride_1 && is_dot8_supported;
+    info.transpose = is_stride_1_dilation_1 && is_dot8_supported;
 
     if(needs_permute)
     {
         TensorShape permuted_input_shape   = input->tensor_shape();
         TensorShape permuted_weights_shape = weights->tensor_shape();
-        TensorShape permuted_output_shape  = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+        TensorShape permuted_output_shape  = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
 
         permute(permuted_input_shape, PermutationVector(1U, 2U, 0U));
         permute(permuted_weights_shape, PermutationVector(1U, 2U, 0U));
@@ -155,7 +165,8 @@
         const TensorInfo permuted_weights = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NCHW);
         const TensorInfo permuted_output  = output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW);
 
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, conv_info, depth_multiplier, act_info, gpu_target));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, conv_info, depth_multiplier, act_info, gpu_target,
+                                                                                       dilation));
     }
     else if(is_nhwc)
     {
@@ -163,13 +174,13 @@
         {
             auto reshaped_weights_shape = arm_compute::misc::shape_calculator::compute_reshaped_depthwise_weights_shape(*weights, info);
             ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, &weights->clone()->set_tensor_shape(reshaped_weights_shape), biases, output, conv_info, depth_multiplier,
-                                                                                           act_info));
+                                                                                           act_info, dilation));
         }
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation));
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target, dilation));
     }
 
     return Status{};
@@ -179,7 +190,7 @@
 {
     prepare();
 
-    _memory_group.acquire();
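+    // Acquire the memory group for the duration of run(); it is released
+    // automatically when the scope guard is destroyed.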
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     if(_needs_permute)
     {
@@ -192,8 +203,6 @@
     {
         _permute_output_to_nhwc.run();
     }
-
-    _memory_group.release();
 }
 
 void CLDepthwiseConvolutionLayer3x3::prepare()
@@ -229,7 +238,7 @@
 }
 
 void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
-                                            unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
+                                            unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
@@ -238,12 +247,15 @@
     const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
     const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
 
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_w) + (weights->info()->dimension(idx_w) - 1) * (dilation.x() - 1) > input->info()->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_h) + (weights->info()->dimension(idx_h) - 1) * (dilation.y() - 1) > input->info()->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
+
     const bool can_run_optimised_3x3_kernel = (weights->info()->dimension(idx_w) == 3) && (weights->info()->dimension(idx_h) == 3);
 
     if(bool(can_run_optimised_3x3_kernel))
     {
         auto f = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3>();
-        f->configure(input, weights, biases, output, conv_info, depth_multiplier, act_info);
+        f->configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
         _optimised_function = std::move(f);
     }
     else
@@ -262,7 +274,7 @@
         const GPUTarget gpu_target  = CLScheduler::get().target();
 
         // Calculate output shape
-        TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
+        TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier, dilation);
 
         // Output auto initialization if not yet initialized
         auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
@@ -283,7 +295,7 @@
         shape_im2col.set(2, weights_z);
         _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
         _im2col_kernel.set_target(gpu_target);
-        _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
+        _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier, dilation);
         CLScheduler::get().tune_kernel_static(_im2col_kernel);
 
         // Weights reshape configuration
@@ -310,7 +322,8 @@
             const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
 
             float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
-            int   output_multiplier, output_shift;
+            int   output_multiplier;
+            int   output_shift;
             quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
             _output_stage_kernel.configure(&_output_reshaped, biases, output, output_multiplier, output_shift, output_quant_info.offset);
             _output_reshaped.allocator()->allocate();
@@ -345,11 +358,14 @@
 }
 
 Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                             unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
+                                             unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
 {
     const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
     const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
 
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
+
     const bool can_run_optimised_3x3_kernel = (weights->dimension(idx_w) == 3) && (weights->dimension(idx_h) == 3);
 
     if(can_run_optimised_3x3_kernel)
@@ -361,7 +377,7 @@
 
         const bool         is_quantized = is_data_type_quantized_asymmetric(input->data_type());
         const bool         append_bias  = (biases != nullptr) && !is_quantized;
-        const TensorShape  output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+        const TensorShape  output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
         const size_t       weights_w    = weights->dimension(idx_w);
         const size_t       weights_h    = weights->dimension(idx_h);
         const size_t       weights_z    = weights->dimension(idx_c);
@@ -375,7 +391,7 @@
         shape_im2col.set(1, conv_size);
         shape_im2col.set(2, weights_z);
         TensorInfo input_reshaped(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseIm2ColKernel::validate(input, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseIm2ColKernel::validate(input, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier, dilation));
 
         const TensorShape shape_weights_reshape(patch_size, weights_z);
         TensorInfo        weights_reshaped(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
@@ -405,7 +421,7 @@
     }
     else
     {
-        CLDepthwiseConvolutionLayer3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info);
+        CLDepthwiseConvolutionLayer3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, GPUTarget::MIDGARD, dilation);
     }
     return Status{};
 }
diff --git a/src/runtime/CL/functions/CLDequantizationLayer.cpp b/src/runtime/CL/functions/CLDequantizationLayer.cpp
index 6f33b2e..cdfdfc7 100644
--- a/src/runtime/CL/functions/CLDequantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLDequantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,36 +21,22 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-
 #include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h"
 
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h"
+#include "support/ToolchainSupport.h"
 
-using namespace arm_compute;
-
-CLDequantizationLayer::CLDequantizationLayer()
-    : _dequantize_kernel()
+namespace arm_compute
 {
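+// configure() only creates and stores the kernel; scheduling it is left to the
+// simple-function base class.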
+void CLDequantizationLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLDequantizationLayerKernel>();
+    k->configure(input, output);
+    _kernel = std::move(k);
 }
 
-Status CLDequantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+Status CLDequantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, min_max);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayerKernel::validate(input, output, min_max));
-
-    return Status{};
+    return CLDequantizationLayerKernel::validate(input, output);
 }
-
-void CLDequantizationLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *min_max)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, min_max);
-
-    _dequantize_kernel.configure(input, output, min_max);
-}
-
-void CLDequantizationLayer::run()
-{
-    // Run dequantization kernel
-    CLScheduler::get().enqueue(_dequantize_kernel, false);
-}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
new file mode 100644
index 0000000..6e14e26
--- /dev/null
+++ b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <memory>
+#include <tuple>
+
+namespace arm_compute
+{
+using namespace arm_compute::misc::shape_calculator;
+
+CLDirectDeconvolutionLayer::CLDirectDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+    : _memory_group(std::move(memory_manager)),
+      _scale_f(),
+      _conv_f(),
+      _flip_weights(),
+      _scaled_output(),
+      _original_weights(nullptr),
+      _weights_flipped(),
+      _flip_axis(),
+      _is_prepared(false)
+{
+}
+
+Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
+                                            const WeightsInfo &weights_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+
+    const DataLayout data_layout = input->data_layout();
+
+    const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h));
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(!info.padding_is_symmetric());
+
+    const unsigned int stride_x = info.stride().first;
+    const unsigned int stride_y = info.stride().second;
+
+    auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h),
+                                                    info.pad().first, info.pad().second, stride_x, stride_y);
+
+    const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
+
+    if(bias != nullptr)
+    {
+        if(is_data_type_quantized_asymmetric(input->data_type()))
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+        }
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias);
+    }
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w], "Output's width is invalid.");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h], "Output's height is invalid.");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c], "Output's depth is invalid.");
+
+    unsigned int        padx            = 0;
+    unsigned int        pady            = 0;
+    const TensorShape   scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, 0, 0, out_dims, padx, pady);
+    TensorInfo          scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape).set_data_layout(data_layout));
+    const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+
+    ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, BorderSize(), info));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info));
+
+    return Status{};
+}
+
+void CLDirectDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
+                                           const WeightsInfo &weights_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+    const unsigned int stride_x = info.stride().first;
+    const unsigned int stride_y = info.stride().second;
+
+    const DataLayout data_layout = input->info()->data_layout();
+
+    const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+    _original_weights = weights;
+    _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
+    _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
+    _flip_weights.configure(weights, &_weights_flipped, &_flip_axis);
+
+    auto out_dims = deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h), weights->info()->dimension(idx_w), weights->info()->dimension(idx_h),
+                                                    info.pad().first, info.pad().second, stride_x, stride_y);
+
+    const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info());
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(CLDirectDeconvolutionLayer::validate(input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info));
+
+    _is_prepared = weights_info.retain_internal_weights();
+
+    _memory_group.manage(&_scaled_output);
+
+    // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order to match output shape
+    unsigned int      padx            = 0;
+    unsigned int      pady            = 0;
+    const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y, 0, 0, out_dims, padx, pady);
+
+    TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+    scale_out_info.set_data_layout(data_layout);
+    _scaled_output.allocator()->init(scale_out_info);
+
+    // Configure the scale function
+    const PadStrideInfo upsample_info(stride_x, stride_y, padx / 2, pady / 2);
+    _scale_f.configure(input, &_scaled_output, BorderSize(), upsample_info);
+
+    // Setup the function to convolve the upscaled output
+    const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+    _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info);
+    _scaled_output.allocator()->allocate();
+
+    // Setup flip axis data
+    _flip_axis.allocator()->allocate();
+    _flip_axis.map(true);
+    auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer());
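+    // Flip the two spatial dimensions: width/height are dimensions 1 and 2 in NHWC
+    // and 0 and 1 in NCHW.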
+    if(weights->info()->data_layout() == DataLayout::NHWC)
+    {
+        axis_data[0] = 1;
+        axis_data[1] = 2;
+    }
+    else
+    {
+        axis_data[0] = 0;
+        axis_data[1] = 1;
+    }
+    _flip_axis.unmap();
+}
+
+void CLDirectDeconvolutionLayer::run()
+{
+    prepare();
+
+    MemoryGroupResourceScope scope_mg(_memory_group);
+
+    _scale_f.run();
+    _conv_f.run();
+}
+
+void CLDirectDeconvolutionLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        // Run weights flipping and mark original weights tensor as unused
+        _weights_flipped.allocator()->allocate();
+        _flip_weights.run();
+        _original_weights->mark_as_unused();
+
+        // Prepare convolution
+        _conv_f.prepare();
+
+        // Free flipped weights
+        if(!_weights_flipped.is_used())
+        {
+            _weights_flipped.allocator()->free();
+        }
+
+        _is_prepared = true;
+    }
+}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLFFT1D.cpp b/src/runtime/CL/functions/CLFFT1D.cpp
new file mode 100644
index 0000000..49b5a2a
--- /dev/null
+++ b/src/runtime/CL/functions/CLFFT1D.cpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLFFT1D.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/helpers/fft.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+CLFFT1D::CLFFT1D(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _digit_reverse_kernel(), _fft_kernels(), _scale_kernel(), _digit_reversed_input(), _digit_reverse_indices(), _num_ffts(0), _run_scale(false)
+{
+}
+
+void CLFFT1D::configure(const ICLTensor *input, ICLTensor *output, const FFT1DInfo &config)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(CLFFT1D::validate(input->info(), output->info(), config));
+
+    // Decompose size to radix factors
+    const auto         supported_radix   = CLFFTRadixStageKernel::supported_radix();
+    const unsigned int N                 = input->info()->tensor_shape()[config.axis];
+    const auto         decomposed_vector = arm_compute::helpers::fft::decompose_stages(N, supported_radix);
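+    // e.g. N = 12 may decompose into radix stages such as { 4, 3 }; an empty
+    // vector means N cannot be built from the supported radices.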
+    ARM_COMPUTE_ERROR_ON(decomposed_vector.empty());
+
+    // Flags
+    _run_scale        = config.direction == FFTDirection::Inverse;
+    const bool is_c2r = input->info()->num_channels() == 2 && output->info()->num_channels() == 1;
+
+    // Configure digit reverse
+    FFTDigitReverseKernelInfo digit_reverse_config;
+    digit_reverse_config.axis      = config.axis;
+    digit_reverse_config.conjugate = config.direction == FFTDirection::Inverse;
+    TensorInfo digit_reverse_indices_info(TensorShape(input->info()->tensor_shape()[config.axis]), 1, DataType::U32);
+    _digit_reverse_indices.allocator()->init(digit_reverse_indices_info);
+    _memory_group.manage(&_digit_reversed_input);
+    _digit_reverse_kernel.configure(input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config);
+
+    // Create and configure FFT kernels
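+    // Nx is the butterfly span of the current stage; it starts at 1 and is
+    // multiplied by each stage's radix.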
+    unsigned int Nx = 1;
+    _num_ffts       = decomposed_vector.size();
+    _fft_kernels.resize(_num_ffts);
+    for(unsigned int i = 0; i < _num_ffts; ++i)
+    {
+        const unsigned int radix_for_stage = decomposed_vector.at(i);
+
+        FFTRadixStageKernelInfo fft_kernel_info;
+        fft_kernel_info.axis           = config.axis;
+        fft_kernel_info.radix          = radix_for_stage;
+        fft_kernel_info.Nx             = Nx;
+        fft_kernel_info.is_first_stage = (i == 0);
+        _fft_kernels[i].configure(&_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info);
+
+        Nx *= radix_for_stage;
+    }
+
+    // Configure scale kernel
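+    // The inverse transform needs 1/N normalization; scale_config.scale carries N
+    // for the scale kernel to apply.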
+    if(_run_scale)
+    {
+        FFTScaleKernelInfo scale_config;
+        scale_config.scale     = static_cast<float>(N);
+        scale_config.conjugate = config.direction == FFTDirection::Inverse;
+        if(is_c2r)
+        {
+            _scale_kernel.configure(&_digit_reversed_input, output, scale_config);
+        }
+        else
+        {
+            _scale_kernel.configure(output, nullptr, scale_config);
+        }
+    }
+
+    // Allocate tensors
+    _digit_reversed_input.allocator()->allocate();
+    _digit_reverse_indices.allocator()->allocate();
+
+    // Init digit reverse indices
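+    // The permutation is computed on the host and copied once into the CL buffer.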
+    const auto digit_reverse_cpu = arm_compute::helpers::fft::digit_reverse_indices(N, decomposed_vector);
+    _digit_reverse_indices.map(CLScheduler::get().queue(), true);
+    std::copy_n(digit_reverse_cpu.data(), N, reinterpret_cast<unsigned int *>(_digit_reverse_indices.buffer()));
+    _digit_reverse_indices.unmap(CLScheduler::get().queue());
+}
+
+Status CLFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, const FFT1DInfo &config)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != 1 && input->num_channels() != 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+
+    // Check if FFT is decomposable
+    const auto         supported_radix   = CLFFTRadixStageKernel::supported_radix();
+    const unsigned int N                 = input->tensor_shape()[config.axis];
+    const auto         decomposed_vector = arm_compute::helpers::fft::decompose_stages(N, supported_radix);
+    ARM_COMPUTE_RETURN_ERROR_ON(decomposed_vector.empty());
+
+    // Checks performed when output is configured
+    if((output != nullptr) && (output->total_size() != 0))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() == 1 && input->num_channels() == 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+void CLFFT1D::run()
+{
+    MemoryGroupResourceScope scope_mg(_memory_group);
+
+    // Run digit reverse
+    CLScheduler::get().enqueue(_digit_reverse_kernel, false);
+
+    // Run radix kernels
+    for(unsigned int i = 0; i < _num_ffts; ++i)
+    {
+        CLScheduler::get().enqueue(_fft_kernels[i], i == (_num_ffts - 1) && !_run_scale);
+    }
+
+    // Run output scaling
+    if(_run_scale)
+    {
+        CLScheduler::get().enqueue(_scale_kernel, true);
+    }
+}
+} // namespace arm_compute
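
For reference, a minimal usage sketch of the new CLFFT1D function. The tensor shapes, the scheduler initialisation and the forward-direction enumerator are illustrative assumptions, not part of this patch:

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLFFT1D.h"

using namespace arm_compute;

void fft1d_sketch()
{
    CLScheduler::get().default_init();

    // Complex (2-channel) F32 signal of length 128 along axis 0.
    CLTensor src;
    CLTensor dst;
    src.allocator()->init(TensorInfo(TensorShape(128U), 2, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(128U), 2, DataType::F32));

    FFT1DInfo fft_info;
    fft_info.axis      = 0;
    fft_info.direction = FFTDirection::Forward; // assumed forward enumerator

    CLFFT1D fft;
    fft.configure(&src, &dst, fft_info); // builds the digit-reverse and radix stage kernels

    src.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill src ...
    fft.run();
}

The length along the chosen axis must decompose into the supported radix stages (128 = 2^7 does); otherwise validate() rejects the configuration.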
diff --git a/src/runtime/CL/functions/CLFFT2D.cpp b/src/runtime/CL/functions/CLFFT2D.cpp
new file mode 100644
index 0000000..165e784
--- /dev/null
+++ b/src/runtime/CL/functions/CLFFT2D.cpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLFFT2D.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+CLFFT2D::CLFFT2D(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(memory_manager), _first_pass_func(memory_manager), _second_pass_func(memory_manager), _first_pass_tensor()
+{
+}
+
+void CLFFT2D::configure(const ICLTensor *input, ICLTensor *output, const FFT2DInfo &config)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(CLFFT2D::validate(input->info(), output->info(), config));
+
+    // Setup first pass
+    FFT1DInfo first_pass_config;
+    first_pass_config.axis      = config.axes.first;
+    first_pass_config.direction = config.direction;
+    _memory_group.manage(&_first_pass_tensor);
+    _first_pass_func.configure(input, &_first_pass_tensor, first_pass_config);
+
+    // Setup second pass
+    FFT1DInfo second_pass_config;
+    second_pass_config.axis      = config.axes.second;
+    second_pass_config.direction = config.direction;
+    _second_pass_func.configure(&_first_pass_tensor, output, second_pass_config);
+    _first_pass_tensor.allocator()->allocate();
+}
+
+Status CLFFT2D::validate(const ITensorInfo *input, const ITensorInfo *output, const FFT2DInfo &config)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+    // Create intermediate tensor info
+    TensorInfo first_pass_tensor(input->clone()->set_is_resizable(true).reset_padding().set_num_channels(2));
+
+    // Validate first pass
+    FFT1DInfo first_pass_config;
+    first_pass_config.axis      = config.axes.first;
+    first_pass_config.direction = config.direction;
+    ARM_COMPUTE_RETURN_ON_ERROR(CLFFT1D::validate(input, &first_pass_tensor, first_pass_config));
+
+    // Validate second pass
+    FFT1DInfo second_pass_config;
+    second_pass_config.axis      = config.axes.second;
+    second_pass_config.direction = config.direction;
+    ARM_COMPUTE_RETURN_ON_ERROR(CLFFT1D::validate(&first_pass_tensor, output, second_pass_config));
+
+    // Checks performed when output is configured
+    if((output != nullptr) && (output->total_size() != 0))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+void CLFFT2D::run()
+{
+    MemoryGroupResourceScope scope_mg(_memory_group);
+
+    _first_pass_func.run();
+    _second_pass_func.run();
+}
+} // namespace arm_compute
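
The 2D transform above is simply two chained 1D passes over config.axes. A minimal sketch of an inverse 2D transform follows; shapes and scheduler setup are illustrative assumptions:

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLFFT2D.h"

using namespace arm_compute;

void fft2d_sketch()
{
    CLScheduler::get().default_init();

    // 64x64 complex (2-channel) F32 tensor.
    CLTensor src;
    CLTensor dst;
    src.allocator()->init(TensorInfo(TensorShape(64U, 64U), 2, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(64U, 64U), 2, DataType::F32));

    FFT2DInfo fft_info;
    fft_info.axes.first  = 0; // first 1D pass over width
    fft_info.axes.second = 1; // second 1D pass over height
    fft_info.direction   = FFTDirection::Inverse; // inverse pass also conjugates and scales by 1/N

    CLFFT2D fft;
    fft.configure(&src, &dst, fft_info);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    fft.run();
}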
diff --git a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
new file mode 100644
index 0000000..afb1cab
--- /dev/null
+++ b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
@@ -0,0 +1,384 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/helpers/fft.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CPP/CPPScheduler.h"
+
+namespace arm_compute
+{
+namespace
+{
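+// Returns how many extra samples are needed to pad N up to the next length
+// that decomposes into the radix stages supported by CLFFTRadixStageKernel.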
+int pad_decomposable(int N)
+{
+    const auto supported_radix = CLFFTRadixStageKernel::supported_radix();
+
+    int  pad           = 0;
+    bool is_decomposed = false;
+    while(!is_decomposed)
+    {
+        const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N++, supported_radix);
+        is_decomposed                = !decomposed_vector.empty();
+        if(!is_decomposed)
+        {
+            ++pad;
+        }
+    }
+    return pad;
+}
+} // namespace
+CLFFTConvolutionLayer::CLFFTConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(memory_manager),
+      _flip_weights_func(),
+      _permute_input_func(),
+      _permute_output_func(),
+      _permute_weights_func(),
+      _permute_bias_func(),
+      _pad_input_func(),
+      _pad_weights_func(),
+      _transform_input_func(memory_manager),
+      _transform_weights_func(),
+      _itransform_output_func(memory_manager),
+      _prod_func(),
+      _reduce_func(),
+      _extract_output_func(),
+      _bias_add_func(),
+      _activation_layer_func(),
+      _permuted_input(),
+      _permuted_weights(),
+      _permuted_bias(),
+      _permuted_output(),
+      _padded_input(),
+      _padded_weights(),
+      _flip_axis(),
+      _flipped_weights(),
+      _transformed_input(),
+      _transformed_weights(),
+      _input_weights_product(),
+      _output_product(),
+      _output_reduced(),
+      _itransformed_output(),
+      _reshaped_output(),
+      _bias_output(),
+      _original_weights(nullptr),
+      _original_bias(nullptr),
+      _is_activationlayer_enabled(false),
+      _needs_permute(false),
+      _has_bias(false),
+      _is_prepared(false)
+{
+}
+
+void CLFFTConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
+                                      const ActivationLayerInfo &act_info)
+{
+    _original_weights = weights;
+    _original_bias    = biases;
+
+    // Flag if bias addition is required
+    _has_bias = biases != nullptr;
+
+    // Get indices for the width and height
+    const size_t idx_width  = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+
+    // Input shape, kernel size and output tile
+    const Size2D input_dims  = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]);
+    const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]);
+    const Size2D pad_valid   = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1),
+                                      pad_decomposable(input_dims.y() + kernel_size.y() - 1));
+    // Tensors to use
+    ICLTensor       *input_to_use   = input;
+    const ICLTensor *weights_to_use = weights;
+    ICLTensor       *output_to_use  = _has_bias ? &_bias_output : output;
+
+    // Permute bias
+    if(biases != nullptr)
+    {
+        _permute_bias_func.configure(biases, &_permuted_bias, PermutationVector(1U, 2U, 0U));
+        _permuted_bias.info()->set_data_layout(DataLayout::NCHW);
+    }
+
+    // Permute input if needed
+    _needs_permute = input->info()->data_layout() == DataLayout::NHWC;
+    if(_needs_permute)
+    {
+        _memory_group.manage(&_permuted_input);
+        // Configure the function to transform the input tensor from NHWC -> NCHW
+        _permute_input_func.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+        _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+
+        // Configure the function to transform the weights tensor from HWI -> IHW
+        _permute_weights_func.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
+        _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
+
+        input_to_use   = &_permuted_input;
+        weights_to_use = &_permuted_weights;
+    }
+
+    // Flip weights
+    _flipped_weights.allocator()->init(weights_to_use->info()->clone()->set_is_resizable(true).reset_padding());
+    _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
+    _flip_weights_func.configure(weights_to_use, &_flipped_weights, &_flip_axis);
+
+    // Pad weights
+    const PaddingList padding_w = { { 0, input_dims.x() + pad_valid.x() - 1 }, { 0, input_dims.y() + pad_valid.y() - 1 } };
+    _pad_weights_func.configure(&_flipped_weights, &_padded_weights, padding_w);
+
+    // Transform weights
+    _transform_weights_func = support::cpp14::make_unique<CLFFT2D>();
+    _transform_weights_func->configure(&_padded_weights, &_transformed_weights, FFT2DInfo());
+
+    // Pad input
+    const PaddingList padding_in = { { 0, kernel_size.x() + pad_valid.x() - 1 }, { 0, kernel_size.y() + pad_valid.y() - 1 } };
+    _memory_group.manage(&_padded_input);
+    _pad_input_func.configure(input_to_use, &_padded_input, padding_in);
+    if(_needs_permute)
+    {
+        _permuted_input.allocator()->allocate();
+    }
+
+    // Transform input
+    _memory_group.manage(&_transformed_input);
+    _transform_input_func.configure(&_padded_input, &_transformed_input, FFT2DInfo());
+    _padded_input.allocator()->allocate();
+
+    // Perform product
+    _memory_group.manage(&_output_product);
+    _prod_func.configure(&_transformed_input, &_transformed_weights, &_output_product);
+    _transformed_input.allocator()->allocate();
+
+    // Perform reduction
+    _memory_group.manage(&_output_reduced);
+    _reduce_func.configure(&_output_product, &_output_reduced, 2, ReductionOperation::SUM);
+    _output_product.allocator()->allocate();
+
+    // Transform output
+    _memory_group.manage(&_itransformed_output);
+    FFT2DInfo itransform_info;
+    itransform_info.direction = FFTDirection::Inverse;
+    _itransformed_output.allocator()->init(_output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding());
+    _itransform_output_func.configure(&_output_reduced, &_itransformed_output, itransform_info);
+    _output_reduced.allocator()->allocate();
+
+    // Reshape output
+    TensorShape reshaped_shape = _itransformed_output.info()->tensor_shape();
+    reshaped_shape.remove_dimension(2);
+    _reshaped_output.allocator()->init(_itransformed_output.info()->clone()->set_tensor_shape(reshaped_shape));
+
+    // Extract correct region
+    const int start_left = kernel_size.x() - conv_info.pad_left() - 1;
+    const int start_top  = kernel_size.y() - conv_info.pad_top() - 1;
+    const int end_right  = _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x();
+    const int end_bottom = _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y();
+    if(_has_bias)
+    {
+        _memory_group.manage(&_bias_output);
+    }
+    else if(_needs_permute)
+    {
+        output_to_use = &_permuted_output;
+        _memory_group.manage(&_permuted_output);
+    }
+    _extract_output_func.configure(&_reshaped_output, output_to_use, Coordinates(start_left, start_top), Coordinates(end_right, end_bottom));
+    _itransformed_output.allocator()->allocate();
+
+    // Add bias
+    if(biases != nullptr)
+    {
+        output_to_use = output;
+        if(_needs_permute)
+        {
+            output_to_use = &_permuted_output;
+            _memory_group.manage(&_permuted_output);
+        }
+        auto_init_if_empty(*output_to_use->info(), *_bias_output.info());
+        _bias_add_func.configure(&_bias_output, &_permuted_bias, output_to_use, ConvertPolicy::WRAP);
+        _bias_output.allocator()->allocate();
+    }
+
+    // Permute output
+    if(_needs_permute)
+    {
+        // Configure the function to transform the convolved output from NCHW back to the original NHWC layout
+        _permuted_output.info()->set_data_layout(DataLayout::NCHW);
+        _permute_output_func.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
+
+        // Allocate tensors
+        _permuted_output.allocator()->allocate();
+    }
+
+    // Configure Activation Layer
+    _is_activationlayer_enabled = act_info.enabled();
+    if(_is_activationlayer_enabled)
+    {
+        _activation_layer_func.configure(output, nullptr, act_info);
+    }
+
+    // Setup flip axis data
+    _flip_axis.allocator()->allocate();
+    _flip_axis.map(true);
+    auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer());
+    axis_data[0]   = 0;
+    axis_data[1]   = 1;
+    _flip_axis.unmap();
+}
+
+Status CLFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                       const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+
+    // Get indices for the width and height
+    const size_t idx_width  = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+
+    // Input shape, kernel size and output tile
+    const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]);
+
+    // Strides
+    const auto strides = conv_info.stride();
+    ARM_COMPUTE_RETURN_ERROR_ON(strides.first != 1 || strides.second != 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(kernel_size.x() != kernel_size.y());
+    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || conv_info.pad_right() != (kernel_size.x() / 2));
+    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || conv_info.pad_bottom() != (kernel_size.y() / 2));
+
+    // Validate biases
+    if(biases != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->tensor_shape().x() != weights->tensor_shape()[3]);
+    }
+
+    // Checks performed when output is configured
+    if((output != nullptr) && (output->total_size() != 0))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width]));
+
+        // Validate Activation Layer
+        if(act_info.enabled())
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
+        }
+    }
+
+    return Status{};
+}
+
+void CLFFTConvolutionLayer::run()
+{
+    prepare();
+
+    MemoryGroupResourceScope scope_mg(_memory_group);
+
+    // Transform input
+    if(_needs_permute)
+    {
+        _permute_input_func.run();
+    }
+    _pad_input_func.run();
+    _transform_input_func.run();
+
+    // Perform operations in the frequency domain
+    _prod_func.run();
+    _reduce_func.run();
+
+    // Transform output
+    _itransform_output_func.run();
+    _reshaped_output.allocator()->import_memory(_itransformed_output.cl_buffer());
+    _extract_output_func.run();
+    // Add bias
+    if(_has_bias)
+    {
+        _bias_add_func.run();
+    }
+    if(_needs_permute)
+    {
+        _permute_output_func.run();
+    }
+
+    // Run activation layer
+    if(_is_activationlayer_enabled)
+    {
+        _activation_layer_func.run();
+    }
+}
+
+void CLFFTConvolutionLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        // Permute bias to NCHW
+        if(_original_bias != nullptr)
+        {
+            _permuted_bias.allocator()->allocate();
+            _permute_bias_func.run();
+            _original_bias->mark_as_unused();
+        }
+
+        const ICLTensor *cur_weights = _original_weights;
+        // Permute weights
+        if(_needs_permute)
+        {
+            ARM_COMPUTE_ERROR_ON(!cur_weights->is_used());
+
+            _permuted_weights.allocator()->allocate();
+            _permute_weights_func.run();
+            cur_weights->mark_as_unused();
+            cur_weights = &_permuted_weights;
+        }
+
+        // Flip weights
+        _flipped_weights.allocator()->allocate();
+        _flip_weights_func.run();
+        cur_weights->mark_as_unused();
+
+        // Pad weights
+        _padded_weights.allocator()->allocate();
+        _pad_weights_func.run();
+        _flipped_weights.mark_as_unused();
+        CLScheduler::get().queue().finish();
+        _flipped_weights.allocator()->free();
+
+        // Transform weights to frequency domain
+        _transformed_weights.allocator()->allocate();
+        _transform_weights_func->run();
+        _padded_weights.mark_as_unused();
+        CLScheduler::get().queue().finish();
+        // Delete object and release internal memory
+        _transform_weights_func.reset();
+        _padded_weights.allocator()->free();
+
+        _is_prepared = true;
+    }
+}
+} // namespace arm_compute
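
A minimal sketch of driving the new layer within the constraints validate() enforces (unit strides, square kernel, 'same' padding of kernel_size / 2 per border); every shape and value below is an illustrative assumption:

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h"

using namespace arm_compute;

void fft_conv_sketch()
{
    CLScheduler::get().default_init();

    // NCHW F32 tensors: 128x128 input with 16 channels, 5x5 kernels, 32 output maps.
    CLTensor src, weights, biases, dst;
    src.allocator()->init(TensorInfo(TensorShape(128U, 128U, 16U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(5U, 5U, 16U, 32U), 1, DataType::F32));
    biases.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(128U, 128U, 32U), 1, DataType::F32));

    // Stride 1, and pad 2 == 5 / 2 on every border, as validate() requires.
    const PadStrideInfo conv_info(1, 1, 2, 2);

    CLFFTConvolutionLayer conv;
    conv.configure(&src, &weights, &biases, &dst, conv_info, ActivationLayerInfo());

    src.allocator()->allocate();
    weights.allocator()->allocate();
    biases.allocator()->allocate();
    dst.allocator()->allocate();
    conv.run(); // the first run() triggers prepare(): weights are flipped, padded and transformed once
}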
diff --git a/src/runtime/CL/functions/CLFastCorners.cpp b/src/runtime/CL/functions/CLFastCorners.cpp
index d6cda91..fe2a18c 100644
--- a/src/runtime/CL/functions/CLFastCorners.cpp
+++ b/src/runtime/CL/functions/CLFastCorners.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -97,7 +97,7 @@
 {
     cl::CommandQueue q = CLScheduler::get().queue();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     if(_non_max)
     {
@@ -129,6 +129,4 @@
     }
 
     q.flush();
-
-    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 6a2aac6..7b9229c 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -231,7 +231,8 @@
     if(_is_quantized)
     {
         float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output->info()->quantization_info().scale;
-        int   output_multiplier, output_shift;
+        int   output_multiplier;
+        int   output_shift;
         quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
         _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier, output_shift, output->info()->quantization_info().offset);
         _gemmlowp_output.allocator()->allocate();
@@ -333,7 +334,7 @@
 {
     prepare();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Linearize input if it comes from a convolutional layer
     if(_is_fc_after_conv)
@@ -363,8 +364,6 @@
             CLScheduler::get().enqueue(_accumulate_biases_kernel);
         }
     }
-
-    _memory_group.release();
 }
 
 void CLFullyConnectedLayer::prepare()
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index e91038f..492709f 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -23,7 +23,10 @@
  */
 #include "arm_compute/runtime/CL/functions/CLGEMM.h"
 
+#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
 #include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h"
+#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/GPUTarget.h"
 #include "arm_compute/core/Helpers.h"
@@ -33,7 +36,6 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfiguration.h"
 #include "arm_compute/runtime/ITensorAllocator.h"
 
 namespace arm_compute
@@ -41,46 +43,6 @@
 using namespace arm_compute::misc::shape_calculator;
 using namespace arm_compute::cl_gemm;
 
-namespace
-{
-inline bool is_interleaved_transposed(unsigned int m, unsigned int n, unsigned int k, DataType data_type, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
-{
-    bool flag = true;
-
-    if(gpu_target_is_in(gpu_target, GPUTarget::G52, GPUTarget::G52LIT, GPUTarget::G71, GPUTarget::G72, GPUTarget::G76))
-    {
-        if((m > 1) && n < 16)
-        {
-            flag = true;
-        }
-        else
-        {
-            // COMPMID-852
-            if(k > 256 && m > 4 && is_data_type_float(data_type) && reshape_b_only_on_first_run)
-            {
-                constexpr float alpha = 3.2f;
-                constexpr float fact0 = 1.51f;
-                constexpr float fact1 = 1.66f;
-                constexpr float ops   = 12.0f;
-                const float     scale = k > 1024 ? 1.07f : 1.0f;
-                flag                  = alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops);
-            }
-            else
-            {
-                flag = false;
-            }
-        }
-    }
-    else
-    {
-        // We reshape the matrices only if we do not have the vector-by-matrix case and we reshape the matrix B only once
-        flag = m != 1 && reshape_b_only_on_first_run;
-    }
-
-    return flag;
-}
-} // namespace
-
 CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)),
       _mm_kernel(),
@@ -88,57 +50,102 @@
       _reshape_lhs_kernel(),
       _reshape_rhs_kernel(),
       _mm_reshaped_kernel(),
+      _mm_reshaped_only_rhs_kernel(),
       _tmp_a(),
       _tmp_b(),
       _original_b(nullptr),
-      _is_interleaved_transposed(false),
       _run_addition(false),
       _reshape_b_only_on_first_run(false),
       _is_prepared(false),
-      _is_new_gemm_reshaped(false)
+      _gemm_type(GEMMType::NATIVE)
 {
 }
 
-void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
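+// Heuristically selects which GEMM variant to run (native, reshaped v1/v2 or
+// reshaped-only-RHS) based on the matrix dimensions, data type and GPU target.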
+CLGEMM::GEMMType CLGEMM::select_gemm_type(unsigned int m, unsigned int n, unsigned int k, DataType data_type, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+    GEMMType gemm_type = GEMMType::RESHAPED_V1;
 
-    // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), alpha, beta, gemm_info));
+    if(gpu_target_is_in(gpu_target, GPUTarget::G52, GPUTarget::G52LIT, GPUTarget::G71, GPUTarget::G72, GPUTarget::G76))
+    {
+        if((m > 1) && (n < 16))
+        {
+            gemm_type = GEMMType::RESHAPED_V1;
+        }
+        else if((m == 1) && (data_type == DataType::F32))
+        {
+            gemm_type = GEMMType::RESHAPED_ONLY_RHS;
+        }
+        else
+        {
+            // COMPMID-852
+            if((k > 256) && (m > 4) && is_data_type_float(data_type) && reshape_b_only_on_first_run)
+            {
+                constexpr float alpha = 3.2f;
+                constexpr float fact0 = 1.51f;
+                constexpr float fact1 = 1.66f;
+                constexpr float ops   = 12.0f;
+                const float     scale = k > 1024 ? 1.07f : 1.0f;
+                gemm_type             = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops)) ? GEMMType::RESHAPED_V1 : GEMMType::NATIVE;
+            }
+            else
+            {
+                gemm_type = GEMMType::NATIVE;
+            }
+        }
 
-    // Check if we need to reshape the matrix B only on the first run
-    _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
-    _is_prepared                 = gemm_info.retain_internal_weights();
-    _original_b                  = b;
+        const auto workload = static_cast<float>((m * n) / 20.0f);
 
-    const ICLTensor *matrix_a = a;
-    const ICLTensor *matrix_b = b;
+        gemm_type = ((workload > 1600.0f) && (gemm_type == GEMMType::RESHAPED_V1) && (data_type == DataType::F32)) ? GEMMType::RESHAPED_V2 : gemm_type;
+    }
+    else
+    {
+        // We reshape the matrices only if we do not have the vector-by-matrix case and we reshape the matrix B only once
+        gemm_type = ((m != 1) && reshape_b_only_on_first_run) ? GEMMType::RESHAPED_V1 : GEMMType::NATIVE;
+    }
 
-    // Get the GPU target
-    const GPUTarget gpu_target = CLScheduler::get().target();
+    return gemm_type;
+}
+
+void CLGEMM::configure_native(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+    const unsigned int m          = gemm_info.reinterpret_input_as_3d() ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
+    const unsigned int n          = b->info()->dimension(0);
+    const unsigned int k          = a->info()->dimension(0);
+    const GPUTarget    gpu_target = CLScheduler::get().target();
 
     // Set the target for the kernels
-    _reshape_lhs_kernel.set_target(gpu_target);
     _mm_kernel.set_target(gpu_target);
 
-    // Arguments used by GEMMReshapeInfo
-    // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
-    // in order to know how the matrices have been reshaped
-    DataType           data_type                 = a->info()->data_type();
+    GEMMReshapeInfo reshape_info(m, n, k, 1, 1, gemm_info.depth_output_gemm3d(), gemm_info.reinterpret_input_as_3d());
+
+    // Configure and tune matrix multiply kernel
+    _mm_kernel.configure(a, b, c, output, alpha, beta, false, reshape_info, gemm_info.fp_mixed_precision());
+
+    // Tune kernel statically
+    CLScheduler::get().tune_kernel_static(_mm_kernel);
+}
+
+void CLGEMM::configure_reshaped_v1(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
     bool               reinterpret_input_as_3d   = gemm_info.reinterpret_input_as_3d();
     const unsigned int m                         = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
     const unsigned int n                         = b->info()->dimension(0);
     const unsigned int k                         = a->info()->dimension(0);
-    const unsigned int batch_size                = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
     const int          depth_output_gemm3d       = gemm_info.depth_output_gemm3d();
+    const GPUTarget    gpu_target                = CLScheduler::get().target();
     int                mult_transpose1xW_width   = 1;
     int                mult_interleave4x4_height = 1;
 
+    // Set the target for the kernels
+    _reshape_lhs_kernel.set_target(gpu_target);
+    _mm_kernel.set_target(gpu_target);
+
     if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
     {
         mult_transpose1xW_width   = 4;
         mult_interleave4x4_height = 2;
     }
+
     GEMMRHSMatrixInfo rhs_info;
     rhs_info.n0         = 16 / b->info()->element_size();
     rhs_info.k0         = 1;
@@ -153,112 +160,183 @@
     lhs_info.interleave = true;
     lhs_info.transpose  = true;
 
-    // Check if we need to reshape the matrix A and matrix B
-    _is_interleaved_transposed = is_interleaved_transposed(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run, gpu_target);
+    GEMMReshapeInfo reshape_info(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false);
 
-    // Check if we can run the new reshaped GEMM
-    const auto workload   = static_cast<float>((m * n) / 20.0f);
-    _is_new_gemm_reshaped = (workload > 1600.0f) && (get_arch_from_target(gpu_target) == GPUTarget::BIFROST) && _is_interleaved_transposed && (data_type == DataType::F32);
-
-    const bool add_matrix_c  = (beta != 0.f && c != nullptr);
-    const bool is_beta_one   = std::abs(1.0f - beta) < 0.00001f;
-    const bool use_fused_add = is_beta_one && (c != nullptr && c->info()->num_dimensions() == 1) && !_is_new_gemm_reshaped;
-
-    // if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
-    if(_is_interleaved_transposed)
+    _memory_group.manage(&_tmp_a);
+    if(!_reshape_b_only_on_first_run)
     {
-        reinterpret_input_as_3d = false;
-
-        matrix_a = &_tmp_a;
-        matrix_b = &_tmp_b;
-
-        // Manage intermediate buffers
-        _memory_group.manage(&_tmp_a);
-        if(!_reshape_b_only_on_first_run)
-        {
-            _memory_group.manage(&_tmp_b);
-        }
-        // _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
-
-        if(_is_new_gemm_reshaped)
-        {
-            GEMMLHSMatrixInfo lhs_info;
-
-            // Pick up the GEMM configuration
-            std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, data_type);
-
-            _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
-            _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
-
-            // Configure and tune matrix multiply kernel
-            _mm_reshaped_kernel.configure(matrix_a, matrix_b, output, alpha, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1,
-                                                                                                                 depth_output_gemm3d, reinterpret_input_as_3d));
-        }
-        else
-        {
-            // Configure interleave kernel
-            _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
-            // Configure transpose kernel
-            _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
-        }
+        _memory_group.manage(&_tmp_b);
     }
 
-    if(!_is_new_gemm_reshaped)
-    {
-        // Configure and tune matrix multiply kernel
-        _mm_kernel.configure(matrix_a, matrix_b, (add_matrix_c && !use_fused_add) ? nullptr : c, output, alpha, beta, _is_interleaved_transposed,
-                             GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, reinterpret_input_as_3d),
-                             gemm_info.fp_mixed_precision());
-        CLScheduler::get().tune_kernel_static(_mm_kernel);
-    }
+    // Configure interleave kernel
+    _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, reinterpret_input_as_3d);
 
-    if(_is_interleaved_transposed)
-    {
-        // Allocate intermediate tensors
-        _tmp_a.allocator()->allocate();
-        if(!_reshape_b_only_on_first_run)
-        {
-            _tmp_b.allocator()->allocate();
-        }
-    }
+    // Configure transpose kernel
+    _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
 
-    // Configure matrix addition kernel
-    if(add_matrix_c && !use_fused_add)
+    // Configure and tune matrix multiply kernel
+    _mm_kernel.configure(&_tmp_a, &_tmp_b, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision());
+
+    CLScheduler::get().tune_kernel_static(_mm_kernel);
+
+    // Allocate intermediate tensors
+    _tmp_a.allocator()->allocate();
+    if(!_reshape_b_only_on_first_run)
     {
-        _ma_kernel.configure(c, output, beta);
-        _run_addition = true;
+        _tmp_b.allocator()->allocate();
     }
 }
 
-Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+void CLGEMM::configure_reshaped_v2(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+    ARM_COMPUTE_ERROR_ON(c != nullptr);
+    ARM_COMPUTE_UNUSED(beta);
+    ARM_COMPUTE_UNUSED(c);
+
+    DataType           data_type               = a->info()->data_type();
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m                       = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
+    const unsigned int n                       = b->info()->dimension(0);
+    const unsigned int k                       = a->info()->dimension(0);
+    const unsigned int batch_size              = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
+    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
+    const GPUTarget    gpu_target              = CLScheduler::get().target();
+
+    // Set the target for the kernels
+    _reshape_lhs_kernel.set_target(gpu_target);
+    _mm_kernel.set_target(gpu_target);
+
+    GEMMReshapeInfo reshape_info(m, n, k, 1, 1, depth_output_gemm3d, false);
+
+    // Manage intermediate buffers
+    _memory_group.manage(&_tmp_a);
+    if(!_reshape_b_only_on_first_run)
+    {
+        _memory_group.manage(&_tmp_b);
+    }
+    // _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
+
+    GEMMLHSMatrixInfo lhs_info{};
+    GEMMRHSMatrixInfo rhs_info{};
+
+    // Pick up the GEMM configuration
+    std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedKernelConfigurationFactory::create(gpu_target);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get());
+
+    // Configure lhs_info and rhs_info
+    std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type);
+
+    _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
+    _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+
+    // Configure and tune matrix multiply kernel
+    _mm_reshaped_kernel.configure(&_tmp_a, &_tmp_b, output, alpha, lhs_info, rhs_info, reshape_info);
+
+    // Allocate intermediate tensors
+    _tmp_a.allocator()->allocate();
+    if(!_reshape_b_only_on_first_run)
+    {
+        _tmp_b.allocator()->allocate();
+    }
+}
+
+void CLGEMM::configure_reshaped_only_rhs(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+    ARM_COMPUTE_ERROR_ON(c != nullptr);
+    ARM_COMPUTE_UNUSED(beta);
+    ARM_COMPUTE_UNUSED(c);
+
+    DataType           data_type               = a->info()->data_type();
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m                       = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
+    const unsigned int n                       = b->info()->dimension(0);
+    const unsigned int k                       = a->info()->dimension(0);
+    const unsigned int batch_size              = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
+    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
+    const GPUTarget    gpu_target              = CLScheduler::get().target();
+
+    // Set the target for the kernels
+    _mm_kernel.set_target(gpu_target);
+
+    GEMMReshapeInfo reshape_info(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
+
+    // Manage intermediate buffers
+    if(!_reshape_b_only_on_first_run)
+    {
+        _memory_group.manage(&_tmp_b);
+    }
+
+    GEMMLHSMatrixInfo lhs_info{};
+    GEMMRHSMatrixInfo rhs_info{};
+
+    // Pick up the GEMM configuration
+    std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get());
+
+    // Configure lhs_info and rhs_info
+    std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type);
+
+    _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+
+    // Configure and tune matrix multiply kernel
+    _mm_reshaped_only_rhs_kernel.configure(a, &_tmp_b, output, alpha, lhs_info, rhs_info, reshape_info);
+
+    if(!_reshape_b_only_on_first_run)
+    {
+        _tmp_b.allocator()->allocate();
+    }
+}
+
+Status CLGEMM::validate_native(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
 {
     ARM_COMPUTE_UNUSED(alpha);
     ARM_COMPUTE_UNUSED(output);
 
-    // Check if we need to reshape the matrix B only on the first run
-    const bool reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+    // Get the GPU target
+    const GPUTarget    gpu_target              = CLScheduler::get().target();
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n                       = b->dimension(0);
+    const unsigned int k                       = a->dimension(0);
+    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
+    const bool         add_c                   = (beta != 0.f && c != nullptr);
+    const bool         is_beta_one             = std::abs(1.0f - beta) < 0.00001f;
+    const bool         fuse_add                = is_beta_one && (c != nullptr && c->num_dimensions() == 1);
 
-    const ITensorInfo *matrix_a_info = a;
-    const ITensorInfo *matrix_b_info = b;
+    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
+
+    // Validate matrix multiply
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta,
+                                                                     false, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
+
+    if(add_c && !fuse_add)
+    {
+        // Validate matrix addition kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
+    }
+
+    return Status{};
+}
+
+Status CLGEMM::validate_reshaped_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+    ARM_COMPUTE_UNUSED(alpha);
+    ARM_COMPUTE_UNUSED(output);
 
     TensorInfo tmp_a_info{};
     TensorInfo tmp_b_info{};
 
     // Get the GPU target
-    const GPUTarget gpu_target = CLScheduler::get().target();
-
-    // Arguments used by GEMMReshapeInfo
-    // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
-    // in order to know how the matrices have been reshaped
-    DataType           data_type                 = a->data_type();
-    bool               reinterpret_input_as_3d   = gemm_info.reinterpret_input_as_3d();
-    const unsigned int m                         = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const GPUTarget    gpu_target                = CLScheduler::get().target();
+    const unsigned int m                         = gemm_info.reinterpret_input_as_3d() ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
     const unsigned int n                         = b->dimension(0);
     const unsigned int k                         = a->dimension(0);
-    const unsigned int batch_size                = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
     int                mult_transpose1xW_width   = 1;
     int                mult_interleave4x4_height = 1;
     const int          depth_output_gemm3d       = gemm_info.depth_output_gemm3d();
+    const bool         add_c                     = (beta != 0.f && c != nullptr);
+    const bool         is_beta_one               = std::abs(1.0f - beta) < 0.00001f;
+    const bool         fuse_add                  = is_beta_one && (c != nullptr && c->num_dimensions() == 1);
 
     if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
     {
@@ -280,66 +358,21 @@
     lhs_info.interleave = true;
     lhs_info.transpose  = true;
 
-    // Check if we need to reshape the matrix A and matrix B
-    const bool run_interleave_transpose = is_interleaved_transposed(m, n, k, a->data_type(), reshape_b_only_on_first_run, gpu_target);
+    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false);
 
-    // Check if we can run the new reshaped GEMM
-    const auto workload             = static_cast<float>((m * n) / 20.0f);
-    const bool is_new_gemm_reshaped = (workload > 1600.f) && (get_arch_from_target(gpu_target) == GPUTarget::BIFROST) && run_interleave_transpose && (data_type == DataType::F32);
+    // Validate interleave kernel
+    auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
 
-    const bool add_matrix_c  = (beta != 0.f && c != nullptr);
-    const bool is_beta_one   = std::abs(1.0f - beta) < 0.00001f;
-    const bool use_fused_add = is_beta_one && (c != nullptr && c->num_dimensions() == 1) && !is_new_gemm_reshaped;
+    // Validate transpose kernel
+    auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
 
-    // if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
-    if(run_interleave_transpose)
-    {
-        reinterpret_input_as_3d = false;
-    }
+    // Validate matrix multiply
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(&tmp_a_info, &tmp_b_info, (add_c && fuse_add) ? c : nullptr, output, alpha, beta,
+                                                                     true, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
 
-    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, reinterpret_input_as_3d);
-
-    if(run_interleave_transpose)
-    {
-        matrix_a_info = &tmp_a_info;
-        matrix_b_info = &tmp_b_info;
-
-        if(is_new_gemm_reshaped)
-        {
-            GEMMLHSMatrixInfo lhs_info;
-
-            // Pick up the GEMM configuration
-            std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, data_type);
-
-            auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
-            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
-
-            auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
-            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
-
-            // Validate matrix multiply
-            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedKernel::validate(matrix_a_info, matrix_b_info, output, alpha, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1,
-                                                                                     depth_output_gemm3d, reinterpret_input_as_3d)));
-        }
-        else
-        {
-            // Validate interleave kernel
-            auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
-            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
-            // Validate transpose kernel
-            auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
-            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
-        }
-    }
-
-    if(!is_new_gemm_reshaped)
-    {
-        // Validate matrix multiply
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, (add_matrix_c && !use_fused_add) ? nullptr : c, output, alpha, beta,
-                                                                         run_interleave_transpose, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
-    }
-
-    if(add_matrix_c && !use_fused_add)
+    if(add_c && !fuse_add)
     {
         // Validate matrix addition kernel
         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
@@ -348,32 +381,263 @@
     return Status{};
 }
 
+Status CLGEMM::validate_reshaped_v2(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+    ARM_COMPUTE_UNUSED(alpha);
+    ARM_COMPUTE_UNUSED(output);
+
+    TensorInfo tmp_a_info{};
+    TensorInfo tmp_b_info{};
+
+    // Get the GPU target
+    const GPUTarget    gpu_target              = CLScheduler::get().target();
+    DataType           data_type               = a->data_type();
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n                       = b->dimension(0);
+    const unsigned int k                       = a->dimension(0);
+    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
+    const bool         add_c                   = (beta != 0.f && c != nullptr);
+
+    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, false);
+
+    GEMMLHSMatrixInfo lhs_info;
+    GEMMRHSMatrixInfo rhs_info;
+
+    // Pick up the GEMM configuration
+    std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedKernelConfigurationFactory::create(gpu_target);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(gemm_config.get());
+
+    // Configure lhs_info and rhs_info
+    std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type);
+
+    auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
+
+    auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
+
+    // Validate matrix multiply
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, output, alpha, lhs_info, rhs_info, reshape_info));
+
+    if(add_c)
+    {
+        // Validate matrix addition kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
+    }
+
+    return Status{};
+}
+
+Status CLGEMM::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+    ARM_COMPUTE_UNUSED(alpha);
+    ARM_COMPUTE_UNUSED(output);
+
+    TensorInfo tmp_b_info{};
+
+    // Get the GPU target
+    const GPUTarget    gpu_target              = CLScheduler::get().target();
+    const DataType     data_type               = a->data_type();
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n                       = b->dimension(0);
+    const unsigned int k                       = a->dimension(0);
+    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
+    const bool         add_c                   = (beta != 0.f && c != nullptr);
+
+    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
+
+    GEMMLHSMatrixInfo lhs_info;
+    GEMMRHSMatrixInfo rhs_info;
+
+    // Pick up the GEMM configuration
+    std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(gemm_config.get());
+
+    // Configure lhs_info and rhs_info
+    std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type);
+
+    auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
+
+    // Validate matrix multiply
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::validate(a, &tmp_b_info, output, alpha, lhs_info, rhs_info, reshape_info));
+
+    if(add_c)
+    {
+        // Validate matrix addition kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
+    }
+
+    return Status{};
+}
+
+void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), alpha, beta, gemm_info));
+
+    // Check if we need to reshape the matrix B only on the first run
+    _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+    _is_prepared                 = gemm_info.retain_internal_weights();
+    _original_b                  = b;
+
+    // Get the GPU target
+    const GPUTarget    gpu_target              = CLScheduler::get().target();
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m                       = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
+    const unsigned int n                       = b->info()->dimension(0);
+    const unsigned int k                       = a->info()->dimension(0);
+
+    // Select GEMMType
+    _gemm_type = select_gemm_type(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run, gpu_target);
+
+    const bool is_gemm_v2  = (_gemm_type == GEMMType::RESHAPED_V2) || (_gemm_type == GEMMType::RESHAPED_ONLY_RHS);
+    const bool add_c       = (beta != 0.f && c != nullptr);
+    const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f;
+    const bool fuse_add    = is_beta_one && (c != nullptr && c->info()->num_dimensions() == 1) && !is_gemm_v2;
+
+    switch(_gemm_type)
+    {
+        case GEMMType::NATIVE:
+        {
+            configure_native(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta, gemm_info);
+            break;
+        }
+        case GEMMType::RESHAPED_V1:
+        {
+            configure_reshaped_v1(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta, gemm_info);
+            break;
+        }
+        case GEMMType::RESHAPED_V2:
+        {
+            configure_reshaped_v2(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta, gemm_info);
+            break;
+        }
+        case GEMMType::RESHAPED_ONLY_RHS:
+        {
+            configure_reshaped_only_rhs(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta, gemm_info);
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("GEMMType not supported");
+        }
+    }
+
+    // Configure matrix addition kernel
+    if(add_c && !fuse_add)
+    {
+        _ma_kernel.configure(c, output, beta);
+        _run_addition = true;
+    }
+}
+
+Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+    // Get the GPU target
+    const GPUTarget    gpu_target              = CLScheduler::get().target();
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n                       = b->dimension(0);
+    const unsigned int k                       = a->dimension(0);
+
+    // Select GEMMType
+    GEMMType gemm_type = select_gemm_type(m, n, k, a->data_type(), gemm_info.reshape_b_only_on_first_run(), gpu_target);
+
+    switch(gemm_type)
+    {
+        case GEMMType::NATIVE:
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(validate_native(a, b, c, output, alpha, beta, gemm_info));
+            break;
+        }
+        case GEMMType::RESHAPED_V1:
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_v1(a, b, c, output, alpha, beta, gemm_info));
+            break;
+        }
+        case GEMMType::RESHAPED_V2:
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_v2(a, b, c, output, alpha, beta, gemm_info));
+            break;
+        }
+        case GEMMType::RESHAPED_ONLY_RHS:
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_only_rhs(a, b, c, output, alpha, beta, gemm_info));
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_RETURN_ERROR_MSG("GEMMType not supported");
+        }
+    }
+
+    return Status{};
+}
+
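
Because validate() mirrors configure() but takes ITensorInfo pointers, a GEMM can be checked before any CL buffers exist. A usage sketch with illustrative shapes, using arm_compute's innermost-first ordering (a: M x K stored as (K, M), b: K x N stored as (N, K)):

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/functions/CLGEMM.h"

using namespace arm_compute;

// Illustrative sizes only: M = 32, N = 128, K = 64.
bool can_run_sgemm()
{
    TensorInfo a(TensorShape(64U, 32U), 1, DataType::F32);
    TensorInfo b(TensorShape(128U, 64U), 1, DataType::F32);
    TensorInfo dst(TensorShape(128U, 32U), 1, DataType::F32);
    return CLGEMM::validate(&a, &b, nullptr, &dst, 1.f, 0.f, GEMMInfo()).error_code() == ErrorCode::OK;
}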
 void CLGEMM::run()
 {
     prepare();
 
-    _memory_group.acquire();
-
-    if(_is_interleaved_transposed)
-    {
-        // Run interleave kernel
-        CLScheduler::get().enqueue(_reshape_lhs_kernel, false);
-
-        if(!_reshape_b_only_on_first_run)
-        {
-            // Run transpose kernel
-            CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
-        }
-    }
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Run matrix multiply kernel
-    if(_is_new_gemm_reshaped)
+    switch(_gemm_type)
     {
-        CLScheduler::get().enqueue(_mm_reshaped_kernel, !_run_addition);
-    }
-    else
-    {
-        CLScheduler::get().enqueue(_mm_kernel, !_run_addition);
+        case GEMMType::NATIVE:
+        {
+            CLScheduler::get().enqueue(_mm_kernel, !_run_addition);
+            break;
+        }
+        case GEMMType::RESHAPED_V1:
+        {
+            // Run interleave kernel
+            CLScheduler::get().enqueue(_reshape_lhs_kernel, false);
+
+            if(!_reshape_b_only_on_first_run)
+            {
+                // Run transpose kernel
+                CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+            }
+
+            CLScheduler::get().enqueue(_mm_kernel, !_run_addition);
+            break;
+        }
+        case GEMMType::RESHAPED_V2:
+        {
+            // Run interleave kernel
+            CLScheduler::get().enqueue(_reshape_lhs_kernel, false);
+
+            if(!_reshape_b_only_on_first_run)
+            {
+                // Run transpose kernel
+                CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+            }
+
+            CLScheduler::get().enqueue(_mm_reshaped_kernel, !_run_addition);
+            break;
+        }
+        case GEMMType::RESHAPED_ONLY_RHS:
+        {
+            if(!_reshape_b_only_on_first_run)
+            {
+                // Run transpose kernel
+                CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+            }
+
+            CLScheduler::get().enqueue(_mm_reshaped_only_rhs_kernel, !_run_addition);
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("GEMMType not supported");
+        }
     }
 
     // Run matrix addition kernel
@@ -381,15 +645,13 @@
     {
         CLScheduler::get().enqueue(_ma_kernel);
     }
-
-    _memory_group.release();
 }
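
MemoryGroupResourceScope replaces the manual acquire()/release() pairs throughout this release, so intermediate memory is released on every exit path, early returns and exceptions included. A minimal stand-in showing the pattern (not the library class, which lives in the runtime headers):

// RAII guard sketch: acquire on construction, release on destruction.
template <typename Group>
class MemoryScopeSketch
{
public:
    explicit MemoryScopeSketch(Group &group) : _group(group) { _group.acquire(); }
    ~MemoryScopeSketch() { _group.release(); }
    MemoryScopeSketch(const MemoryScopeSketch &) = delete;
    MemoryScopeSketch &operator=(const MemoryScopeSketch &) = delete;

private:
    Group &_group;
};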
 
 void CLGEMM::prepare()
 {
     if(!_is_prepared)
     {
-        if(_is_interleaved_transposed && _reshape_b_only_on_first_run)
+        if(_gemm_type != GEMMType::NATIVE && _reshape_b_only_on_first_run)
         {
             // Run transpose kernel and mark original weights tensor as unused
             _tmp_b.allocator()->allocate();
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index 7105e85..03d516f 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -262,7 +262,7 @@
         shape_gemm.set(0, mat_weights_cols);
         shape_gemm.set(1, conv_w * conv_h);
 
-        // FIXME: input->clone() doesn't work with subtensors for grouped convolutions.
+        // TODO(COMPMID-2078): input->clone() doesn't work with subtensors for grouped convolutions.
         TensorInfo info_gemm(shape_gemm, 1, data_type);
         info_gemm.set_quantization_info(output->info()->quantization_info()).set_data_layout(input->info()->data_layout());
         _gemm_output.allocator()->init(info_gemm);
@@ -372,7 +372,9 @@
     const unsigned int kernel_width  = weights->dimension(idx_width);
     const unsigned int kernel_height = weights->dimension(idx_height);
 
-    TensorInfo         im2col_reshaped_info, info_gemm, weights_reshaped_info;
+    TensorInfo         im2col_reshaped_info{};
+    TensorInfo         info_gemm{};
+    TensorInfo         weights_reshaped_info{};
     const ITensorInfo *gemm_input_to_use  = input;
     const ITensorInfo *gemm_output_to_use = output;
     const ITensorInfo *weights_to_use     = weights;
@@ -526,7 +528,7 @@
 {
     prepare();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Run im2col
     if(!_skip_im2col)
@@ -562,8 +564,6 @@
     {
         _activationlayer_function.run();
     }
-
-    _memory_group.release();
 }
 
 void CLGEMMConvolutionLayer::prepare()
diff --git a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
new file mode 100644
index 0000000..bcb91e0
--- /dev/null
+++ b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
@@ -0,0 +1,373 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "utils/TypePrinter.h"
+
+#include <memory>
+#include <tuple>
+
+namespace arm_compute
+{
+namespace
+{
+std::pair<Coordinates, Coordinates> compute_start_end_slice_coordinates(const ITensorInfo &output_info, const PadStrideInfo &deconv_info, bool is_nchw)
+{
+    Coordinates start;
+    Coordinates end;
+
+    if(is_nchw)
+    {
+        start.set(0, deconv_info.pad_left());
+        start.set(1, deconv_info.pad_top());
+        end.set(0, output_info.dimension(0) - deconv_info.pad_right());
+        end.set(1, output_info.dimension(1) - deconv_info.pad_bottom());
+    }
+    else
+    {
+        start.set(0, 0);
+        start.set(1, deconv_info.pad_left());
+        start.set(2, deconv_info.pad_top());
+
+        end.set(0, output_info.dimension(0));
+        end.set(1, output_info.dimension(1) - deconv_info.pad_right());
+        end.set(2, output_info.dimension(2) - deconv_info.pad_bottom());
+    }
+
+    return { start, end };
+}
+} // namespace
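
For concreteness, the NCHW branch above keeps the interior of the padded output: start is (pad_left, pad_top) and end is (width - pad_right, height - pad_bottom), while NHWC keeps dimension 0 (channels) whole and applies the same bounds to dimensions 1 and 2. A standalone check with assumed numbers:

#include <cstdio>

// Assumed values: a 16x16 NCHW output with pads L=1, R=1, T=2, B=2
// is sliced to x in [1, 15) and y in [2, 14).
int main()
{
    const int out_w = 16, out_h = 16;
    const int pad_l = 1, pad_r = 1, pad_t = 2, pad_b = 2;
    std::printf("slice x:[%d,%d) y:[%d,%d)\n", pad_l, out_w - pad_r, pad_t, out_h - pad_b);
    return 0;
}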
+
+CLGEMMDeconvolutionLayer::CLGEMMDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+    : _memory_group(std::move(memory_manager)),
+      _mm_gemm(),
+      _mm_gemmlowp(),
+      _gemmlowp_output_stage(),
+      _permute_input_to_nhwc(),
+      _permute_weights_to_nhwc(),
+      _reshape_weights(),
+      _transpose_weights(),
+      _deconv_reshape(),
+      _slice_gemm(),
+      _gemmlowp_final(),
+      _reshaped_weights(),
+      _reshaped_weights_t(),
+      _permuted_input(),
+      _permuted_weights(),
+      _gemm_output(),
+      _slice_gemm_input(),
+      _original_weights(),
+      _is_prepared(false),
+      _padded_input(false),
+      _is_nchw(false),
+      _is_quantized(false)
+{
+}
+
+Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &deconv_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+
+    DataLayout data_layout  = input->data_layout();
+    const bool padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0;
+    const bool is_nchw      = input->data_layout() == DataLayout::NCHW;
+    const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+
+    const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const size_t idx_b = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != deconv_info.stride().first);
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) != deconv_info.stride().second);
+
+    TensorShape nhwc_weights_shape = weights->tensor_shape();
+    TensorShape nhwc_input_shape   = input->tensor_shape();
+
+    if(is_nchw)
+    {
+        permute(nhwc_weights_shape, PermutationVector(2, 0, 1));
+        permute(nhwc_input_shape, PermutationVector(2, 0, 1));
+
+        TensorInfo nhwc_input_info = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(nhwc_input_shape).set_data_layout(DataLayout::NCHW);
+
+        TensorInfo nhwc_weights_info = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(nhwc_weights_shape).set_data_layout(DataLayout::NCHW);
+
+        CLPermute::validate(weights, &nhwc_weights_info, PermutationVector(2, 0, 1));
+        CLPermute::validate(input, &nhwc_input_info, PermutationVector(2, 0, 1));
+    }
+
+    const TensorShape reshaped_shape = TensorShape(nhwc_weights_shape[0], nhwc_weights_shape[1] * nhwc_weights_shape[2] * nhwc_weights_shape[3]);
+    const TensorInfo  reshaped_info  = weights->clone()->set_tensor_shape(reshaped_shape).set_data_layout(DataLayout::NCHW).set_is_resizable(true);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(weights, &reshaped_info));
+
+    TensorShape      transposed_shape(reshaped_shape[1], reshaped_shape[0]);
+    const TensorInfo reshaped_t_info = reshaped_info.clone()->set_is_resizable(true).set_tensor_shape(transposed_shape);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(&reshaped_info, &reshaped_t_info));
+
+    TensorShape gemm_output_shape(weights->dimension(idx_w) * weights->dimension(idx_h) * weights->dimension(idx_b),
+                                  input->dimension(idx_w),
+                                  input->dimension(idx_h),
+                                  input->dimension(idx_b));
+
+    TensorInfo gemm_output_info = reshaped_t_info.clone()->set_tensor_shape(gemm_output_shape).set_is_resizable(true);
+    GEMMInfo   gemm_info(false, false, true, input->dimension(idx_h), true);
+
+    if(is_quantized)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input->clone()->set_tensor_shape(nhwc_input_shape), &reshaped_t_info, nullptr, &gemm_output_info.set_data_type(DataType::S32),
+                                                                           gemm_info));
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input->clone()->set_tensor_shape(nhwc_input_shape).set_is_resizable(true), &reshaped_t_info, nullptr, &gemm_output_info, 1.0f, 0.0f, gemm_info));
+    }
+
+    auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h),
+                                                    0, 0, deconv_info.stride().first, deconv_info.stride().second);
+    const TensorShape deconv_shape       = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights);
+    TensorInfo        col2im_output_info = gemm_output_info.clone()->set_tensor_shape(deconv_shape).set_is_resizable(true);
+
+    if(padded_input && is_quantized)
+    {
+        const auto start_end = compute_start_end_slice_coordinates(col2im_output_info, deconv_info, is_nchw);
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(&col2im_output_info, nullptr,
+                                                                                                  &col2im_output_info.clone()->set_is_resizable(true).set_data_type(DataType::QASYMM8)));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&col2im_output_info.clone()->set_is_resizable(true).set_data_type(DataType::QASYMM8), output, start_end.first, start_end.second));
+    }
+    else if(padded_input)
+    {
+        const auto start_end = compute_start_end_slice_coordinates(col2im_output_info, deconv_info, is_nchw);
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&col2im_output_info, output, start_end.first, start_end.second));
+    }
+    else if(is_quantized)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(&col2im_output_info, nullptr, output));
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, output, input, weights, deconv_info));
+    }
+
+    return Status{};
+}
+
+void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_ERROR_THROW_ON(CLGEMMDeconvolutionLayer::validate(input->info(),
+                                                                  weights->info(),
+                                                                  bias != nullptr ? bias->info() : nullptr,
+                                                                  output->info(),
+                                                                  deconv_info));
+
+    _original_weights = weights;
+    _padded_input     = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0;
+    _is_nchw          = input->info()->data_layout() == DataLayout::NCHW;
+    _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
+
+    const ICLTensor *input_to_use   = input;
+    const ICLTensor *weights_to_use = weights;
+
+    // If the data layout is NCHW, transform everything to NHWC. An alternative would be to
+    // do an outer product in NCHW and then accumulate through a reduction. That has two
+    // drawbacks: the outer product is less efficient than a full GEMM, and the reduction
+    // might be slower than GEMM.
+    if(_is_nchw)
+    {
+        _memory_group.manage(&_permuted_input);
+        _permute_input_to_nhwc.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
+
+        _permute_weights_to_nhwc.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
+
+        input_to_use   = &_permuted_input;
+        weights_to_use = &_permuted_weights;
+    }
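
This works because arm_compute shapes are innermost-first: an NCHW tensor is stored as (W, H, C, N) and, assuming the permute helper applies dst[i] = src[perm[i]], PermutationVector(2, 0, 1) produces the (C, W, H, N) storage order of NHWC. A small check of that reading:

#include <array>
#include <cassert>

int main()
{
    const std::array<unsigned int, 3> nchw{ 7U, 5U, 3U }; // (W, H, C)
    const std::array<unsigned int, 3> perm{ 2U, 0U, 1U };
    std::array<unsigned int, 3> nhwc{};
    for(unsigned int i = 0; i < 3; ++i)
    {
        nhwc[i] = nchw[perm[i]];
    }
    const std::array<unsigned int, 3> expected{ 3U, 7U, 5U }; // (C, W, H)
    assert(nhwc == expected);
    return 0;
}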
+
+    // Reshape the input weights. The weights will be reshaped only once during the call to prepare()
+    _reshaped_weights.allocator()->init(TensorInfo(TensorShape(weights_to_use->info()->dimension(0),
+                                                               weights_to_use->info()->dimension(1) * weights_to_use->info()->dimension(2) * weights_to_use->info()->dimension(3)),
+                                                   1,
+                                                   input->info()->data_type(), weights->info()->quantization_info()));
+
+    _reshape_weights.configure(weights_to_use, &_reshaped_weights);
+    _transpose_weights.configure(&_reshaped_weights, &_reshaped_weights_t);
+
+    const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+    GEMMInfo     gemm_info(false, false, true, input->info()->dimension(idx_h), true);
+
+    // Configure the matrix multiply: GEMMLowp for asymmetric quantized types, GEMM otherwise
+    if(_is_quantized)
+    {
+        _mm_gemmlowp.configure(input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, gemm_info);
+    }
+    else
+    {
+        _mm_gemm.configure(input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, 1.f, 0.0f, gemm_info);
+    }
+
+    if(_is_nchw)
+    {
+        _permuted_input.allocator()->allocate();
+    }
+
+    ICLTensor *deconv_reshape_output = nullptr;
+    ICLTensor *slice_output          = nullptr;
+    ICLTensor *output_stage_output   = nullptr;
+
+    if(_padded_input && _is_quantized)
+    {
+        _memory_group.manage(&_slice_gemm_input);
+        _memory_group.manage(&_gemmlowp_final);
+        deconv_reshape_output = &_gemmlowp_final;
+        output_stage_output   = &_slice_gemm_input;
+        slice_output          = output;
+    }
+    else if(_padded_input)
+    {
+        _memory_group.manage(&_slice_gemm_input);
+        deconv_reshape_output = &_slice_gemm_input;
+        slice_output          = output;
+    }
+    else if(_is_quantized)
+    {
+        _memory_group.manage(&_gemmlowp_final);
+        deconv_reshape_output = &_gemmlowp_final;
+        output_stage_output   = output;
+    }
+    else
+    {
+        deconv_reshape_output = output;
+    }
+
+    // Configure the deconvolution reshape kernel (a Col2Im-like step) on the output of GEMM
+    _deconv_reshape.configure(&_gemm_output, bias, deconv_reshape_output, input->info(), weights->info(), deconv_info);
+    _gemm_output.allocator()->allocate();
+
+    if(_is_quantized)
+    {
+        float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / _gemmlowp_final.info()->quantization_info().scale;
+        int   output_multiplier(0);
+        int   output_shift(0);
+        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+        _gemmlowp_output_stage.configure(&_gemmlowp_final, nullptr, output_stage_output, output_multiplier, output_shift, _gemmlowp_final.info()->quantization_info().offset);
+        _gemmlowp_final.allocator()->allocate();
+    }
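
The output stage folds the three float scales into one integer multiplier-and-shift pair so the requantization can run without floating point. A hedged reimplementation of what calculate_quantized_multiplier_less_than_one computes (my sketch, not the library's code): approximate a real multiplier in (0, 1) as q * 2^-(31 + shift) with a 32-bit q.

#include <cmath>
#include <cstdint>

// real = m * 2^exp with m in [0.5, 1); store q = round(m * 2^31) and
// shift = -exp, so real ~= q * 2^-(31 + shift).
void quantize_multiplier_sketch(double real, std::int32_t &q, int &shift)
{
    int exp = 0;
    const double m = std::frexp(real, &exp);
    shift = -exp;
    std::int64_t q64 = static_cast<std::int64_t>(std::llround(m * (1ll << 31)));
    if(q64 == (1ll << 31)) // rounding pushed m up to 1.0
    {
        q64 /= 2;
        --shift;
    }
    q = static_cast<std::int32_t>(q64);
}

// With the assumed scales input 0.5 and weights 0.25 against output 0.2,
// multiplier = 0.5 * 0.25 / 0.2 = 0.625, giving q = round(0.625 * 2^31)
// and shift = 0.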
+
+    // If the input was padded, the output needs to be sliced.
+    if(_padded_input)
+    {
+        const auto start_end = compute_start_end_slice_coordinates(*deconv_reshape_output->info(), deconv_info, _is_nchw);
+        _slice_gemm.configure(&_slice_gemm_input, slice_output, start_end.first, start_end.second);
+        _slice_gemm_input.allocator()->allocate();
+    }
+}
+
+void CLGEMMDeconvolutionLayer::run()
+{
+    prepare();
+
+    MemoryGroupResourceScope scope_mg(_memory_group);
+
+    if(_is_nchw)
+    {
+        _permute_input_to_nhwc.run();
+    }
+
+    if(_is_quantized)
+    {
+        _mm_gemmlowp.run();
+    }
+    else
+    {
+        _mm_gemm.run();
+    }
+
+    CLScheduler::get().enqueue(_deconv_reshape, false);
+
+    if(_is_quantized)
+    {
+        _gemmlowp_output_stage.run();
+    }
+
+    if(_padded_input)
+    {
+        _slice_gemm.run();
+    }
+}
+
+void CLGEMMDeconvolutionLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        if(_is_nchw)
+        {
+            _permuted_weights.allocator()->allocate();
+            _permute_weights_to_nhwc.run();
+        }
+
+        _reshaped_weights.allocator()->allocate();
+        _reshape_weights.run();
+
+        if(_is_nchw)
+        {
+            _permuted_weights.allocator()->free();
+        }
+
+        _reshaped_weights_t.allocator()->allocate();
+        _transpose_weights.run();
+
+        // Prepare gemm
+        if(!_is_quantized)
+        {
+            _mm_gemm.prepare();
+        }
+        else
+        {
+            _mm_gemmlowp.prepare();
+        }
+
+        // Free resources
+        if(!_reshaped_weights_t.is_used())
+        {
+            _reshaped_weights_t.allocator()->free();
+        }
+
+        _original_weights->mark_as_unused();
+        _is_prepared = true;
+    }
+}
+} // namespace arm_compute
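
The prepare() above follows the library-wide idiom for one-shot weight transformations: run them on first use, free intermediates as soon as their last consumer has run, and mark the original weights unused so their memory can be reclaimed. Schematically (pseudo-members, not the real class):

struct PreparedFunctionSketch
{
    bool is_prepared = false;

    void prepare()
    {
        if(is_prepared)
        {
            return;
        }
        // 1) allocate the transformed weight buffers (permute/reshape/transpose)
        // 2) run the one-shot transformation kernels
        // 3) free intermediates no later kernel references
        // 4) mark the original weights tensor as unused
        is_prepared = true;
    }

    void run()
    {
        prepare(); // idempotent: transformations happen only on the first call
        // ... enqueue the per-inference kernels ...
    }
};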
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 2a01db7..049db1d 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
@@ -31,7 +32,6 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfiguration.h"
 
 namespace arm_compute
 {
@@ -40,17 +40,16 @@
 
 namespace
 {
-inline bool is_gemm_reshaped(unsigned int m, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
+inline bool is_gemm_reshaped(bool reshape_b_only_on_first_run, GPUTarget gpu_target)
 {
-    return (get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) && (m > 1) && (reshape_b_only_on_first_run);
+    return (get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) && (reshape_b_only_on_first_run);
 }
 } // namespace
 
 CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)),
       _mm_kernel(),
-      _mm_reshaped_kernel(),
-      _mtx_a_reshape_kernel(),
+      _mm_reshaped_only_rhs_kernel(),
       _mtx_b_reshape_kernel(),
       _mtx_a_reduction_kernel(),
       _mtx_b_reduction_kernel(),
@@ -58,7 +57,6 @@
       _offset_contribution_output_stage_kernel(),
       _vector_sum_col(),
       _vector_sum_row(),
-      _tmp_a(),
       _tmp_b(),
       _mm_result_s32(),
       _original_b(nullptr),
@@ -86,7 +84,6 @@
     const GPUTarget gpu_target = CLScheduler::get().target();
 
     // Set the target for the kernels
-    _mtx_a_reshape_kernel.set_target(gpu_target);
     _mm_kernel.set_target(gpu_target);
 
     const ICLTensor *matrix_a = a;
@@ -105,29 +102,21 @@
     const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
 
     // Check if we need to reshape the matrix A and matrix B
-    _is_gemm_reshaped = is_gemm_reshaped(m, _reshape_b_only_on_first_run, gpu_target);
+    _is_gemm_reshaped = is_gemm_reshaped(_reshape_b_only_on_first_run, gpu_target);
 
     if(_is_gemm_reshaped)
     {
-        // if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
-        reinterpret_input_as_3d = false;
-
-        matrix_a = &_tmp_a;
         matrix_b = &_tmp_b;
 
-        _memory_group.manage(&_tmp_a);
         if(!_reshape_b_only_on_first_run)
         {
             _memory_group.manage(&_tmp_b);
         }
 
         // Pick up the GEMM configuration
-        std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, DataType::QASYMM8);
+        std::tie(lhs_info, rhs_info) = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
 
-        // Configure interleave kernel
-        _mtx_a_reshape_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
-
-        // Configure transpose kernel
+        // Configure reshape RHS kernel
         _mtx_b_reshape_kernel.configure(b, &_tmp_b, rhs_info);
     }
 
@@ -166,7 +155,7 @@
         if(_is_gemm_reshaped)
         {
             // Configure and tune matrix multiply kernel
-            _mm_reshaped_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+            _mm_reshaped_only_rhs_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
         }
         else
         {
@@ -185,7 +174,7 @@
         if(_is_gemm_reshaped)
         {
             // Configure and tune matrix multiply kernel
-            _mm_reshaped_kernel.configure(matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+            _mm_reshaped_only_rhs_kernel.configure(matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
         }
         else
         {
@@ -200,7 +189,6 @@
     // Allocate tensors
     if(_is_gemm_reshaped)
     {
-        _tmp_a.allocator()->allocate();
         if(!_reshape_b_only_on_first_run)
         {
             _tmp_b.allocator()->allocate();
@@ -231,11 +219,13 @@
     const ITensorInfo *matrix_a_info = a;
     const ITensorInfo *matrix_b_info = b;
 
-    TensorInfo        tmp_a_info{};
     TensorInfo        tmp_b_info{};
     GEMMRHSMatrixInfo rhs_info;
     GEMMLHSMatrixInfo lhs_info;
 
+    // Get the GPU target
+    const GPUTarget gpu_target = CLScheduler::get().target();
+
     bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
     const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
     const unsigned int n                       = b->dimension(0);
@@ -243,35 +233,24 @@
     const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
     const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
 
-    bool reshape_matrices = is_gemm_reshaped(m, gemm_info.reshape_b_only_on_first_run(), CLScheduler::get().target());
-
-    // if reshape_matrices is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
-    if(reshape_matrices)
-    {
-        reinterpret_input_as_3d = false;
-    }
+    bool reshape_matrix_b = is_gemm_reshaped(gemm_info.reshape_b_only_on_first_run(), CLScheduler::get().target());
 
     const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
 
-    if(reshape_matrices)
+    if(reshape_matrix_b)
     {
-        matrix_a_info = &tmp_a_info;
         matrix_b_info = &tmp_b_info;
 
         // Pick up the GEMM configuration
-        std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, DataType::QASYMM8);
+        std::tie(lhs_info, rhs_info) = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
 
-        // Validate interleave kernel
-        auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
-
-        // Validate transpose kernel
-
+        // Validate reshape RHS kernel
         auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
     }
 
-    TensorInfo info_vector_sum_col, info_vector_sum_row;
+    TensorInfo info_vector_sum_col{};
+    TensorInfo info_vector_sum_row{};
 
     // Validate matrix B reduction kernel only if _a_offset is not equal to 0
     if(a_offset != 0)
@@ -295,13 +274,13 @@
     {
         TensorInfo mm_result_s32_info{};
 
-        if(reshape_matrices)
+        if(reshape_matrix_b)
         {
             // Output tensor auto initialization if not yet initialized
             auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32));
 
             // Validate matrix multiply
-            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
         }
         else
         {
@@ -322,22 +301,25 @@
     }
     else
     {
-        if(reshape_matrices)
+        if(reshape_matrix_b)
         {
             // Validate matrix multiply
-            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
         }
         else
         {
             // Validate matrix multiply
             ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, false, reshape_info));
         }
-        // Validate offset contribution kernel
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionKernel::validate(output,
-                                                                                 a_offset == 0 ? nullptr : &info_vector_sum_col,
-                                                                                 b_offset == 0 ? nullptr : &info_vector_sum_row,
-                                                                                 c,
-                                                                                 a_offset, b_offset));
+        if(output->total_size() != 0)
+        {
+            // Validate offset contribution kernel
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionKernel::validate(output,
+                                                                                     a_offset == 0 ? nullptr : &info_vector_sum_col,
+                                                                                     b_offset == 0 ? nullptr : &info_vector_sum_row,
+                                                                                     c,
+                                                                                     a_offset, b_offset));
+        }
     }
 
     return Status{};
@@ -347,13 +329,10 @@
 {
     prepare();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     if(_is_gemm_reshaped)
     {
-        // Run reshape matrix A
-        CLScheduler::get().enqueue(_mtx_a_reshape_kernel, false);
-
         if(!_reshape_b_only_on_first_run)
         {
             // Run reshape matrix B
@@ -370,7 +349,7 @@
     // Run matrix multiply
     if(_is_gemm_reshaped)
     {
-        CLScheduler::get().enqueue(_mm_reshaped_kernel, false);
+        CLScheduler::get().enqueue(_mm_reshaped_only_rhs_kernel, false);
     }
     else
     {
@@ -393,8 +372,6 @@
         // Run offset contribution kernel
         CLScheduler::get().enqueue(_offset_contribution_kernel, true);
     }
-
-    _memory_group.release();
 }
 
 void CLGEMMLowpMatrixMultiplyCore::prepare()
@@ -422,4 +399,4 @@
         _is_prepared = true;
     }
 }
-} // namespace arm_compute
+} // namespace arm_compute
\ No newline at end of file
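
With the switch to the reshaped-only-RHS kernel, only matrix B (typically constant weights) is repacked, and only once when reshape_b_only_on_first_run is set, while A is read in its original layout; that is also why reinterpret_input_as_3d no longer has to be forced off. A conceptual sketch of such a repack (illustrative blocked layout, not the library kernel):

#include <cstddef>
#include <vector>

// Repack a K x N row-major B into N0-wide column panels; each panel stores
// its K rows contiguously, matching the GEMM inner loop's access order.
std::vector<float> pack_rhs_sketch(const std::vector<float> &b, std::size_t k, std::size_t n, std::size_t n0)
{
    const std::size_t num_panels = (n + n0 - 1) / n0;
    std::vector<float> packed(k * num_panels * n0, 0.f);
    std::size_t out = 0;
    for(std::size_t panel = 0; panel < n; panel += n0)
    {
        for(std::size_t row = 0; row < k; ++row)
        {
            for(std::size_t col = panel; col < panel + n0; ++col)
            {
                packed[out++] = (col < n) ? b[row * n + col] : 0.f; // zero-pad the last panel
            }
        }
    }
    return packed;
}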
diff --git a/src/runtime/CL/functions/CLGaussian5x5.cpp b/src/runtime/CL/functions/CLGaussian5x5.cpp
index f30eee1..ea803e4 100644
--- a/src/runtime/CL/functions/CLGaussian5x5.cpp
+++ b/src/runtime/CL/functions/CLGaussian5x5.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -62,10 +62,8 @@
 {
     CLScheduler::get().enqueue(_border_handler, false);
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     CLScheduler::get().enqueue(_kernel_hor, false);
     CLScheduler::get().enqueue(_kernel_vert);
-
-    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLGaussianPyramid.cpp b/src/runtime/CL/functions/CLGaussianPyramid.cpp
index fd82769..b671b23 100644
--- a/src/runtime/CL/functions/CLGaussianPyramid.cpp
+++ b/src/runtime/CL/functions/CLGaussianPyramid.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -76,10 +76,10 @@
 
     if(num_levels > 1)
     {
-        _horizontal_border_handler = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(num_levels - 1);
-        _vertical_border_handler   = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(num_levels - 1);
-        _horizontal_reduction      = arm_compute::support::cpp14::make_unique<CLGaussianPyramidHorKernel[]>(num_levels - 1);
-        _vertical_reduction        = arm_compute::support::cpp14::make_unique<CLGaussianPyramidVertKernel[]>(num_levels - 1);
+        _horizontal_border_handler.resize(num_levels - 1);
+        _vertical_border_handler.resize(num_levels - 1);
+        _horizontal_reduction.resize(num_levels - 1);
+        _vertical_reduction.resize(num_levels - 1);
 
         // Apply half scale to the X dimension of the tensor shape
         TensorShape tensor_shape = pyramid->info()->tensor_shape();
@@ -153,8 +153,8 @@
 
     if(num_levels > 1)
     {
-        _gauss5x5      = arm_compute::support::cpp14::make_unique<CLGaussian5x5[]>(num_levels - 1);
-        _scale_nearest = arm_compute::support::cpp14::make_unique<CLScaleKernel[]>(num_levels - 1);
+        _gauss5x5.resize(num_levels - 1);
+        _scale_nearest.resize(num_levels - 1);
 
         PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_ORB, pyramid->info()->tensor_shape(), Format::U8);
 
diff --git a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
index c50132e..d712a23 100644
--- a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
+++ b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
@@ -256,7 +256,7 @@
 void CLGenerateProposalsLayer::run()
 {
     // Acquire all the temporaries
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Compute all the anchors
     CLScheduler::get().enqueue(_compute_anchors_kernel, false);
@@ -277,8 +277,5 @@
     // Add dummy batch indexes
     CLScheduler::get().enqueue(_memset_kernel, true);
     CLScheduler::get().enqueue(_padded_copy_kernel, true);
-
-    // Release all the temporaries
-    _memory_group.release();
 }
 } // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLHOGDescriptor.cpp b/src/runtime/CL/functions/CLHOGDescriptor.cpp
index 1470d5c..0931443 100644
--- a/src/runtime/CL/functions/CLHOGDescriptor.cpp
+++ b/src/runtime/CL/functions/CLHOGDescriptor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -95,7 +95,7 @@
 
 void CLHOGDescriptor::run()
 {
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Run gradient
     _gradient.run();
@@ -105,6 +105,4 @@
 
     // Run block normalization
     CLScheduler::get().enqueue(_block_norm);
-
-    _memory_group.release();
 }
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGGradient.cpp b/src/runtime/CL/functions/CLHOGGradient.cpp
index 51aeaed..e509fd8 100644
--- a/src/runtime/CL/functions/CLHOGGradient.cpp
+++ b/src/runtime/CL/functions/CLHOGGradient.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -71,13 +71,11 @@
 
 void CLHOGGradient::run()
 {
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Run derivative
     _derivative.run();
 
     // Run magnitude/phase kernel
     CLScheduler::get().enqueue(_mag_phase);
-
-    _memory_group.release();
 }
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGMultiDetection.cpp b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
index 8012c2f..f799d61 100644
--- a/src/runtime/CL/functions/CLHOGMultiDetection.cpp
+++ b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -128,12 +128,11 @@
     _num_block_norm_kernel  = input_block_norm.size(); // Number of CLHOGBlockNormalizationKernel kernels to compute
     _num_hog_detect_kernel  = input_hog_detect.size(); // Number of CLHOGDetector functions to compute
 
-    _orient_bin_kernel = arm_compute::support::cpp14::make_unique<CLHOGOrientationBinningKernel[]>(_num_orient_bin_kernel);
-    _block_norm_kernel = arm_compute::support::cpp14::make_unique<CLHOGBlockNormalizationKernel[]>(_num_block_norm_kernel);
-    _hog_detect_kernel = arm_compute::support::cpp14::make_unique<CLHOGDetector[]>(_num_hog_detect_kernel);
-    _non_maxima_kernel = arm_compute::support::cpp14::make_unique<CPPDetectionWindowNonMaximaSuppressionKernel>();
-    _hog_space         = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_orient_bin_kernel);
-    _hog_norm_space    = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_block_norm_kernel);
+    _orient_bin_kernel.resize(_num_orient_bin_kernel);
+    _block_norm_kernel.resize(_num_block_norm_kernel);
+    _hog_detect_kernel.resize(_num_hog_detect_kernel);
+    _hog_space.resize(_num_orient_bin_kernel);
+    _hog_norm_space.resize(_num_block_norm_kernel);
 
     // Allocate tensors for magnitude and phase
     TensorInfo info_mag(shape_img, Format::S16);
@@ -172,10 +171,10 @@
         _hog_space[i].allocator()->init(info_space);
 
         // Manage intermediate buffers
-        _memory_group.manage(_hog_space.get() + i);
+        _memory_group.manage(&_hog_space[i]);
 
         // Initialise orientation binning kernel
-        _orient_bin_kernel[i].configure(&_mag, &_phase, _hog_space.get() + i, multi_hog->model(idx_multi_hog)->info());
+        _orient_bin_kernel[i].configure(&_mag, &_phase, &_hog_space[i], multi_hog->model(idx_multi_hog)->info());
     }
 
     // Allocate intermediate tensors
@@ -193,10 +192,10 @@
         _hog_norm_space[i].allocator()->init(tensor_info);
 
         // Manage intermediate buffers
-        _memory_group.manage(_hog_norm_space.get() + i);
+        _memory_group.manage(&_hog_norm_space[i]);
 
         // Initialize block normalization kernel
-        _block_norm_kernel[i].configure(_hog_space.get() + idx_orient_bin, _hog_norm_space.get() + i, multi_hog->model(idx_multi_hog)->info());
+        _block_norm_kernel[i].configure(&_hog_space[idx_orient_bin], &_hog_norm_space[i], multi_hog->model(idx_multi_hog)->info());
     }
 
     // Allocate intermediate tensors
@@ -212,13 +211,13 @@
     {
         const size_t idx_block_norm = input_hog_detect[i];
 
-        _hog_detect_kernel[i].configure(_hog_norm_space.get() + idx_block_norm, multi_hog->cl_model(i), detection_windows, detection_window_strides->at(i), threshold, i);
+        _hog_detect_kernel[i].configure(&_hog_norm_space[idx_block_norm], multi_hog->cl_model(i), detection_windows, detection_window_strides->at(i), threshold, i);
     }
 
     detection_window_strides->unmap(CLScheduler::get().queue());
 
     // Configure non maxima suppression kernel
-    _non_maxima_kernel->configure(_detection_windows, min_distance);
+    _non_maxima_kernel.configure(_detection_windows, min_distance);
 
     // Allocate intermediate tensors
     for(size_t i = 0; i < _num_block_norm_kernel; ++i)
@@ -231,7 +230,7 @@
 {
     ARM_COMPUTE_ERROR_ON_MSG(_detection_windows == nullptr, "Unconfigured function");
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Reset detection window
     _detection_windows->clear();
@@ -242,13 +241,13 @@
     // Run orientation binning kernel
     for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
     {
-        CLScheduler::get().enqueue(*(_orient_bin_kernel.get() + i), false);
+        CLScheduler::get().enqueue(_orient_bin_kernel[i], false);
     }
 
     // Run block normalization kernel
     for(size_t i = 0; i < _num_block_norm_kernel; ++i)
     {
-        CLScheduler::get().enqueue(*(_block_norm_kernel.get() + i), false);
+        CLScheduler::get().enqueue(_block_norm_kernel[i], false);
     }
 
     // Run HOG detector kernel
@@ -262,9 +261,7 @@
     {
         // Map detection windows array before computing non maxima suppression
         _detection_windows->map(CLScheduler::get().queue(), true);
-        Scheduler::get().schedule(_non_maxima_kernel.get(), Window::DimY);
+        Scheduler::get().schedule(&_non_maxima_kernel, Window::DimY);
         _detection_windows->unmap(CLScheduler::get().queue());
     }
-
-    _memory_group.release();
 }
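
The container change in this file is the same modernization as in CLGaussianPyramid above: unique_ptr<T[]> with pointer arithmetic becomes std::vector with operator[]. In miniature:

#include <cstddef>
#include <vector>

struct KernelSketch // stand-in for e.g. a CL kernel wrapper
{
    void configure(int id) { (void)id; }
};

void build_kernels(std::size_t n)
{
    std::vector<KernelSketch> kernels;
    kernels.resize(n); // replaces make_unique<KernelSketch[]>(n)
    for(std::size_t i = 0; i < n; ++i)
    {
        kernels[i].configure(static_cast<int>(i)); // replaces (kernels.get() + i)->configure(...)
    }
}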
diff --git a/src/runtime/CL/functions/CLHarrisCorners.cpp b/src/runtime/CL/functions/CLHarrisCorners.cpp
index 65ce7de..67f550d3 100644
--- a/src/runtime/CL/functions/CLHarrisCorners.cpp
+++ b/src/runtime/CL/functions/CLHarrisCorners.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -55,7 +55,7 @@
       _gy(),
       _score(),
       _nonmax(),
-      _corners_list(nullptr),
+      _corners_list(),
       _num_corner_candidates(0),
       _corners(nullptr)
 {
@@ -84,7 +84,7 @@
     _score.allocator()->init(info_f32);
     _nonmax.allocator()->init(info_f32);
 
-    _corners_list = arm_compute::support::cpp14::make_unique<InternalKeypoint[]>(shape.x() * shape.y());
+    _corners_list.resize(shape.x() * shape.y());
 
     // Manage intermediate buffers
     _memory_group.manage(&_gx);
@@ -146,20 +146,20 @@
     _score.allocator()->allocate();
 
     // Init corner candidates kernel
-    _candidates.configure(&_nonmax, _corners_list.get(), &_num_corner_candidates);
+    _candidates.configure(&_nonmax, _corners_list.data(), &_num_corner_candidates);
 
     // Allocate intermediate buffers
     _nonmax.allocator()->allocate();
 
     // Init euclidean distance
-    _sort_euclidean.configure(_corners_list.get(), _corners, &_num_corner_candidates, min_dist);
+    _sort_euclidean.configure(_corners_list.data(), _corners, &_num_corner_candidates, min_dist);
 }
 
 void CLHarrisCorners::run()
 {
     ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Init to 0 number of corner candidates
     _num_corner_candidates = 0;
@@ -185,6 +185,4 @@
     _corners->map(CLScheduler::get().queue(), true);
     Scheduler::get().schedule(&_sort_euclidean, Window::DimY);
     _corners->unmap(CLScheduler::get().queue());
-
-    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
index 2e3c6d7..136cb5e 100644
--- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
+++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -74,11 +74,9 @@
 
 void CLL2NormalizeLayer::run()
 {
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     _reduce_func.run();
     CLScheduler::get().enqueue(_normalize_kernel, true);
-
-    _memory_group.release();
 }
 } // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp
index f01b1b8..4606a66 100644
--- a/src/runtime/CL/functions/CLLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayer.cpp
@@ -43,10 +43,11 @@
       _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _gemm_cell_state2(), _transpose_cell_state(), _accum_cell_state1(), _accum_cell_state2(),
       _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(), _gemm_output(), _pixelwise_mul_output_state1(), _transpose_output(),
       _accum_output1(), _accum_output2(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), _fully_connected_output_state(), _gemm_output_state(), _accum_output_state(),
-      _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _input_gate_out5(),
-      _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(),
-      _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _output5(), _cell_state_activation(), _output_state1(), _ones(), _run_peephole_opt(false), _run_cifg_opt(false),
-      _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false)
+      _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(), _concat_weights_input_gate(), _concat_weights_output(),
+      _ones_memset_kernel(), _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(),
+      _forget_gate_out5(), _forget_gate_out6(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(),
+      _cell_state_activation(), _output_state1(), _ones(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false),
+      _is_prepared(false)
 {
 }
 
@@ -93,25 +94,38 @@
                                                      lstm_params_info, activation_info, cell_threshold, projection_threshold));
 
     const TensorShape cell_state_shape = cell_state_in->info()->tensor_shape();
-
     // Configure block that calculates the forget gate
     // forget_gate = Activation(input * input_to_forget_weights + output_state_in * recurrent_to_forget_weights + PixelWiseMul(cell_state, cell_to_forget_weights) + forget_gate_bias)
-    TensorShape forget_gate1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+    // We optimize this as follows:
+    // forget_gate = Activation( (input,output_state_in) * (input_to_forget_weights,recurrent_to_forget_weights) + PixelWiseMul(cell_state, cell_to_forget_weights) + forget_gate_bias)
     _forget_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
-    _forget_gate_out2.allocator()->init(TensorInfo(forget_gate1_shape, 1, input->info()->data_type()));
     _forget_gate_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
     _forget_gate_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
 
-    _memory_group.manage(&_forget_gate_out1);
-    _fully_connected_forget_gate.configure(input, input_to_forget_weights, forget_gate_bias, &_forget_gate_out1);
+    std::vector<const ICLTensor *> inputs_vector;
+    inputs_vector.emplace_back(input);
+    inputs_vector.emplace_back(output_state_in);
+    const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0);
+    _forget_gate_out2.allocator()->init(TensorInfo(concat_shape, 1, input->info()->data_type()));
+
     _memory_group.manage(&_forget_gate_out2);
-    _transpose_forget_gate.configure(recurrent_to_forget_weights, &_forget_gate_out2);
-    _memory_group.manage(&_forget_gate_out3);
-    _gemm_forget_gate.configure(output_state_in, &_forget_gate_out2, nullptr, &_forget_gate_out3, 1.f, 0.f);
-    _forget_gate_out2.allocator()->allocate();
+    _concat_inputs_forget_gate.configure(input, output_state_in, &_forget_gate_out2);
+
+    std::vector<const ICLTensor *> weights_vector;
+
+    weights_vector.emplace_back(input_to_forget_weights);
+    weights_vector.emplace_back(recurrent_to_forget_weights);
+    const TensorShape weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(weights_vector, 0);
+    _forget_gate_out6.allocator()->init(TensorInfo(weights_concat_shape, 1, input->info()->data_type()));
+
+    _concat_weights_forget_gate.configure(input_to_forget_weights, recurrent_to_forget_weights, &_forget_gate_out6);
+
     _memory_group.manage(&_forget_gate_out5);
-    _accum_forget_gate1.configure(ArithmeticOperation::ADD, &_forget_gate_out1, &_forget_gate_out3, &_forget_gate_out5, ConvertPolicy::SATURATE);
-    _forget_gate_out1.allocator()->allocate();
+    _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6, forget_gate_bias, &_forget_gate_out5);
+    _memory_group.manage(&_forget_gate_out1);
+    _memory_group.manage(&_forget_gate_out3);
+    _forget_gate_out6.allocator()->allocate();
+
     CLTensor *forget_gate_out = &_forget_gate_out5;
     if(lstm_params.has_peephole_opt())
     {
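
The optimization annotated above is the block-matrix identity [x | h] * [Wx | Wh]^T = x * Wx^T + h * Wh^T: concatenating the input with the previous output state along dimension 0, and the two weight matrices likewise, turns two fully connected computations per gate into one. A numeric check with assumed 1x2 blocks:

#include <cassert>

int main()
{
    const float x[2]  = { 1.f, 2.f }, h[2]  = { 3.f, 4.f };
    const float wx[2] = { 0.5f, -1.f }, wh[2] = { 2.f, 0.25f }; // one output unit
    // Two separate dot products (the old two-GEMM formulation)...
    const float split = (x[0] * wx[0] + x[1] * wx[1]) + (h[0] * wh[0] + h[1] * wh[1]);
    // ...versus one dot product over the concatenated operands.
    const float fused_in[4] = { x[0], x[1], h[0], h[1] };
    const float fused_w[4]  = { wx[0], wx[1], wh[0], wh[1] };
    float acc = 0.f;
    for(int i = 0; i < 4; ++i)
    {
        acc += fused_in[i] * fused_w[i];
    }
    assert(acc == split); // 5.5f either way, exactly representable
    return 0;
}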
@@ -134,43 +148,46 @@
     // Configure block that calculates the input gate
     // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
     // input_gate = 1 - forget_gate, with CIFG
+    // We optimize this as follows:
+    // input_gate = Activation((input,output_state) * (input_to_input_weights,recurrent_to_input_weights) + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
     _input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
     CLTensor *input_gate_out = &_input_gate_out1;
     if(lstm_params.has_cifg_opt())
     {
         _memory_group.manage(&_input_gate_out1);
         _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+        _ones_memset_kernel.configure(&_ones, PixelValue(1, _ones.info()->data_type()));
         _subtract_input_gate.configure(ArithmeticOperation::SUB, &_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE);
         _ones.allocator()->allocate();
         _run_cifg_opt = true;
     }
     else
     {
-        TensorShape input_gate_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
-
-        _input_gate_out2.allocator()->init(TensorInfo(input_gate_shape, 1, input->info()->data_type()));
         _input_gate_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
         _input_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
-        _input_gate_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+        std::vector<const ICLTensor *> lstm_weights;
+        lstm_weights.emplace_back(lstm_params.input_to_input_weights());
+        lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
+        TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
+        _input_gate_out2.allocator()->init(TensorInfo(lstm_weights_concat_shape, 1, input->info()->data_type()));
+
+        _concat_weights_input_gate.configure(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), &_input_gate_out2);
 
         _memory_group.manage(&_input_gate_out1);
-        _fully_connected_input_gate.configure(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), &_input_gate_out1);
-        _memory_group.manage(&_input_gate_out2);
-        _transpose_input_gate.configure(lstm_params.recurrent_to_input_weights(), &_input_gate_out2);
+
         _memory_group.manage(&_input_gate_out3);
-        _gemm_input_gate.configure(output_state_in, &_input_gate_out2, nullptr, &_input_gate_out3, 1.f, 0.f);
+        _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2, lstm_params.input_gate_bias(), &_input_gate_out3);
         _input_gate_out2.allocator()->allocate();
-        _memory_group.manage(&_input_gate_out4);
-        _accum_input_gate1.configure(ArithmeticOperation::ADD, &_input_gate_out1, &_input_gate_out3, &_input_gate_out4, ConvertPolicy::SATURATE);
-        _input_gate_out3.allocator()->allocate();
-        input_gate_out = &_input_gate_out4;
+
+        input_gate_out = &_input_gate_out3;
         if(_run_peephole_opt)
         {
-            _memory_group.manage(&_input_gate_out5);
-            _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
-            _accum_input_gate2.configure(&_input_gate_out4, &_input_gate_out5, &_input_gate_out1, ConvertPolicy::SATURATE);
+            _memory_group.manage(&_input_gate_out4);
+            _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+            _accum_input_gate2.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE);
+            _input_gate_out3.allocator()->allocate();
             _input_gate_out4.allocator()->allocate();
-            _input_gate_out5.allocator()->allocate();
             input_gate_out = &_input_gate_out1;
         }
         else
@@ -215,35 +232,39 @@
 
     // Configure block that calculates the output
     // output_state_out = Activation(input * input_to_output_weights + output_state_in * recurrent_to_output_weights + PixelWiseMul(cell_state, cell_to_output_weights) + output_gate_bias)
-    TensorShape output1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+    // We optimize this as follows:
+    // output_state_out = Activation( (input,output_state_in) * (input_to_output_weights, recurrent_to_output_weights) + PixelWiseMul(cell_state, cell_to_output_weights) + output_gate_bias)
     _output1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
-    _output2.allocator()->init(TensorInfo(output1_shape, 1, input->info()->data_type()));
-    _output3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
-    _output5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    _output4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    std::vector<const ICLTensor *> in_out_weights;
+    in_out_weights.emplace_back(input_to_output_weights);
+    in_out_weights.emplace_back(recurrent_to_output_weights);
+    TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
+    _output2.allocator()->init(TensorInfo(in_out_weights_concat_shape, 1, input->info()->data_type()));
+
+    _concat_weights_output.configure(input_to_output_weights, recurrent_to_output_weights, &_output2);
 
     _memory_group.manage(&_output1);
-    _fully_connected_output.configure(input, input_to_output_weights, output_gate_bias, &_output1);
-    _memory_group.manage(&_output2);
-    _transpose_output.configure(recurrent_to_output_weights, &_output2);
-    _memory_group.manage(&_output3);
-    _gemm_output.configure(output_state_in, &_output2, nullptr, &_output3, 1.f, 0.f);
+    _memory_group.manage(&_output4);
+
+    _fully_connected_output.configure(&_forget_gate_out2, &_output2, output_gate_bias, &_output4);
+
     _output2.allocator()->allocate();
-    _memory_group.manage(&_output5);
-    _accum_output1.configure(ArithmeticOperation::ADD, &_output1, &_output3, &_output5, ConvertPolicy::SATURATE);
-    _output3.allocator()->allocate();
-    CLTensor *output_gate_out = &_output5;
+    _forget_gate_out2.allocator()->allocate();
+
+    CLTensor *output_gate_out = &_output4;
     if(lstm_params.has_peephole_opt())
     {
-        _output4.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type()));
+        _output3.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type()));
 
-        _memory_group.manage(&_output4);
-        _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
-        _accum_output2.configure(&_output5, &_output4, &_output1, ConvertPolicy::SATURATE);
-        _output5.allocator()->allocate();
+        _memory_group.manage(&_output3);
+        _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+        _accum_output2.configure(&_output4, &_output3, &_output1, ConvertPolicy::SATURATE);
+        _output4.allocator()->allocate();
         output_gate_out = &_output1;
 
         // Allocate intermediate buffers
-        _output4.allocator()->allocate();
+        _output3.allocator()->allocate();
     }
     else
     {
@@ -369,8 +390,15 @@
 
     // Validate forget gate
     ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_forget_weights, forget_gate_bias, &forget_gate));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &forget_gate, 1.f, 0.f, GEMMInfo()));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+
+    std::vector<const ITensorInfo *> inputs_vector;
+    inputs_vector.emplace_back(input);
+    inputs_vector.emplace_back(output_state_in);
+    const TensorShape concat_shape       = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0);
+    TensorInfo        forget_gate_concat = TensorInfo(concat_shape, 1, input->data_type());
+
+    ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(input, output_state_in, &forget_gate_concat));
+
     if(lstm_params.has_peephole_opt())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
@@ -388,9 +416,15 @@
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() > 2);
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() > 1);
 
+        std::vector<const ITensorInfo *> lstm_weights;
+        lstm_weights.emplace_back(lstm_params.input_to_input_weights());
+        lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
+        TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
+        TensorInfo  lstm_gate_concat          = TensorInfo(lstm_weights_concat_shape, 1, input->data_type());
+        ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), &lstm_gate_concat));
+
         ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), &input_gate));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &input_gate, 1.f, 0.f, GEMMInfo()));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
+
         if(lstm_params.has_peephole_opt())
         {
             ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
@@ -419,10 +453,15 @@
                                                                                                                     cell_threshold)));
     }
 
+    std::vector<const ITensorInfo *> in_out_weights;
+    in_out_weights.emplace_back(input_to_output_weights);
+    in_out_weights.emplace_back(recurrent_to_output_weights);
+    TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
+    TensorInfo  in_out_gate_concat          = TensorInfo(in_out_weights_concat_shape, 1, input->data_type());
+    ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(input_to_output_weights, recurrent_to_output_weights, &in_out_gate_concat));
     // Validate output gate tmp
     ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_output_weights, output_gate_bias, &output_gate_tmp));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &output_gate_tmp, 1.f, 0.f, GEMMInfo()));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE));
+
     if(lstm_params.has_peephole_opt())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
@@ -464,12 +503,13 @@
 
 void CLLSTMLayer::run()
 {
-    _memory_group.acquire();
+    prepare();
+
+    MemoryGroupResourceScope scope_mg(_memory_group);
+
+    CLScheduler::get().enqueue(_concat_inputs_forget_gate);
 
     _fully_connected_forget_gate.run();
-    CLScheduler::get().enqueue(_transpose_forget_gate);
-    _gemm_forget_gate.run();
-    CLScheduler::get().enqueue(_accum_forget_gate1);
 
     if(_run_peephole_opt)
     {
@@ -480,24 +520,13 @@
 
     if(_run_cifg_opt)
     {
-        _ones.map(true);
-        if(_ones.info()->data_type() == DataType::F16)
-        {
-            std::fill_n(reinterpret_cast<half *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1);
-        }
-        else
-        {
-            std::fill_n(reinterpret_cast<float *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1);
-        }
-        _ones.unmap();
+        CLScheduler::get().enqueue(_ones_memset_kernel);
         CLScheduler::get().enqueue(_subtract_input_gate);
     }
     else
     {
         _fully_connected_input_gate.run();
-        CLScheduler::get().enqueue(_transpose_input_gate);
-        _gemm_input_gate.run();
-        CLScheduler::get().enqueue(_accum_input_gate1);
+
         if(_run_peephole_opt)
         {
             CLScheduler::get().enqueue(_pixelwise_mul_input_gate);
@@ -521,9 +550,6 @@
     }
 
     _fully_connected_output.run();
-    CLScheduler::get().enqueue(_transpose_output);
-    _gemm_output.run();
-    CLScheduler::get().enqueue(_accum_output1);
 
     if(_run_peephole_opt)
     {
@@ -548,6 +574,18 @@
     CLScheduler::get().enqueue(_copy_output);
 
     _concat_scratch_buffer.run();
+}
 
-    _memory_group.release();
+void CLLSTMLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        CLScheduler::get().enqueue(_concat_weights_forget_gate);
+        if(!_run_cifg_opt)
+        {
+            CLScheduler::get().enqueue(_concat_weights_input_gate);
+        }
+        CLScheduler::get().enqueue(_concat_weights_output);
+        _is_prepared = true;
+    }
 }
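
The reworked gate computation above leans on a simple identity: concatenating the input with the previous output state, and the input weights with the recurrent weights, turns the old FC + transpose + GEMM + add chain into a single fully connected call. A minimal numeric check of that identity, with illustrative sizes and hypothetical names:

#include <cassert>
#include <vector>

int main()
{
    const std::vector<float> x{ 1.f, 2.f };        // input features
    const std::vector<float> h{ 3.f, 4.f, 5.f };   // previous output state
    const std::vector<float> wx{ 0.5f, -1.f };     // one row of input_to_*_weights
    const std::vector<float> wh{ 2.f, 0.f, 1.f };  // one row of recurrent_to_*_weights

    // Old path: two separate dot products, summed afterwards.
    float rhs = 0.f;
    for(size_t i = 0; i < x.size(); ++i) rhs += x[i] * wx[i];
    for(size_t i = 0; i < h.size(); ++i) rhs += h[i] * wh[i];

    // New path: concatenate along the feature dimension, then one dot product.
    std::vector<float> xh(x); xh.insert(xh.end(), h.begin(), h.end());
    std::vector<float> w(wx); w.insert(w.end(), wh.begin(), wh.end());
    float lhs = 0.f;
    for(size_t i = 0; i < xh.size(); ++i) lhs += xh[i] * w[i];

    assert(lhs == rhs); // [x|h] . [wx|wh] == x.wx + h.wh
    return 0;
}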
diff --git a/src/runtime/CL/functions/CLLaplacianPyramid.cpp b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
index 559b57f..a118518 100644
--- a/src/runtime/CL/functions/CLLaplacianPyramid.cpp
+++ b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -70,8 +70,8 @@
     // Create Gaussian Pyramid function
     _gaussian_pyr_function.configure(input, &_gauss_pyr, border_mode, constant_border_value);
 
-    _convf = arm_compute::support::cpp14::make_unique<CLGaussian5x5[]>(_num_levels);
-    _subf  = arm_compute::support::cpp14::make_unique<CLArithmeticSubtraction[]>(_num_levels);
+    _convf.resize(_num_levels);
+    _subf.resize(_num_levels);
 
     for(unsigned int i = 0; i < _num_levels; ++i)
     {
diff --git a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
index 911c9b3..13116bf 100644
--- a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
+++ b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -63,8 +63,8 @@
     _tmp_pyr.init(pyramid_info);
 
     // Allocate add and scale functions. Level 0 does not need to be scaled.
-    _addf   = arm_compute::support::cpp14::make_unique<CLArithmeticAddition[]>(num_levels);
-    _scalef = arm_compute::support::cpp14::make_unique<CLScale[]>(num_levels - 1);
+    _addf.resize(num_levels);
+    _scalef.resize(num_levels - 1);
 
     const size_t last_level = num_levels - 1;
 
@@ -85,7 +85,7 @@
 
 void CLLaplacianReconstruct::run()
 {
-    ARM_COMPUTE_ERROR_ON_MSG(_addf == nullptr, "Unconfigured function");
+    ARM_COMPUTE_ERROR_ON_MSG(_addf.empty(), "Unconfigured function");
 
     const size_t last_level = _tmp_pyr.info()->num_levels() - 1;
 
diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
index 5c6bef9..3e99dde 100644
--- a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -176,7 +176,7 @@
 {
     prepare();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Run input reshaping
     CLScheduler::get().enqueue(_input_im2col_kernel);
@@ -186,8 +186,6 @@
 
     // Reshape output matrix
     CLScheduler::get().enqueue(_output_col2im_kernel, false);
-
-    _memory_group.release();
 }
 
 void CLLocallyConnectedLayer::prepare()
diff --git a/src/runtime/CL/functions/CLMeanStdDev.cpp b/src/runtime/CL/functions/CLMeanStdDev.cpp
index 157f306..8517b59 100644
--- a/src/runtime/CL/functions/CLMeanStdDev.cpp
+++ b/src/runtime/CL/functions/CLMeanStdDev.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -104,7 +104,7 @@
 template <typename T>
 void CLMeanStdDev::run_float()
 {
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Perform reduction on x-axis
     _reduction_operation_mean.run();
@@ -140,8 +140,6 @@
         _reduction_output_stddev.unmap();
     }
     _reduction_output_mean.unmap();
-
-    _memory_group.release();
 }
 
 void CLMeanStdDev::run_int()
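
Several files in this patch swap explicit _memory_group.acquire()/release() pairs for a scoped MemoryGroupResourceScope. The benefit is the usual RAII one: release happens on every exit path, including early returns and exceptions. A minimal sketch of the pattern, assuming only an acquire/release interface (the real class lives in the library and its internals are not shown here):

struct MemoryGroup // stand-in for the library type, illustration only
{
    void acquire() { /* map/allocate managed tensors */ }
    void release() { /* unmap/free managed tensors   */ }
};

class ScopedMemoryGroup
{
public:
    explicit ScopedMemoryGroup(MemoryGroup &group) : _group(group) { _group.acquire(); }
    ~ScopedMemoryGroup() { _group.release(); } // runs even if run() exits early
private:
    MemoryGroup &_group;
};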
diff --git a/src/runtime/CL/functions/CLOpticalFlow.cpp b/src/runtime/CL/functions/CLOpticalFlow.cpp
index d00b1b5..a013a1f 100644
--- a/src/runtime/CL/functions/CLOpticalFlow.cpp
+++ b/src/runtime/CL/functions/CLOpticalFlow.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -84,12 +84,12 @@
     const int   old_values_list_length = list_length * window_dimension * window_dimension;
 
     // Create kernels and tensors
-    _tracker_init_kernel   = arm_compute::support::cpp14::make_unique<CLLKTrackerInitKernel[]>(_num_levels);
-    _tracker_stage0_kernel = arm_compute::support::cpp14::make_unique<CLLKTrackerStage0Kernel[]>(_num_levels);
-    _tracker_stage1_kernel = arm_compute::support::cpp14::make_unique<CLLKTrackerStage1Kernel[]>(_num_levels);
-    _func_scharr           = arm_compute::support::cpp14::make_unique<CLScharr3x3[]>(_num_levels);
-    _scharr_gx             = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_levels);
-    _scharr_gy             = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_levels);
+    _tracker_init_kernel.resize(_num_levels);
+    _tracker_stage0_kernel.resize(_num_levels);
+    _tracker_stage1_kernel.resize(_num_levels);
+    _func_scharr.resize(_num_levels);
+    _scharr_gx.resize(_num_levels);
+    _scharr_gy.resize(_num_levels);
 
     // Create internal keypoint arrays
     _old_points_internal = arm_compute::support::cpp14::make_unique<CLLKInternalKeypointArray>(list_length);
@@ -118,8 +118,8 @@
         _scharr_gy[i].allocator()->init(tensor_info);
 
         // Manage intermediate buffers
-        _memory_group.manage(_scharr_gx.get() + i);
-        _memory_group.manage(_scharr_gy.get() + i);
+        _memory_group.manage(&_scharr_gx[i]);
+        _memory_group.manage(&_scharr_gy[i]);
 
         // Init Scharr kernel
         _func_scharr[i].configure(old_ith_input, &_scharr_gx[i], &_scharr_gy[i], border_mode, constant_border_value);
@@ -149,7 +149,7 @@
 {
     ARM_COMPUTE_ERROR_ON_MSG(_num_levels == 0, "Unconfigured function");
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     for(unsigned int level = _num_levels; level > 0; --level)
     {
@@ -167,6 +167,4 @@
     }
 
     CLScheduler::get().enqueue(_tracker_finalize_kernel, true);
-
-    _memory_group.release();
 }
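
Another recurring change: make_unique<T[]> heap arrays become std::vector members, so code can use &v[i] instead of v.get() + i and test v.empty() instead of comparing against nullptr. A small before/after sketch with a hypothetical kernel type:

#include <memory>
#include <vector>

struct Kernel { void configure(int) {} };

void before_and_after(unsigned int n)
{
    if(n == 0) return;

    // Old style: fixed-size heap array, addressed through pointer arithmetic.
    auto arr = std::make_unique<Kernel[]>(n);
    (arr.get() + 0)->configure(0);

    // New style: same storage, but with value semantics, .empty(), and &vec[i].
    std::vector<Kernel> vec(n);
    vec[0].configure(0);
}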
diff --git a/src/runtime/CL/functions/CLPadLayer.cpp b/src/runtime/CL/functions/CLPadLayer.cpp
index 3aa1b1e..99e3121 100644
--- a/src/runtime/CL/functions/CLPadLayer.cpp
+++ b/src/runtime/CL/functions/CLPadLayer.cpp
@@ -25,39 +25,293 @@
 
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "support/ToolchainSupport.h"
 
 namespace arm_compute
 {
 CLPadLayer::CLPadLayer()
-    : _copy_kernel(), _fillborder_kernel(), _memset_kernel()
+    : _copy_kernel(), _mode(), _padding(), _memset_kernel(), _num_dimensions(0), _slice_functions(), _concat_functions(), _slice_results(), _concat_results()
 {
 }
 
-void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value)
+void CLPadLayer::configure_constant_mode(ICLTensor *input, ICLTensor *output, const PaddingList &padding, const PixelValue constant_value)
 {
-    // Copy the input to the output
-    _copy_kernel.configure(input, output, padding);
-
-    // Set the pages of the output to zero
+    // Set all elements of the output to the constant_value.
     _memset_kernel.configure(output, constant_value);
 
-    // Fill padding on the first two dimensions with zeros
-    _fillborder_kernel.configure(input, input->info()->padding(), BorderMode::CONSTANT, constant_value);
+    // Fill out the padding list with zeroes for the remaining dimensions.
+    PaddingList padding_extended = padding;
+    for(size_t i = padding.size(); i < TensorShape::num_max_dimensions; i++)
+    {
+        padding_extended.emplace_back(PaddingInfo{ 0, 0 });
+    }
+
+    // Create a window within the output tensor where the input will be copied.
+    Window copy_window = Window();
+    for(uint32_t i = 0; i < output->info()->num_dimensions(); ++i)
+    {
+        copy_window.set(i, Window::Dimension(padding_extended[i].first, padding_extended[i].first + input->info()->dimension(i), 1));
+    }
+    // Copy the input to the output, leaving the padding filled with the constant_value.
+    _copy_kernel.configure(input, output, PaddingList(), &copy_window);
 }
 
-Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value)
+void CLPadLayer::configure_reflect_symmetric_mode(ICLTensor *input, ICLTensor *output)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(input, constant_value));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(input, output, padding));
+    int64_t last_padding_dimension = _padding.size() - 1;
+    // Reflecting can be performed by effectively unfolding the input as follows:
+    // For each dimension starting at DimX:
+    //      Create a before and an after slice, whose values depend on the selected padding mode
+    //      Concatenate the before and after padding with the tensor to be padded
 
+    // Two strided slice functions are required for each padded dimension, along with a
+    // concatenate function and tensors to hold the temporary results.
+    _slice_functions.resize(2 * _num_dimensions);
+    _slice_results.resize(2 * _num_dimensions);
+    _concat_functions.resize(_num_dimensions);
+    _concat_results.resize(_num_dimensions - 1);
+
+    Coordinates starts_before{};
+    Coordinates ends_before{};
+    Coordinates starts_after{};
+    Coordinates ends_after{};
+    Coordinates strides{};
+    ICLTensor *prev = input;
+    for(uint32_t i = 0; i < _num_dimensions; ++i)
+    {
+        // Values in strides from the previous dimensions need to be set to 1 to avoid reversing again.
+        if(i > 0)
+        {
+            strides.set(i - 1, 1);
+        }
+
+        if(_padding[i].first > 0 || _padding[i].second > 0)
+        {
+            // Set the starts, ends, and strides values for the current dimension.
+            // Due to the bit masks passed to strided slice, the values below the current dimension in
+            // starts and ends will be ignored, and so do not need to be modified.
+            if(_mode == PaddingMode::REFLECT)
+            {
+                starts_before.set(i, _padding[i].first);
+                ends_before.set(i, 0);
+                starts_after.set(i, input->info()->dimension(i) - 2);
+                ends_after.set(i, input->info()->dimension(i) - _padding[i].second - 2);
+                strides.set(i, -1);
+            }
+            else
+            {
+                starts_before.set(i, _padding[i].first - 1);
+                ends_before.set(i, -1);
+                starts_after.set(i, input->info()->dimension(i) - 1);
+                ends_after.set(i, input->info()->dimension(i) - _padding[i].second - 1);
+                strides.set(i, -1);
+            }
+
+            // Strided slice wraps negative indexes around to the end of the range;
+            // here they should instead select the full range, so the bit masks are adjusted accordingly.
+            const int32_t begin_mask_before = starts_before[i] < 0 ? ~0 : ~(1u << i);
+            const int32_t end_mask_before   = ends_before[i] < 0 ? ~0 : ~(1u << i);
+            const int32_t begin_mask_after  = starts_after[i] < 0 ? ~0 : ~(1u << i);
+            const int32_t end_mask_after    = ends_after[i] < 0 ? ~0 : ~(1u << i);
+
+            // Reflect the input values for the padding before and after the input.
+            std::vector<ICLTensor *> concat_vector;
+            if(_padding[i].first > 0)
+            {
+                if(i < prev->info()->num_dimensions())
+                {
+                    _slice_functions[2 * i].configure(prev, &_slice_results[2 * i], starts_before, ends_before, strides, begin_mask_before, end_mask_before);
+                    concat_vector.push_back(&_slice_results[2 * i]);
+                }
+                else
+                {
+                    // Performing the slice is unnecessary if the result would simply be a copy of the tensor.
+                    concat_vector.push_back(prev);
+                }
+            }
+            concat_vector.push_back(prev);
+            if(_padding[i].second > 0)
+            {
+                if(i < prev->info()->num_dimensions())
+                {
+                    _slice_functions[2 * i + 1].configure(prev, &_slice_results[2 * i + 1], starts_after, ends_after, strides, begin_mask_after, end_mask_after);
+                    concat_vector.push_back(&_slice_results[2 * i + 1]);
+                }
+                else
+                {
+                    // Performing the slice is unnecessary if the result would simply be a copy of the tensor.
+                    concat_vector.push_back(prev);
+                }
+            }
+            // Concatenate the padding before and after with the input.
+            ICLTensor *out = (static_cast<int32_t>(i) == last_padding_dimension) ? output : &_concat_results[i];
+            _concat_functions[i].configure(concat_vector, out, i);
+            prev = out;
+        }
+    }
+    for(uint32_t i = 0; i < _num_dimensions; ++i)
+    {
+        if((static_cast<int32_t>(i) != last_padding_dimension))
+        {
+            _concat_results[i].allocator()->allocate();
+        }
+        _slice_results[2 * i].allocator()->allocate();
+        _slice_results[2 * i + 1].allocator()->allocate();
+    }
+}
+
+void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+{
+    ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), padding, constant_value, mode));
+
+    _padding = padding;
+    _mode    = mode;
+
+    TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), _padding);
+
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(padded_shape));
+
+    // Find the last dimension requiring padding so that it is known when to write to output and whether any padding is applied.
+    int64_t last_padding_dimension = _padding.size() - 1;
+    for(; last_padding_dimension >= 0; --last_padding_dimension)
+    {
+        if(_padding[last_padding_dimension].first > 0 || _padding[last_padding_dimension].second > 0)
+        {
+            break;
+        }
+    }
+    _num_dimensions = last_padding_dimension + 1;
+    if(_num_dimensions > 0)
+    {
+        switch(_mode)
+        {
+            case PaddingMode::CONSTANT:
+            {
+                configure_constant_mode(input, output, padding, constant_value);
+                break;
+            }
+            case PaddingMode::REFLECT:
+            case PaddingMode::SYMMETRIC:
+            {
+                configure_reflect_symmetric_mode(input, output);
+                break;
+            }
+            default:
+                ARM_COMPUTE_ERROR("Padding mode not supported.");
+        }
+    }
+    else
+    {
+        // Copy the input to the whole output if no padding is applied
+        _copy_kernel.configure(input, output);
+    }
+}
+
+Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > input->num_dimensions());
+
+    TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding);
+
+    // Use CLCopyKernel and CLMemsetKernel to validate all padding modes, as this covers all of the shape and info validation.
+    PaddingList padding_extended = padding;
+    for(size_t i = padding.size(); i < TensorShape::num_max_dimensions; i++)
+    {
+        padding_extended.emplace_back(PaddingInfo{ 0, 0 });
+    }
+
+    Window copy_window = Window();
+    for(uint32_t i = 0; i < padded_shape.num_dimensions(); ++i)
+    {
+        copy_window.set(i, Window::Dimension(padding_extended[i].first, padding_extended[i].first + input->dimension(i), 1));
+    }
+    if(output->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), padded_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(output, input);
+        ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(input, output, PaddingList(), &copy_window));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(output, constant_value));
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(input, &input->clone()->set_tensor_shape(padded_shape), PaddingList(), &copy_window));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(&input->clone()->set_tensor_shape(padded_shape), constant_value));
+    }
+
+    switch(mode)
+    {
+        case PaddingMode::CONSTANT:
+        {
+            break;
+        }
+        case PaddingMode::REFLECT:
+        case PaddingMode::SYMMETRIC:
+        {
+            for(uint32_t i = 0; i < padding.size(); ++i)
+            {
+                if(mode == PaddingMode::REFLECT)
+                {
+                    ARM_COMPUTE_RETURN_ERROR_ON(padding[i].first >= input->dimension(i));
+                    ARM_COMPUTE_RETURN_ERROR_ON(padding[i].second >= input->dimension(i));
+                }
+                else
+                {
+                    ARM_COMPUTE_RETURN_ERROR_ON(padding[i].first > input->dimension(i));
+                    ARM_COMPUTE_RETURN_ERROR_ON(padding[i].second > input->dimension(i));
+                }
+            }
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("Invalid mode");
+        }
+    }
     return Status{};
 }
 
 void CLPadLayer::run()
 {
-    CLScheduler::get().enqueue(_memset_kernel, false);
-    CLScheduler::get().enqueue(_fillborder_kernel, false);
-    CLScheduler::get().enqueue(_copy_kernel, true);
+    if(_num_dimensions > 0)
+    {
+        switch(_mode)
+        {
+            case PaddingMode::CONSTANT:
+            {
+                CLScheduler::get().enqueue(_memset_kernel, false);
+                CLScheduler::get().enqueue(_copy_kernel, true);
+                break;
+            }
+            case PaddingMode::REFLECT:
+            case PaddingMode::SYMMETRIC:
+            {
+                for(uint32_t i = 0; i < _num_dimensions; ++i)
+                {
+                    if(_padding[i].first > 0 || _padding[i].second > 0)
+                    {
+                        if(_padding[i].first > 0 && _slice_results[2 * i].info()->total_size() > 0)
+                        {
+                            _slice_functions[2 * i].run();
+                        }
+                        if(_padding[i].second > 0 && _slice_results[2 * i + 1].info()->total_size() > 0)
+                        {
+                            _slice_functions[2 * i + 1].run();
+                        }
+                        CLScheduler::get().sync();
+                        _concat_functions[i].run();
+                        CLScheduler::get().sync();
+                    }
+                }
+                break;
+            }
+            default:
+                ARM_COMPUTE_ERROR("Padding mode not supported.");
+        }
+    }
+    else
+    {
+        CLScheduler::get().enqueue(_copy_kernel, true);
+    }
 }
 } // namespace arm_compute
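
The REFLECT/SYMMETRIC start and end indices above are easiest to check in one dimension. REFLECT mirrors around the edge element without repeating it (hence the validation that the padding must be strictly smaller than the dimension), while SYMMETRIC repeats the edge (padding up to the dimension is allowed). A 1-D sketch that reproduces the same indexing:

#include <vector>

// For in = {a,b,c,d}: pad1d(in, 2, 1, true)  -> c b | a b c d | c (REFLECT)
//                     pad1d(in, 2, 1, false) -> b a | a b c d | d (SYMMETRIC)
std::vector<int> pad1d(const std::vector<int> &in, int before, int after, bool reflect)
{
    const int n = static_cast<int>(in.size());
    std::vector<int> out;
    for(int i = 0; i < before; ++i)                         // before-slice, stride -1
        out.push_back(in[reflect ? before - i : before - 1 - i]);
    out.insert(out.end(), in.begin(), in.end());            // the input itself
    for(int i = 0; i < after; ++i)                          // after-slice, stride -1
        out.push_back(in[reflect ? n - 2 - i : n - 1 - i]);
    return out;
}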
diff --git a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
index b4c20db..959464c 100644
--- a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
+++ b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,8 +29,8 @@
 
 #include <utility>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 void CLPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale,
                                           ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
 {
@@ -54,3 +54,26 @@
 {
     return CLPixelWiseMultiplicationKernel::validate(input1, input2, output, scale, overflow_policy, rounding_policy);
 }
+
+void CLComplexPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLComplexPixelWiseMultiplicationKernel>();
+    k->configure(input1, input2, output);
+    _kernel = std::move(k);
+
+    if(output->info()->dimension(0) > 1)
+    {
+        ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+        if(broadcasted_info->info()->dimension(0) == 1)
+        {
+            _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+        }
+    }
+}
+
+Status CLComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    return CLComplexPixelWiseMultiplicationKernel::validate(input1, input2, output);
+}
+} // namespace arm_compute
\ No newline at end of file
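
CLComplexPixelWiseMultiplication is new in this patch. Assuming the usual interleaved (real, imaginary) representation of complex tensors (an assumption, not stated in this diff), each element pair is multiplied with the standard complex product. A quick check of the arithmetic:

#include <cassert>
#include <complex>

int main()
{
    const std::complex<float> a{ 1.f, 2.f };  // 1 + 2i
    const std::complex<float> b{ 3.f, -1.f }; // 3 - i
    // (a + bi)(c + di) = (ac - bd) + (ad + bc)i
    assert(a * b == std::complex<float>(5.f, 5.f)); // (1+2i)(3-i) = 5 + 5i
    return 0;
}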
diff --git a/src/runtime/CL/functions/CLQuantizationLayer.cpp b/src/runtime/CL/functions/CLQuantizationLayer.cpp
index a13859c..df10e1e 100644
--- a/src/runtime/CL/functions/CLQuantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLQuantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,54 +21,22 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-
 #include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h"
 
-#include "arm_compute/core/Error.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h"
+#include "support/ToolchainSupport.h"
 
-using namespace arm_compute;
-
-CLQuantizationLayer::CLQuantizationLayer()
-    : _quantize_kernel(), _min_max_kernel(), _min_max()
+namespace arm_compute
 {
+void CLQuantizationLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLQuantizationLayerKernel>();
+    k->configure(input, output);
+    _kernel = std::move(k);
 }
 
 Status CLQuantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-
-    TensorInfo min_max{ input->num_channels(), input->data_type() };
-    ARM_COMPUTE_RETURN_ON_ERROR(CLMinMaxLayerKernel::validate(input, &min_max));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayerKernel::validate(input, output, &min_max));
-
-    return Status{};
+    return CLQuantizationLayerKernel::validate(input, output);
 }
-
-void CLQuantizationLayer::configure(const ICLTensor *input, ICLTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    // Configure min-max kernel. _min_max tensor will be auto-configured within the kernel.
-    _min_max_kernel.configure(input, &_min_max);
-
-    // Configure quantize kernel
-    _quantize_kernel.configure(input, output, &_min_max);
-
-    // Allocate min_max tensor
-    _min_max.allocator()->allocate();
-}
-
-void CLQuantizationLayer::run()
-{
-    cl::CommandQueue q = CLScheduler::get().queue();
-
-    // Reset min and max
-    _min_max_kernel.reset(q);
-
-    // Run min-max kernel
-    CLScheduler::get().enqueue(_min_max_kernel, false);
-
-    // Run quantize kernel
-    CLScheduler::get().enqueue(_quantize_kernel, false);
-}
+} // namespace arm_compute
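
CLQuantizationLayer shrinks to a thin wrapper around a single kernel, dropping the separate min/max pass. As background, 8-bit asymmetric quantization maps a float to an integer via a scale and zero-point offset; the sketch below shows the common formulation, with the caveat that the exact rounding used by CLQuantizationLayerKernel is not visible in this diff:

#include <algorithm>
#include <cmath>
#include <cstdint>

uint8_t quantize_qasymm8(float x, float scale, int32_t offset)
{
    const int32_t q = static_cast<int32_t>(std::lround(x / scale)) + offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q))); // clamp to [0, 255]
}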
diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp
index 63f00ac..19eb69f 100644
--- a/src/runtime/CL/functions/CLRNNLayer.cpp
+++ b/src/runtime/CL/functions/CLRNNLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -105,7 +105,7 @@
 {
     prepare();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     _fully_connected_kernel.run();
     _gemm_state_f.run();
@@ -114,8 +114,6 @@
 
     // copy hidden out to output
     CLScheduler::get().enqueue(_copy_kernel);
-
-    _memory_group.release();
 }
 
 void CLRNNLayer::prepare()
diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp
index b2d0f81..a3634cd 100644
--- a/src/runtime/CL/functions/CLReduceMean.cpp
+++ b/src/runtime/CL/functions/CLReduceMean.cpp
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/runtime/CL/functions/CLReduceMean.h"
 
+#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
 #include "arm_compute/core/Types.h"
@@ -40,10 +41,10 @@
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
 
-    _reduction_ops     = reduction_axis.num_dimensions();
-    _reduction_kernels = arm_compute::support::cpp14::make_unique<CLReductionOperation[]>(_reduction_ops);
-    _reduced_outs      = arm_compute::support::cpp14::make_unique<CLTensor[]>(_reduction_ops - (keep_dims ? 1 : 0));
-    _keep_dims         = keep_dims;
+    _reduction_ops = reduction_axis.num_dimensions();
+    _reduction_kernels.resize(_reduction_ops);
+    _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
+    _keep_dims = keep_dims;
 
     Coordinates axis_local = reduction_axis;
     const int   input_dims = input->info()->num_dimensions();
@@ -57,9 +58,9 @@
     // Perform reduction for every axis
     for(unsigned int i = 0; i < _reduction_ops; ++i)
     {
-        TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (_reduced_outs.get() + i - 1)->info()->tensor_shape();
+        TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
         out_shape.set(axis_local[i], 1);
-        auto in = (i == 0) ? input : (_reduced_outs.get() + i - 1);
+        auto in = (i == 0) ? input : (&_reduced_outs[i - 1]);
 
         if(i == _reduction_ops - 1 && keep_dims)
         {
@@ -68,8 +69,8 @@
         else
         {
             _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type(), input->info()->quantization_info()));
-            _memory_group.manage(_reduced_outs.get() + i);
-            _reduction_kernels[i].configure(in, _reduced_outs.get() + i, axis_local[i], ReductionOperation::MEAN_SUM);
+            _memory_group.manage(&_reduced_outs[i]);
+            _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM);
         }
     }
 
@@ -92,13 +93,15 @@
             out_shape.remove_dimension(axis_local[i] - i);
         }
         auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
-        _reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output);
+        _reshape.configure(&_reduced_outs[_reduction_ops - 1], output);
     }
 }
 
 Status CLReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
 
     TensorShape out_shape = input->tensor_shape();
@@ -140,7 +143,7 @@
 
 void CLReduceMean::run()
 {
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     for(unsigned int i = 0; i < _reduction_ops; ++i)
     {
@@ -151,6 +154,5 @@
     {
         _reshape.run();
     }
-    _memory_group.release();
 }
 } // namespace arm_compute
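
CLReduceMean chains one reduction per requested axis, shrinking that axis to 1 each time, and only reshapes away the reduced axes at the end when keep_dims is false. The shape bookkeeping, extracted into a small stand-alone sketch:

#include <algorithm>
#include <vector>

std::vector<size_t> reduce_mean_shape(std::vector<size_t> shape, std::vector<size_t> axes, bool keep_dims)
{
    for(size_t a : axes)
        shape[a] = 1;                         // each reduction collapses its axis to 1
    if(!keep_dims)
    {
        std::sort(axes.begin(), axes.end());
        for(auto it = axes.rbegin(); it != axes.rend(); ++it)
            shape.erase(shape.begin() + *it); // erase back-to-front so indices stay valid
    }
    return shape;
}
// e.g. reduce_mean_shape({2, 3, 4}, {1, 2}, false) yields {2}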
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index 3d82e3f..9f99d2d 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -71,7 +71,7 @@
     else
     {
         // Create temporary tensor infos
-        auto sums_vector = arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_stages - 1);
+        std::vector<TensorInfo> sums_vector(num_of_stages - 1);
 
         // Create intermediate tensor info
         TensorShape shape{ input->tensor_shape() };
@@ -110,17 +110,17 @@
         }
 
         // Validate ReductionOperation only on first kernel
-        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, sums_vector.get(), axis, first_kernel_op));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, &sums_vector[0], axis, first_kernel_op));
 
         // Validate ReductionOperation on intermediate stages
         for(unsigned int i = 1; i < num_of_stages - 1; ++i)
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + i - 1, sums_vector.get() + i, axis, intermediate_kernel_op));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(&sums_vector[i - 1], &sums_vector[i], axis, intermediate_kernel_op));
         }
 
         // Validate ReductionOperation on the last stage
         const unsigned int last_stage = num_of_stages - 1;
-        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + last_stage - 1, output, axis, last_kernel_op, input->dimension(0)));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(&sums_vector[last_stage - 1], output, axis, last_kernel_op, input->dimension(0)));
     }
 
     return Status{};
@@ -133,7 +133,7 @@
     _is_serial      = is_data_type_quantized(input->info()->data_type()) || axis != 0;
 
     // Configure reduction operation kernels
-    _reduction_kernels_vector = arm_compute::support::cpp14::make_unique<CLReductionOperationKernel[]>(_num_of_stages);
+    _reduction_kernels_vector.resize(_num_of_stages);
 
     // Create temporary tensors
     if(_is_serial)
@@ -142,8 +142,8 @@
     }
     else
     {
-        _border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_of_stages);
-        _results_vector         = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_of_stages - 1);
+        _border_handlers_vector.resize(_num_of_stages);
+        _results_vector.resize(_num_of_stages - 1);
         TensorShape shape{ input->info()->tensor_shape() };
         for(unsigned int i = 0; i < _num_of_stages - 1; i++)
         {
@@ -152,7 +152,7 @@
         }
 
         // Apply ReductionOperation only on first kernel
-        _memory_group.manage(_results_vector.get());
+        _memory_group.manage(&_results_vector[0]);
 
         ReductionOperation first_kernel_op;
         ReductionOperation intermediate_kernel_op;
@@ -183,30 +183,30 @@
                 ARM_COMPUTE_ERROR("Not supported");
         }
 
-        _reduction_kernels_vector[0].configure(input, _results_vector.get(), axis, first_kernel_op);
+        _reduction_kernels_vector[0].configure(input, &_results_vector[0], axis, first_kernel_op);
         _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, pixelValue);
 
         // Apply ReductionOperation on intermediate stages
         for(unsigned int i = 1; i < _num_of_stages - 1; ++i)
         {
-            _memory_group.manage(_results_vector.get() + i);
-            _reduction_kernels_vector[i].configure(_results_vector.get() + i - 1, _results_vector.get() + i, axis, intermediate_kernel_op);
-            _border_handlers_vector[i].configure(_results_vector.get() + i - 1, _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, pixelValue);
+            _memory_group.manage(&_results_vector[i]);
+            _reduction_kernels_vector[i].configure(&_results_vector[i - 1], &_results_vector[i], axis, intermediate_kernel_op);
+            _border_handlers_vector[i].configure(&_results_vector[i - 1], _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, pixelValue);
             _results_vector[i - 1].allocator()->allocate();
         }
 
         // Apply ReductionOperation on the last stage
         const unsigned int last_stage  = _num_of_stages - 1;
         const unsigned int input_width = input->info()->dimension(0);
-        _reduction_kernels_vector[last_stage].configure(_results_vector.get() + last_stage - 1, output, axis, last_kernel_op, input_width);
-        _border_handlers_vector[last_stage].configure(_results_vector.get() + last_stage - 1, _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, pixelValue);
+        _reduction_kernels_vector[last_stage].configure(&_results_vector[last_stage - 1], output, axis, last_kernel_op, input_width);
+        _border_handlers_vector[last_stage].configure(&_results_vector[last_stage - 1], _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, pixelValue);
         _results_vector[last_stage - 1].allocator()->allocate();
     }
 }
 
 void CLReductionOperation::run()
 {
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     if(_is_serial)
     {
@@ -220,6 +220,4 @@
             CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
         }
     }
-
-    _memory_group.release();
 }
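
On the non-serial path, CLReductionOperation runs a pipeline of num_of_stages kernels chained through intermediate tensors, each stage shrinking the data until the last stage produces the final value. The staged structure matches this scalar sketch (the chunking here is illustrative, not the kernel's actual work split):

#include <algorithm>
#include <numeric>
#include <vector>

float staged_sum(const std::vector<float> &data, size_t chunk)
{
    std::vector<float> partial; // plays the role of _results_vector
    for(size_t i = 0; i < data.size(); i += chunk)
    {
        const size_t end = std::min(i + chunk, data.size());
        partial.push_back(std::accumulate(data.begin() + i, data.begin() + end, 0.f));
    }
    // Last stage reduces the intermediate results to the output.
    return std::accumulate(partial.begin(), partial.end(), 0.f);
}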
diff --git a/src/runtime/CL/functions/CLSobel5x5.cpp b/src/runtime/CL/functions/CLSobel5x5.cpp
index d4bc855..22fbef1 100644
--- a/src/runtime/CL/functions/CLSobel5x5.cpp
+++ b/src/runtime/CL/functions/CLSobel5x5.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -81,10 +81,8 @@
 {
     CLScheduler::get().enqueue(_border_handler, false);
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     CLScheduler::get().enqueue(_sobel_hor, false);
     CLScheduler::get().enqueue(_sobel_vert);
-
-    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLSobel7x7.cpp b/src/runtime/CL/functions/CLSobel7x7.cpp
index 6083090..9b38f69 100644
--- a/src/runtime/CL/functions/CLSobel7x7.cpp
+++ b/src/runtime/CL/functions/CLSobel7x7.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -81,10 +81,8 @@
 {
     CLScheduler::get().enqueue(_border_handler, false);
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     CLScheduler::get().enqueue(_sobel_hor, false);
     CLScheduler::get().enqueue(_sobel_vert);
-
-    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
index d671846..7e41dba 100644
--- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -191,7 +191,7 @@
 
 void CLSoftmaxLayer::run()
 {
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     if(_needs_flattening)
     {
@@ -205,9 +205,6 @@
     {
         CLScheduler::get().enqueue(_reshape_kernel, true);
     }
-
-    // Relase intermediate buffers
-    _memory_group.release();
 }
 
 } // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLSplit.cpp b/src/runtime/CL/functions/CLSplit.cpp
index f084351..8d37d53 100644
--- a/src/runtime/CL/functions/CLSplit.cpp
+++ b/src/runtime/CL/functions/CLSplit.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,8 +42,8 @@
 void CLSplit::configure(const ICLTensor *input, const std::vector<ICLTensor *> &outputs, unsigned int axis)
 {
     // Create Slice functions
-    _num_outputs     = outputs.size();
-    _slice_functions = arm_compute::support::cpp14::make_unique<CLSlice[]>(_num_outputs);
+    _num_outputs = outputs.size();
+    _slice_functions.resize(_num_outputs);
 
     // Get output shape
     const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_split_shape(input->info(), axis, _num_outputs);
diff --git a/src/runtime/CL/functions/CLStackLayer.cpp b/src/runtime/CL/functions/CLStackLayer.cpp
index 71327fe..2700b49 100644
--- a/src/runtime/CL/functions/CLStackLayer.cpp
+++ b/src/runtime/CL/functions/CLStackLayer.cpp
@@ -46,8 +46,8 @@
 
 void CLStackLayer::configure(const std::vector<ICLTensor *> &input, int axis, ICLTensor *output)
 {
-    _num_inputs    = input.size();
-    _stack_kernels = arm_compute::support::cpp14::make_unique<CLStackLayerKernel[]>(_num_inputs);
+    _num_inputs = input.size();
+    _stack_kernels.resize(_num_inputs);
 
     // Wrap around negative values
     const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1));
diff --git a/src/runtime/CL/functions/CLUnstack.cpp b/src/runtime/CL/functions/CLUnstack.cpp
index 428d091..eb1dd8c 100644
--- a/src/runtime/CL/functions/CLUnstack.cpp
+++ b/src/runtime/CL/functions/CLUnstack.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -74,7 +74,7 @@
     // Wrap around negative values
     const unsigned int axis_u = wrap_axis(axis, input->info());
     _num_slices               = std::min(outputs_vector_info.size(), input->info()->dimension(axis_u));
-    _strided_slice_vector     = arm_compute::support::cpp14::make_unique<CLStridedSlice[]>(_num_slices);
+    _strided_slice_vector.resize(_num_slices);
 
     Coordinates slice_start;
     int32_t     slice_end_mask;
diff --git a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
index d0801a6..a8667c3 100644
--- a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
@@ -51,7 +51,7 @@
 
     // Output auto initialization if not yet initialized
     TensorInfo        tmp_output_info = *output->clone();
-    const TensorShape output_shape    = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+    const TensorShape output_shape    = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimX);
     auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
 
     switch(num_inputs)
@@ -90,7 +90,7 @@
     {
         inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
     }
-    const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+    const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimX);
 
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
@@ -109,7 +109,7 @@
             break;
         default:
             // Configure generic case WidthConcatenate kernels
-            _concat_kernels_vector = arm_compute::support::cpp14::make_unique<CLWidthConcatenateLayerKernel[]>(_num_inputs);
+            _concat_kernels_vector.resize(_num_inputs);
 
             unsigned int width_offset = 0;
             for(unsigned int i = 0; i < _num_inputs; ++i)
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index 069196e..d3c3f98 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -62,6 +62,11 @@
         output_tile = Size2D(kernel_dims.width == 1 ? 1U : 4U,
                              kernel_dims.height == 1 ? 1U : 4U);
     }
+    else if(kernel_max_dim == 7U)
+    {
+        output_tile = Size2D(kernel_dims.width == 1 ? 1U : 2U,
+                             kernel_dims.height == 1 ? 1U : 2U);
+    }
 
     return output_tile;
 }
@@ -73,7 +78,8 @@
 
     std::vector<WinogradConfiguration> fast_math_winograd =
     {
-        WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5))
+        WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)),
+        WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(7, 7))
     };
 
     auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height),
@@ -198,7 +204,7 @@
 {
     prepare();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Run input transform
     _input_transform.run();
@@ -208,8 +214,6 @@
 
     // Run output transform
     CLScheduler::get().enqueue(_output_transform);
-
-    _memory_group.release();
 }
 
 void CLWinogradConvolutionLayer::prepare()
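
The Winograd changes extend output-tile selection to 7x7 kernels (2x2 tiles, fast-math only, per the configuration tables in this patch). A condensed sketch of the selection logic visible in this diff; branches for kernel sizes not shown here are omitted:

#include <algorithm>
#include <utility>

std::pair<unsigned int, unsigned int> winograd_output_tile(unsigned int kw, unsigned int kh)
{
    const unsigned int k = std::max(kw, kh);
    const unsigned int t = (k == 7U) ? 2U : 4U;    // 7x7 -> 2x2 tile, 3x3/5x5 -> 4x4
    return { kw == 1 ? 1U : t, kh == 1 ? 1U : t }; // 1-D kernels keep a 1-wide tile
}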
diff --git a/src/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.cpp b/src/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.cpp
deleted file mode 100644
index cd97849..0000000
--- a/src/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.cpp
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.h"
-
-#include "arm_compute/core/GPUTarget.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-#include <utility>
-
-namespace arm_compute
-{
-namespace cl_gemm
-{
-namespace
-{
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_gemm_reshaped(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0,
-                                                                        bool lhs_interleave, bool rhs_interleave)
-{
-    GEMMLHSMatrixInfo lhs_info;
-    GEMMRHSMatrixInfo rhs_info;
-
-    // Configure GEMMLHSMatrixInfo
-    lhs_info.m0         = m0;
-    lhs_info.k0         = k0;
-    lhs_info.v0         = ((m / (lhs_info.m0 * v0)) == 0) ? 1 : v0;
-    lhs_info.interleave = lhs_interleave;
-    lhs_info.transpose  = false;
-
-    // Configure GEMMRHSMatrixInfo
-    rhs_info.n0         = n0;
-    rhs_info.k0         = lhs_info.k0;
-    rhs_info.h0         = ((n / (rhs_info.n0 * h0)) == 0) ? 1 : h0;
-    rhs_info.interleave = rhs_interleave;
-    rhs_info.transpose  = true;
-
-    return std::make_pair(lhs_info, rhs_info);
-}
-
-} // namespace
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
-{
-    ARM_COMPUTE_ERROR_ON(data_type != DataType::F32 && data_type != DataType::QASYMM8);
-    ARM_COMPUTE_UNUSED(data_type);
-
-    const GPUTarget gpu_target = CLScheduler::get().target();
-
-    using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (CLGEMMReshapedConfigurationBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-
-    // Configurations for Mali-G76
-    static std::map<DataType, ConfigurationFunctionExecutorPtr> gemm_reshaped_configs_G76 =
-    {
-        { DataType::F32, &CLGEMMReshapedConfigurationBifrost::configure_G76_f32 },
-        { DataType::QASYMM8, &CLGEMMReshapedConfigurationBifrost::configure_G76_u8 }
-    };
-
-    // Configurations for Mali-G7x
-    static std::map<DataType, ConfigurationFunctionExecutorPtr> gemm_reshaped_configs_G7x =
-    {
-        { DataType::F32, &CLGEMMReshapedConfigurationBifrost::configure_G7x_f32 },
-        { DataType::QASYMM8, &CLGEMMReshapedConfigurationBifrost::configure_G7x_u8 }
-    };
-
-    switch(gpu_target)
-    {
-        case GPUTarget::G76:
-            return (this->*gemm_reshaped_configs_G76[data_type])(m, n, k, b);
-        default:
-            return (this->*gemm_reshaped_configs_G7x[data_type])(m, n, k, b);
-    }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
-    ARM_COMPUTE_UNUSED(k);
-    ARM_COMPUTE_UNUSED(b);
-
-    if(n <= 4)
-    {
-        return configure_gemm_reshaped(m, n, 4, 2, 8, 16, 16, true, false);
-    }
-    else
-    {
-        return configure_gemm_reshaped(m, n, 5, 4, 4, 2, 16, false, true);
-    }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
-    ARM_COMPUTE_UNUSED(k);
-    ARM_COMPUTE_UNUSED(b);
-
-    if(dot8_supported(CLKernelLibrary::get().get_device()))
-    {
-        if(n <= 4)
-        {
-            return configure_gemm_reshaped(m, n, 4, 2, 16, 2, 2, true, false);
-        }
-        else
-        {
-            return configure_gemm_reshaped(m, n, 4, 4, 16, 2, 2, true, false);
-        }
-    }
-    else
-    {
-        if(n <= 4)
-        {
-            return configure_gemm_reshaped(m, n, 4, 2, 8, 2, 2, true, false);
-        }
-        else
-        {
-            return configure_gemm_reshaped(m, n, 6, 4, 4, 2, 2, true, true);
-        }
-    }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
-    ARM_COMPUTE_UNUSED(k);
-    ARM_COMPUTE_UNUSED(b);
-
-    if(n <= 4)
-    {
-        return configure_gemm_reshaped(m, n, 4, 2, 8, 16, 16, true, false);
-    }
-    else
-    {
-        return configure_gemm_reshaped(m, n, 4, 4, 2, 8, 16, false, false);
-    }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
-    ARM_COMPUTE_UNUSED(k);
-    ARM_COMPUTE_UNUSED(b);
-
-    if(n <= 4)
-    {
-        return configure_gemm_reshaped(m, n, 4, 2, 16, 4, 1, false, false);
-    }
-    else
-    {
-        return configure_gemm_reshaped(m, n, 4, 4, 16, 2, 2, false, true);
-    }
-}
-} // namespace cl_gemm
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/CL/tuners/CLLWSList.cpp b/src/runtime/CL/tuners/CLLWSList.cpp
new file mode 100644
index 0000000..30fd558
--- /dev/null
+++ b/src/runtime/CL/tuners/CLLWSList.cpp
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/tuners/CLLWSList.h"
+
+namespace arm_compute
+{
+namespace cl_tuner
+{
+size_t CLLWSList::size()
+{
+    return search_space_shape.total_size();
+}
+
+cl::NDRange CLLWSListExhaustive::operator[](size_t index)
+{
+    ARM_COMPUTE_ERROR_ON(index >= size());
+    auto coords = index2coords(search_space_shape, index);
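+    // index2coords unrolls the flat index over the 3D search-space shape; the +1 below
+    // shifts each 0-based coordinate to an LWS value in [1, max_lws_supported_*].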
+    return cl::NDRange{ coords[0] + 1U, coords[1] + 1U, coords[2] + 1U };
+}
+
+CLLWSListExhaustive::CLLWSListExhaustive(const cl::NDRange &gws)
+{
+    ARM_COMPUTE_UNUSED(gws);
+    search_space_shape = TensorShape(max_lws_supported_x,
+                                     max_lws_supported_y,
+                                     max_lws_supported_z);
+}
+
+cl::NDRange CLLWSListNormal::operator[](size_t index)
+{
+    ARM_COMPUTE_ERROR_ON(index >= size());
+    auto coords = index2coords(search_space_shape, index);
+    return cl::NDRange{ _lws_x[coords[0]], _lws_y[coords[1]], _lws_z[coords[2]] };
+}
+
+CLLWSListNormal::CLLWSListNormal(const cl::NDRange &gws)
+{
+    auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), max_lws_supported_x);
+    auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), max_lws_supported_y);
+    auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), max_lws_supported_z);
+
+    // Initialize the LWS values to test
+    initialize_lws_values(_lws_x, gws[0], lws_x_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16
+    initialize_lws_values(_lws_y, gws[1], lws_y_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16
+    initialize_lws_values(_lws_z, gws[2], lws_z_max, false);
+
+    search_space_shape = TensorShape(_lws_x.size(), _lws_y.size(), _lws_z.size());
+}
+
+void CLLWSListNormal::initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one)
+{
+    lws.push_back(1);
+
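+    // Worked example (illustrative): with gws = 8 (and lws_max >= 8), mod_let_one = false
+    // keeps i in {2, 4, 8} (exact divisors or powers of two), giving lws = {1, 2, 4, 8};
+    // with mod_let_one = true, i = 7 also qualifies because 8 % 7 == 1.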
+    for(unsigned int i = 2; i <= lws_max; ++i)
+    {
+        // Power of two condition
+        const bool is_power_of_two = (i & (i - 1)) == 0;
+
+        // Modulo condition, applied according to the mod_let_one flag
+        const bool mod_cond = mod_let_one ? (gws % i) <= 1 : (gws % i) == 0;
+
+        if(mod_cond || is_power_of_two)
+        {
+            lws.push_back(i);
+        }
+    }
+}
+
+CLLWSListRapid::CLLWSListRapid(const cl::NDRange &gws)
+{
+    auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), 8u); // Limit exploration to 1 - 8
+    auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), 4u); // Limit exploration to 1 - 4
+    auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), 4u); // Limit exploration to 1 - 4
+
+    // Initialize the LWS values to test
+    initialize_lws_values(_lws_x, lws_x_max);
+    initialize_lws_values(_lws_y, lws_y_max);
+    initialize_lws_values(_lws_z, lws_z_max);
+
+    search_space_shape = TensorShape(_lws_x.size(), _lws_y.size(), _lws_z.size());
+}
+
+void CLLWSListRapid::initialize_lws_values(std::vector<unsigned int> &lws, unsigned int lws_max)
+{
+    lws.push_back(1);
+
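+    // Geometric stride: i takes the values 2, 8, 32, ... so, for example, lws_max = 8
+    // yields lws = {1, 2, 8} and lws_max = 4 yields lws = {1, 2}.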
+    for(unsigned int i = 2; i <= lws_max; i *= 4)
+    {
+        lws.push_back(i);
+    }
+}
+} // namespace cl_tuner
+} // namespace arm_compute
diff --git a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
index 79e619c..9a141cb 100644
--- a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
+++ b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
@@ -600,7 +600,7 @@
         if(_info.keep_top_k() > -1 && num_det > _info.keep_top_k())
         {
             std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
-            for(auto it : indices)
+            for(auto const &it : indices)
             {
                 const int               label         = it.first;
                 const std::vector<int> &label_indices = it.second;
@@ -614,7 +614,7 @@
                 for(auto idx : label_indices)
                 {
                     ARM_COMPUTE_ERROR_ON(idx > static_cast<int>(scores.size()));
-                    score_index_pairs.push_back(std::make_pair(scores[idx], std::make_pair(label, idx)));
+                    score_index_pairs.emplace_back(std::make_pair(scores[idx], std::make_pair(label, idx)));
                 }
             }
 
diff --git a/src/runtime/CPUUtils.cpp b/src/runtime/CPUUtils.cpp
index f3355a7..f7240db 100644
--- a/src/runtime/CPUUtils.cpp
+++ b/src/runtime/CPUUtils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -54,16 +54,16 @@
 /* Make sure the bits we care about are defined, just in case asm/hwcap.h is
  * out of date (or for bare metal mode) */
 #ifndef HWCAP_ASIMDHP
-#define HWCAP_ASIMDHP (1 << 10)
-#endif /* HWCAP_ASIMDHP */
+#define HWCAP_ASIMDHP (1 << 10) // NOLINT
+#endif                          /* HWCAP_ASIMDHP */
 
 #ifndef HWCAP_CPUID
-#define HWCAP_CPUID (1 << 11)
-#endif /* HWCAP_CPUID */
+#define HWCAP_CPUID (1 << 11) // NOLINT
+#endif                        /* HWCAP_CPUID */
 
 #ifndef HWCAP_ASIMDDP
-#define HWCAP_ASIMDDP (1 << 20)
-#endif /* HWCAP_ASIMDDP */
+#define HWCAP_ASIMDDP (1 << 20) // NOLINT
+#endif                          /* HWCAP_ASIMDDP */
 
 namespace
 {
@@ -146,12 +146,12 @@
                 break;
         }
     }
-    else if(implementer == 0x48) // HiSilicon CPUs
+    else if(implementer == 0x48)
     {
         // Only CPUs we have code paths for are detected.  All other CPUs can be safely classed as "GENERIC"
         switch(cpunum)
         {
-            case 0xd40: // A76 (Kirin 980)
+            case 0xd40: // A76
                 model = CPUModel::GENERIC_FP16_DOT;
                 break;
             default:
@@ -220,8 +220,8 @@
 
         while(bool(getline(file, line)))
         {
-            regmatch_t match[2];
-            ret_status = regexec(&proc_regex, line.c_str(), 2, match, 0);
+            std::array<regmatch_t, 2> match;
+            ret_status = regexec(&proc_regex, line.c_str(), 2, match.data(), 0);
             if(ret_status == 0)
             {
                 std::string id     = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
@@ -244,7 +244,7 @@
                 continue;
             }
 
-            ret_status = regexec(&imp_regex, line.c_str(), 2, match, 0);
+            ret_status = regexec(&imp_regex, line.c_str(), 2, match.data(), 0);
             if(ret_status == 0)
             {
                 std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
@@ -254,7 +254,7 @@
                 continue;
             }
 
-            ret_status = regexec(&var_regex, line.c_str(), 2, match, 0);
+            ret_status = regexec(&var_regex, line.c_str(), 2, match.data(), 0);
             if(ret_status == 0)
             {
                 std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
@@ -264,7 +264,7 @@
                 continue;
             }
 
-            ret_status = regexec(&part_regex, line.c_str(), 2, match, 0);
+            ret_status = regexec(&part_regex, line.c_str(), 2, match.data(), 0);
             if(ret_status == 0)
             {
                 std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
@@ -274,7 +274,7 @@
                 continue;
             }
 
-            ret_status = regexec(&rev_regex, line.c_str(), 2, match, 0);
+            ret_status = regexec(&rev_regex, line.c_str(), 2, match.data(), 0);
             if(ret_status == 0)
             {
                 std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
@@ -302,8 +302,7 @@
 
 int get_max_cpus()
 {
-    int max_cpus = 1;
-#if !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__))
+    int           max_cpus = 1;
     std::ifstream CPUspresent;
     CPUspresent.open("/sys/devices/system/cpu/present", std::ios::in);
     bool success = false;
@@ -341,7 +340,6 @@
     {
         max_cpus = std::thread::hardware_concurrency();
     }
-#endif /* BARE_METAL */
     return max_cpus;
 }
 #endif /* !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__)) */
@@ -427,8 +425,8 @@
         std::string line;
         while(bool(getline(cpuinfo, line)))
         {
-            regmatch_t match[2];
-            ret_status = regexec(&cpu_part_rgx, line.c_str(), 2, match, 0);
+            std::array<regmatch_t, 2> match;
+            ret_status = regexec(&cpu_part_rgx, line.c_str(), 2, match.data(), 0);
             if(ret_status == 0)
             {
                 std::string cpu_part = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
diff --git a/src/runtime/Distribution1D.cpp b/src/runtime/Distribution1D.cpp
index 3431834..9e6fce4 100644
--- a/src/runtime/Distribution1D.cpp
+++ b/src/runtime/Distribution1D.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,12 +31,11 @@
 using namespace arm_compute;
 
 Distribution1D::Distribution1D(size_t num_bins, int32_t offset, uint32_t range)
-    : IDistribution1D(num_bins, offset, range), _data(arm_compute::support::cpp14::make_unique<uint32_t[]>(num_bins))
+    : IDistribution1D(num_bins, offset, range), _data(num_bins)
 {
 }
 
 uint32_t *Distribution1D::buffer() const
 {
-    ARM_COMPUTE_ERROR_ON(nullptr == _data);
-    return _data.get();
+    return _data.data();
 }
diff --git a/src/runtime/GLES_COMPUTE/GCMemory.cpp b/src/runtime/GLES_COMPUTE/GCMemory.cpp
index fed4a15..f1457c4 100644
--- a/src/runtime/GLES_COMPUTE/GCMemory.cpp
+++ b/src/runtime/GLES_COMPUTE/GCMemory.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,8 +33,8 @@
 {
 }
 
-GCMemory::GCMemory(std::shared_ptr<IGCMemoryRegion> memory)
-    : _region(nullptr), _region_owned(std::move(memory))
+GCMemory::GCMemory(const std::shared_ptr<IGCMemoryRegion> &memory)
+    : _region(nullptr), _region_owned(memory)
 {
     _region_owned = memory;
     _region       = _region_owned.get();
diff --git a/src/runtime/GLES_COMPUTE/GCScheduler.cpp b/src/runtime/GLES_COMPUTE/GCScheduler.cpp
index f781273..6a39e7c 100644
--- a/src/runtime/GLES_COMPUTE/GCScheduler.cpp
+++ b/src/runtime/GLES_COMPUTE/GCScheduler.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -97,7 +97,7 @@
     ARM_COMPUTE_ERROR_ON_MSG((strstr(egl_extension_st, "EGL_KHR_surfaceless_context") == nullptr), "Failed to query EGL_KHR_surfaceless_context");
     ARM_COMPUTE_UNUSED(egl_extension_st);
 
-    const EGLint config_attribs[] =
+    const std::array<EGLint, 3> config_attribs =
     {
         EGL_RENDERABLE_TYPE, EGL_OPENGL_ES3_BIT_KHR,
         EGL_NONE
@@ -105,7 +105,7 @@
     EGLConfig cfg;
     EGLint    count;
 
-    res = eglChooseConfig(_display, config_attribs, &cfg, 1, &count);
+    res = eglChooseConfig(_display, config_attribs.data(), &cfg, 1, &count);
 
     ARM_COMPUTE_ERROR_ON_MSG(res == EGL_FALSE, "Failed to choose config: 0x%x.", eglGetError());
     ARM_COMPUTE_UNUSED(res);
@@ -114,7 +114,7 @@
 
     ARM_COMPUTE_ERROR_ON_MSG(res == EGL_FALSE, "Failed to bind api: 0x%x.", eglGetError());
 
-    const EGLint attribs[] =
+    const std::array<EGLint, 3> attribs =
     {
         EGL_CONTEXT_CLIENT_VERSION, 3,
         EGL_NONE
@@ -122,7 +122,7 @@
     _context = eglCreateContext(_display,
                                 cfg,
                                 EGL_NO_CONTEXT,
-                                attribs);
+                                attribs.data());
 
     ARM_COMPUTE_ERROR_ON_MSG(_context == EGL_NO_CONTEXT, "Failed to create context: 0x%x.", eglGetError());
     ARM_COMPUTE_UNUSED(res);
diff --git a/src/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.cpp
new file mode 100644
index 0000000..506f648
--- /dev/null
+++ b/src/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+GCConcatenateLayer::GCConcatenateLayer()
+    : _concat_kernels(),
+      _num_inputs(0),
+      _axis(Window::DimZ)
+{
+}
+
+void GCConcatenateLayer::configure(std::vector<IGCTensor *> inputs_vector, IGCTensor *output, size_t axis)
+{
+    ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2);
+
+    _num_inputs = inputs_vector.size();
+    _axis       = axis;
+
+    TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, axis);
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
+
+    unsigned int offset = 0;
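+    // Each input occupies a contiguous range along the concatenation axis; offset tracks
+    // where the next input starts and advances by that input's extent after each kernel.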
+    switch(axis)
+    {
+        case Window::DimZ:
+        {
+            for(unsigned int i = 0; i < _num_inputs; ++i)
+            {
+                auto kernel = support::cpp14::make_unique<GCDepthConcatenateLayerKernel>();
+                kernel->configure(inputs_vector.at(i), offset, output);
+                offset += inputs_vector.at(i)->info()->dimension(axis);
+                _concat_kernels.emplace_back(std::move(kernel));
+            }
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Axis not supported");
+    }
+}
+
+void GCConcatenateLayer::run()
+{
+    for(auto &kernel : _concat_kernels)
+    {
+        GCScheduler::get().dispatch(*kernel, true);
+    }
+}
+} // namespace arm_compute
diff --git a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
index a35a18a..61c0740 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
@@ -201,7 +201,7 @@
 {
     prepare();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
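+    // RAII scope: the memory group is acquired here and released automatically when
+    // run() returns, even on early exit.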
 
     // Run im2col
     GCScheduler::get().dispatch(_fill_border);
@@ -216,8 +216,6 @@
     GCScheduler::get().dispatch(_output_col2im_kernel, false);
     GCScheduler::get().memory_barrier();
 
-    _memory_group.release();
-
     // Run Activation Layer
     if(_is_activationlayer_enabled)
     {
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp
index aa937a6..b89aafa 100755
--- a/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp
@@ -47,13 +47,18 @@
 
     unsigned int depth_offset = 0;
 
-    _concat_kernels_vector  = arm_compute::support::cpp14::make_unique<GCDepthConcatenateLayerKernel[]>(_num_inputs);
-    _border_handlers_vector = arm_compute::support::cpp14::make_unique<GCFillBorderKernel[]>(_num_inputs);
+    _concat_kernels_vector.reserve(_num_inputs);
+    _border_handlers_vector.reserve(_num_inputs);
 
     for(unsigned int i = 0; i < _num_inputs; i++)
     {
-        _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
-        _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue());
+        auto concat_kernel = support::cpp14::make_unique<GCDepthConcatenateLayerKernel>();
+        auto border_kernel = support::cpp14::make_unique<GCFillBorderKernel>();
+
+        concat_kernel->configure(inputs_vector.at(i), depth_offset, output);
+        border_kernel->configure(inputs_vector.at(i), concat_kernel->border_size(), BorderMode::CONSTANT, PixelValue());
+        _concat_kernels_vector.emplace_back(std::move(concat_kernel));
+        _border_handlers_vector.emplace_back(std::move(border_kernel));
 
         depth_offset += inputs_vector.at(i)->info()->dimension(2);
     }
@@ -63,8 +68,8 @@
 {
     for(unsigned i = 0; i < _num_inputs; i++)
     {
-        GCScheduler::get().dispatch(_border_handlers_vector[i], false);
+        GCScheduler::get().dispatch(*_border_handlers_vector[i], false);
         GCScheduler::get().memory_barrier();
-        GCScheduler::get().dispatch(_concat_kernels_vector[i], true);
+        GCScheduler::get().dispatch(*_concat_kernels_vector[i], true);
     }
 }
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
index ba05838..0f772bd 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
@@ -36,8 +36,10 @@
 }
 
 void GCDepthwiseConvolutionLayer3x3::configure(IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info,
-                                               unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
+                                               unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
 {
+    ARM_COMPUTE_ERROR_ON(dilation.x() != 1 || dilation.y() != 1);
+    ARM_COMPUTE_UNUSED(dilation);
     auto k = arm_compute::support::cpp14::make_unique<GCDepthwiseConvolutionLayer3x3Kernel>();
     k->configure(input, weights, biases, output, conv_info, depth_multiplier);
     _kernel = std::move(k);
diff --git a/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
index 6b8e341..a208545 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -150,7 +150,7 @@
 {
     prepare();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Linearize input if it comes from a convolutional layer
     if(_is_fc_after_conv)
@@ -173,8 +173,6 @@
 
         GCScheduler::get().dispatch(_accumulate_biases_kernel);
     }
-
-    _memory_group.release();
 }
 
 void GCFullyConnectedLayer::prepare()
diff --git a/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp b/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
index 8ae91ee..ddfe590 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -162,7 +162,7 @@
 {
     prepare();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     if(_is_interleaved_transposed)
     {
@@ -187,8 +187,6 @@
         GCScheduler::get().memory_barrier();
         GCScheduler::get().dispatch(_ma_kernel);
     }
-
-    _memory_group.release();
 }
 
 void GCGEMM::prepare()
diff --git a/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
index 2569365..8f60279 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
@@ -56,13 +56,11 @@
 
 void GCNormalizationLayer::run()
 {
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     GCScheduler::get().dispatch(_multiply_kernel, false);
     GCScheduler::get().memory_barrier();
     GCScheduler::get().dispatch(_border_handler, false);
     GCScheduler::get().memory_barrier();
     GCScheduler::get().dispatch(_norm_kernel, true);
-
-    _memory_group.release();
 }
diff --git a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
index dad42cd..0645ae7 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -69,13 +69,11 @@
 
 void GCSoftmaxLayer::run()
 {
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     GCScheduler::get().dispatch(_max_kernel, false);
     GCScheduler::get().memory_barrier();
     GCScheduler::get().dispatch(_shift_exp_sum_kernel, false);
     GCScheduler::get().memory_barrier();
     GCScheduler::get().dispatch(_norm_kernel);
-
-    _memory_group.release();
 }
diff --git a/src/runtime/HOG.cpp b/src/runtime/HOG.cpp
index 01640bb..e9f38c4 100644
--- a/src/runtime/HOG.cpp
+++ b/src/runtime/HOG.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,20 +29,19 @@
 using namespace arm_compute;
 
 HOG::HOG()
-    : IHOG(), _info(), _descriptor(nullptr)
+    : IHOG(), _info(), _descriptor()
 {
 }
 
 void HOG::init(const HOGInfo &input)
 {
-    ARM_COMPUTE_ERROR_ON(nullptr != _descriptor);
-    _info       = input;
-    _descriptor = arm_compute::support::cpp14::make_unique<float[]>(_info.descriptor_size());
+    _info = input;
+    _descriptor.resize(_info.descriptor_size());
 }
 
 float *HOG::descriptor() const
 {
-    return _descriptor.get();
+    return _descriptor.data();
 }
 
 const HOGInfo *HOG::info() const
diff --git a/src/runtime/LutAllocator.cpp b/src/runtime/LutAllocator.cpp
index eb9051c..0db5217 100644
--- a/src/runtime/LutAllocator.cpp
+++ b/src/runtime/LutAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,23 +28,23 @@
 using namespace arm_compute;
 
 LutAllocator::LutAllocator()
-    : _buffer(nullptr)
+    : _buffer()
 {
 }
 
 uint8_t *LutAllocator::data() const
 {
-    return _buffer.get();
+    return _buffer.data();
 }
 
 void LutAllocator::allocate()
 {
-    _buffer = arm_compute::support::cpp14::make_unique<uint8_t[]>(size());
+    _buffer.resize(size());
 }
 
 uint8_t *LutAllocator::lock()
 {
-    return _buffer.get();
+    return _buffer.data();
 }
 
 void LutAllocator::unlock()
diff --git a/src/runtime/Memory.cpp b/src/runtime/Memory.cpp
index d116624..c6b956d 100644
--- a/src/runtime/Memory.cpp
+++ b/src/runtime/Memory.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,8 +32,8 @@
 {
 }
 
-Memory::Memory(std::shared_ptr<IMemoryRegion> memory)
-    : _region(nullptr), _region_owned(std::move(memory))
+Memory::Memory(const std::shared_ptr<IMemoryRegion> &memory)
+    : _region(nullptr), _region_owned(memory)
 {
     _region_owned = memory;
     _region       = _region_owned.get();
diff --git a/src/runtime/MultiHOG.cpp b/src/runtime/MultiHOG.cpp
index e0b60b1..154bbd7 100644
--- a/src/runtime/MultiHOG.cpp
+++ b/src/runtime/MultiHOG.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,8 +30,9 @@
 using namespace arm_compute;
 
 MultiHOG::MultiHOG(size_t num_models)
-    : _num_models(num_models), _model(arm_compute::support::cpp14::make_unique<HOG[]>(_num_models))
+    : _num_models(num_models), _model()
 {
+    _model.resize(_num_models);
 }
 
 size_t MultiHOG::num_models() const
@@ -42,11 +43,11 @@
 IHOG *MultiHOG::model(size_t index)
 {
     ARM_COMPUTE_ERROR_ON(index >= _num_models);
-    return (_model.get() + index);
+    return (&_model[index]);
 }
 
 const IHOG *MultiHOG::model(size_t index) const
 {
     ARM_COMPUTE_ERROR_ON(index >= _num_models);
-    return (_model.get() + index);
+    return (&_model[index]);
 }
diff --git a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
index d33e134..6863bb0 100644
--- a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
+++ b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
@@ -57,15 +57,13 @@
 
 void NEArgMinMaxLayer::run()
 {
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     if(_run_fill_border)
     {
         NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
     }
     NEScheduler::get().schedule(&_reduction_kernel, Window::DimY);
-
-    _memory_group.release();
 }
 
 } // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp
new file mode 100644
index 0000000..a4db1fd
--- /dev/null
+++ b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+namespace arm_compute
+{
+void NEBatchToSpaceLayer::configure(const ITensor *input, const ITensor *block_shape, ITensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEBatchToSpaceLayerKernel>();
+    k->configure(input, block_shape, output);
+    _kernel = std::move(k);
+}
+
+void NEBatchToSpaceLayer::configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEBatchToSpaceLayerKernel>();
+    k->configure(input, block_shape_x, block_shape_y, output);
+    _kernel = std::move(k);
+}
+
+Status NEBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
+{
+    return NEBatchToSpaceLayerKernel::validate(input, block_shape, output);
+}
+
+Status NEBatchToSpaceLayer::validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output)
+{
+    return NEBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output);
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NECannyEdge.cpp b/src/runtime/NEON/functions/NECannyEdge.cpp
index 0e5d50f..032e617 100644
--- a/src/runtime/NEON/functions/NECannyEdge.cpp
+++ b/src/runtime/NEON/functions/NECannyEdge.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -162,7 +162,7 @@
 {
     ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Run sobelNxN
     _sobel->run();
@@ -184,6 +184,4 @@
 
     // Run edge tracing
     NEScheduler::get().schedule(&_edge_trace, Window::DimY);
-
-    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEConcatenateLayer.cpp b/src/runtime/NEON/functions/NEConcatenateLayer.cpp
index 21ab47d..71af560 100644
--- a/src/runtime/NEON/functions/NEConcatenateLayer.cpp
+++ b/src/runtime/NEON/functions/NEConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,9 @@
 #include "arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEWidthConcatenateLayer.h"
 
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
@@ -35,56 +38,111 @@
 namespace arm_compute
 {
 NEConcatenateLayer::NEConcatenateLayer()
-    : _concat_function(nullptr)
+    : _concat_kernels(),
+      _num_inputs(0),
+      _axis(Window::DimX)
 {
 }
 
-void NEConcatenateLayer::configure(const std::vector<ITensor *> &inputs_vector, ITensor *output, DataLayoutDimension axis)
+void NEConcatenateLayer::configure(const std::vector<ITensor *> &inputs_vector, ITensor *output, size_t axis)
 {
     ARM_COMPUTE_ERROR_ON(output == nullptr);
+    _axis       = axis;
+    _num_inputs = inputs_vector.size();
 
-    switch(get_data_layout_dimension_index(output->info()->data_layout(), axis))
+    std::vector<ITensorInfo *> inputs_vector_info;
+    inputs_vector_info.reserve(_num_inputs);
+    for(unsigned int i = 0; i < _num_inputs; ++i)
     {
-        case 0:
+        ARM_COMPUTE_ERROR_ON_NULLPTR(inputs_vector.at(i));
+        inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
+    }
+    TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, _axis);
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
+    ARM_COMPUTE_ERROR_THROW_ON(NEConcatenateLayer::validate(inputs_vector_info, output->info(), axis));
+
+    unsigned int offset = 0;
+
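+    // One kernel per input: each writes its tensor at the running offset along _axis
+    // (width, height or depth), and the offset then advances by that input's extent.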
+    for(unsigned int i = 0; i < _num_inputs; ++i)
+    {
+        switch(_axis)
         {
-            auto func = support::cpp14::make_unique<NEWidthConcatenateLayer>();
-            func->configure(inputs_vector, output);
-            _concat_function = std::move(func);
-            break;
+            case Window::DimX:
+            {
+                auto kernel = support::cpp14::make_unique<NEWidthConcatenateLayerKernel>();
+                kernel->configure(inputs_vector.at(i), offset, output);
+                _concat_kernels.emplace_back(std::move(kernel));
+                break;
+            }
+            case Window::DimY:
+            {
+                auto kernel = support::cpp14::make_unique<NEHeightConcatenateLayerKernel>();
+                kernel->configure(inputs_vector.at(i), offset, output);
+                _concat_kernels.emplace_back(std::move(kernel));
+                break;
+            }
+            case Window::DimZ:
+            {
+                auto kernel = support::cpp14::make_unique<NEDepthConcatenateLayerKernel>();
+                kernel->configure(inputs_vector.at(i), offset, output);
+                _concat_kernels.emplace_back(std::move(kernel));
+                break;
+            }
+            default:
+                ARM_COMPUTE_ERROR("Axis not supported");
         }
-        case 2:
-        {
-            auto func = support::cpp14::make_unique<NEDepthConcatenateLayer>();
-            func->configure(inputs_vector, output);
-            _concat_function = std::move(func);
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Concatenation is supported across width and depth only!");
+        offset += inputs_vector.at(i)->info()->dimension(_axis);
     }
 }
 
-Status NEConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, DataLayoutDimension axis)
+Status NEConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON(output == nullptr);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_RETURN_ERROR_ON(inputs_vector.size() < 2);
 
-    switch(get_data_layout_dimension_index(output->data_layout(), axis))
+    unsigned int offset = 0;
+    for(const auto &input : inputs_vector)
     {
-        case 0:
-            ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayer::validate(inputs_vector, output));
-            break;
-        case 2:
-            ARM_COMPUTE_RETURN_ON_ERROR(NEDepthConcatenateLayer::validate(inputs_vector, output));
-            break;
-        default:
-            ARM_COMPUTE_RETURN_ERROR_MSG("Concatenation is supported across width and depth only!");
+        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+        switch(axis)
+        {
+            case Window::DimX:
+            {
+                ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayerKernel::validate(input, offset, output));
+                break;
+            }
+            case Window::DimY:
+            {
+                ARM_COMPUTE_RETURN_ON_ERROR(NEHeightConcatenateLayerKernel::validate(input, offset, output));
+                break;
+            }
+            case Window::DimZ:
+            {
+                ARM_COMPUTE_RETURN_ON_ERROR(NEDepthConcatenateLayerKernel::validate(input, offset, output));
+                break;
+            }
+            default:
+                ARM_COMPUTE_RETURN_ERROR_MSG("Axis not supported");
+        }
+        offset += input->dimension(axis);
     }
+
+    if(output->total_size() != 0)
+    {
+        TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, axis);
+        ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
+    }
+
     return Status{};
 }
 
 void NEConcatenateLayer::run()
 {
-    ARM_COMPUTE_ERROR_ON(_concat_function == nullptr);
-    _concat_function->run();
+    for(auto &kernel : _concat_kernels)
+    {
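+        // The scheduler splits each kernel's execution window across threads
+        // along the concatenation axis.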
+        NEScheduler::get().schedule(kernel.get(), _axis);
+    }
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEConvolution.cpp b/src/runtime/NEON/functions/NEConvolution.cpp
index b84dfd3..973855e 100644
--- a/src/runtime/NEON/functions/NEConvolution.cpp
+++ b/src/runtime/NEON/functions/NEConvolution.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -103,12 +103,10 @@
 
     if(_is_separable)
     {
-        _memory_group.acquire();
+        MemoryGroupResourceScope scope_mg(_memory_group);
 
         NEScheduler::get().schedule(&_kernel_hor, Window::DimY);
         NEScheduler::get().schedule(&_kernel_vert, Window::DimY);
-
-        _memory_group.release();
     }
     else
     {
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 5059162..a62459b 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -73,6 +73,13 @@
             _function = std::move(f);
             break;
         }
+        case ConvolutionMethod::FFT:
+        {
+            auto f = arm_compute::support::cpp14::make_unique<NEFFTConvolutionLayer>(_memory_manager);
+            f->configure(input, weights, biases, output, conv_info, act_info);
+            _function = std::move(f);
+            break;
+        }
         default:
             ARM_COMPUTE_ERROR("Not supported.");
             break;
@@ -97,6 +104,10 @@
         case ConvolutionMethod::DIRECT:
             // Validate Direct Convolution
             ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info));
+            break;
+        case ConvolutionMethod::FFT:
+            // Validate FFT-based convolution layer
+            ARM_COMPUTE_RETURN_ON_ERROR(NEFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info));
+            break;
         default:
             ARM_COMPUTE_ERROR("Not supported.");
             break;
@@ -148,12 +159,22 @@
         return (*found).second;
     }
 
-    if(dilation != Size2D(1U, 1U) || input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) <= 16)
+    if(dilation != Size2D(1U, 1U))
     {
         return ConvolutionMethod::GEMM;
     }
-
-    return bool(NEWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM;
+    else
+    {
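+        // Heuristic encoded below (assuming idx_h/idx_c index the kernel height and channel
+        // dimensions): prefer FFT for tall kernels (height > 7) when the input has more
+        // channels than the output and FFT validation succeeds.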
+        if((weights->dimension(idx_h) > 7) && (input->dimension(idx_c) > output->dimension(idx_c)) && (NEFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info)))
+        {
+            return ConvolutionMethod::FFT;
+        }
+        if(input->dimension(idx_c) < 16)
+        {
+            return ConvolutionMethod::GEMM;
+        }
+        return bool(NEWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM;
+    }
 }
 
 void NEConvolutionLayer::run()
diff --git a/src/runtime/NEON/functions/NECropResize.cpp b/src/runtime/NEON/functions/NECropResize.cpp
new file mode 100644
index 0000000..cc39d02
--- /dev/null
+++ b/src/runtime/NEON/functions/NECropResize.cpp
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "arm_compute/runtime/NEON/functions/NECropResize.h"
+
+#include <cstddef>
+
+namespace arm_compute
+{
+NECropResize::NECropResize()
+    : _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _crop(), _scale(), _crop_results(), _scaled_results()
+{
+}
+
+Status NECropResize::validate(const ITensorInfo *input, const ITensorInfo *boxes, const ITensorInfo *box_ind, const ITensorInfo *output,
+                              Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(crop_size.x <= 0 || crop_size.y <= 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(method == InterpolationPolicy::AREA);
+    TensorInfo temp_info;
+    ARM_COMPUTE_RETURN_ON_ERROR(NECropKernel::validate(input->clone().get(), boxes->clone().get(), box_ind->clone().get(), &temp_info, boxes->tensor_shape()[1] - 1, extrapolation_value));
+    if(output->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+        TensorShape out_shape(input->tensor_shape()[0], crop_size.x, crop_size.y, boxes->tensor_shape()[1]);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), out_shape);
+    }
+    return Status{};
+}
+
+void NECropResize::configure(const ITensor *input, const ITensor *boxes, const ITensor *box_ind, ITensor *output, Coordinates2D crop_size,
+                             InterpolationPolicy method, float extrapolation_value)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(NECropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), crop_size, method, extrapolation_value));
+
+    _num_boxes = boxes->info()->tensor_shape()[1];
+    TensorShape out_shape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y);
+
+    _output              = output;
+    _method              = method;
+    _extrapolation_value = extrapolation_value;
+
+    // For each crop box:
+    // - A crop kernel is used to extract the initial cropped image as specified by boxes[i] from the 3D image input[box_ind[i]].
+    // - A tensor is required to hold this initial cropped image.
+    // - A scale function is used to resize the cropped image to the size specified by crop_size.
+    // - A tensor is required to hold the final scaled image before it is copied into the 4D output
+    //   that will hold all final cropped and scaled 3D images.
+    _crop.reserve(_num_boxes);
+    _crop_results.reserve(_num_boxes);
+    _scaled_results.reserve(_num_boxes);
+    _scale.reserve(_num_boxes);
+
+    for(unsigned int i = 0; i < _num_boxes; ++i)
+    {
+        auto       crop_tensor = support::cpp14::make_unique<Tensor>();
+        TensorInfo crop_result_info(1, DataType::F32);
+        crop_result_info.set_data_layout(DataLayout::NHWC);
+        crop_tensor->allocator()->init(crop_result_info);
+
+        auto       scale_tensor = support::cpp14::make_unique<Tensor>();
+        TensorInfo scaled_result_info(out_shape, 1, DataType::F32);
+        scaled_result_info.set_data_layout(DataLayout::NHWC);
+        scale_tensor->allocator()->init(scaled_result_info);
+
+        auto crop_kernel  = support::cpp14::make_unique<NECropKernel>();
+        auto scale_kernel = support::cpp14::make_unique<NEScale>();
+        crop_kernel->configure(input, boxes, box_ind, crop_tensor.get(), i, _extrapolation_value);
+
+        _crop.emplace_back(std::move(crop_kernel));
+        _scaled_results.emplace_back(std::move(scale_tensor));
+        _crop_results.emplace_back(std::move(crop_tensor));
+        _scale.emplace_back(std::move(scale_kernel));
+    }
+}
+
+void NECropResize::run()
+{
+    ARM_COMPUTE_ERROR_ON_MSG(_output == nullptr, "Unconfigured function");
+
+    for(unsigned int i = 0; i < _num_boxes; ++i)
+    {
+        // Size of the crop box in _boxes and thus the shape of _crop_results[i]
+        // may not be known until run-time and so the kernels cannot be configured until then.
+        _crop[i]->configure_output_shape();
+        _crop_results[i]->allocator()->allocate();
+        NEScheduler::get().schedule(_crop[i].get(), Window::DimZ);
+
+        // Scale the cropped image.
+        _scale[i]->configure(_crop_results[i].get(), _scaled_results[i].get(), _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT, false);
+        _scaled_results[i]->allocator()->allocate();
+        _scale[i]->run();
+
+        // Copy scaled image into output.
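+        // The 4D output is indexed as (x, y, z, box): each scaled 3D result fills the
+        // slice starting at Coordinates(0, 0, 0, i).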
+        std::copy_n(_scaled_results[i]->buffer(), _scaled_results[i]->info()->total_size(), _output->ptr_to_element(Coordinates(0, 0, 0, i)));
+    }
+}
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index 44d7197..aff335e 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -51,8 +51,8 @@
                                       unsigned int inner_border_right, unsigned int inner_border_top)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32, DataType::QASYMM8);
     ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->dimension(1));
     ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) < 1);
     ARM_COMPUTE_RETURN_ERROR_ON(!info.padding_is_symmetric());
@@ -68,7 +68,11 @@
 
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
 
-    if(bias != nullptr)
+    if(is_data_type_quantized_asymmetric(input->data_type()))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
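+        // Quantized (QASYMM8) deconvolution uses 32-bit integer biases; the float path
+        // below requires biases of the same data type as the input.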
+    }
+    else
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
     }
@@ -111,10 +115,11 @@
     _inner_border     = std::make_pair(inner_border_right, inner_border_top);
     _is_prepared      = false;
 
-    const unsigned int stride_x = info.stride().first;
-    const unsigned int stride_y = info.stride().second;
+    const DataLayout   data_layout = input->info()->data_layout();
+    const unsigned int stride_x    = info.stride().first;
+    const unsigned int stride_y    = info.stride().second;
 
-    _weights_flipped.allocator()->init(TensorInfo(weights->info()->tensor_shape(), 1, weights->info()->data_type()));
+    _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
     _flip_weights.configure(weights, &_weights_flipped);
 
     auto out_dims = deconvolution_output_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0), weights->info()->dimension(1),
@@ -159,12 +164,10 @@
 {
     prepare();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     _upsample_f.run();
     _conv_f.run();
-
-    _memory_group.release();
 }
 
 void NEDeconvolutionLayer::prepare()
diff --git a/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp b/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp
index 49db855..8f070a2 100644
--- a/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,26 +45,30 @@
 
 void NEDepthConcatenateLayer::configure(const std::vector<ITensor *> &inputs_vector, ITensor *output) // NOLINT
 {
-    _num_inputs             = inputs_vector.size();
-    _concat_kernels_vector  = arm_compute::support::cpp14::make_unique<NEDepthConcatenateLayerKernel[]>(_num_inputs);
-    _border_handlers_vector = arm_compute::support::cpp14::make_unique<NEFillBorderKernel[]>(_num_inputs);
+    _num_inputs = inputs_vector.size();
 
     std::vector<ITensorInfo *> inputs_vector_info;
     for(unsigned int i = 0; i < _num_inputs; i++)
     {
         inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
     }
-    TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_depth_concatenate_shape(inputs_vector_info);
+    TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector_info, Window::DimZ);
 
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
     ARM_COMPUTE_ERROR_THROW_ON(NEDepthConcatenateLayer::validate(inputs_vector_info, output->info()));
 
     unsigned int depth_offset = 0;
+    _concat_kernels_vector.reserve(_num_inputs);
+    _border_handlers_vector.reserve(_num_inputs);
     for(unsigned int i = 0; i < _num_inputs; ++i)
     {
-        _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
-        _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
+        auto concat_kernel = support::cpp14::make_unique<NEDepthConcatenateLayerKernel>();
+        auto border_kernel = support::cpp14::make_unique<NEFillBorderKernel>();
+        concat_kernel->configure(inputs_vector.at(i), depth_offset, output);
+        border_kernel->configure(inputs_vector.at(i), concat_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
+        _border_handlers_vector.emplace_back(std::move(border_kernel));
+        _concat_kernels_vector.emplace_back(std::move(concat_kernel));
 
         depth_offset += inputs_vector.at(i)->info()->dimension(2);
     }
@@ -80,7 +84,7 @@
 
     // Output auto initialization if not yet initialized
     TensorInfo  tmp_output_info = *output->clone();
-    TensorShape output_shape    = arm_compute::misc::shape_calculator::calculate_depth_concatenate_shape(inputs_vector);
+    TensorShape output_shape    = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimZ);
     auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
 
     unsigned int depth_offset = 0;
@@ -98,7 +102,7 @@
 {
     for(unsigned i = 0; i < _num_inputs; ++i)
     {
-        NEScheduler::get().schedule(&_border_handlers_vector[i], Window::DimX);
-        NEScheduler::get().schedule(&_concat_kernels_vector[i], Window::DimX);
+        NEScheduler::get().schedule(_border_handlers_vector[i].get(), Window::DimX);
+        NEScheduler::get().schedule(_concat_kernels_vector[i].get(), Window::DimX);
     }
 }
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index f0fd4cf..3bb69b1 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -31,112 +31,79 @@
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "support/ToolchainSupport.h"
 
-using namespace arm_compute;
+#include "arm_compute/core/utils/misc/InfoHelpers.h"
+
 using namespace arm_compute::misc;
 using namespace arm_compute::misc::shape_calculator;
 
-NEDepthwiseConvolutionLayer3x3::NEDepthwiseConvolutionLayer3x3()
-    : _dwc_kernel(), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(), _activationlayer_function(), _accumulator(), _permuted_input(),
-      _permuted_weights(), _permuted_output(), _has_bias(false), _is_quantized(false), _is_optimized(false), _are_weights_reshaped(false), _is_nchw(true), _is_first_run(true), _permute(false),
-      _is_activationlayer_enabled(false)
+namespace arm_compute
+{
+NEDepthwiseConvolutionLayer3x3::NEDepthwiseConvolutionLayer3x3(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(memory_manager), _dwc_kernel(), _dwc_optimized_func(memory_manager), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(),
+      _activationlayer_function(), _accumulator(), _permuted_input(), _permuted_weights(), _permuted_output(), _original_weights(nullptr), _has_bias(false), _is_quantized(false), _is_optimized(false),
+      _is_nchw(true), _permute(false), _is_activationlayer_enabled(false), _is_prepared(false)
 {
 }
 
-void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
-                                               unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
+void NEDepthwiseConvolutionLayer3x3::configure_generic(ITensor                   *input,
+                                                       const ITensor             *weights,
+                                                       const ITensor             *biases,
+                                                       ITensor                   *output,
+                                                       const PadStrideInfo       &conv_info,
+                                                       unsigned int               depth_multiplier,
+                                                       const ActivationLayerInfo &act_info,
+                                                       const Size2D              &dilation)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    ARM_COMPUTE_UNUSED(act_info);
 
     PixelValue zero_value(0.f);
 
-    _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
-    _has_bias     = biases != nullptr;
-    _is_optimized = NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(input->info()->tensor_shape(),
-                                                                                          conv_info,
-                                                                                          input->info()->data_type(),
-                                                                                          depth_multiplier,
-                                                                                          input->info()->data_layout());
-    _are_weights_reshaped = false;
-    _is_nchw              = input->info()->data_layout() == DataLayout::NCHW;
-    _permute              = _is_optimized == _is_nchw;
-
     // Initialize the intermediate accumulator tensor in case of quantized input
     if(_is_quantized)
     {
         TensorShape accum_shape  = output->info()->tensor_shape();
         DataLayout  accum_layout = output->info()->data_layout();
-        if(!_is_optimized && !_is_nchw)
+        if(!_is_nchw)
         {
             permute(accum_shape, PermutationVector(1U, 2U, 0U));
             accum_layout = DataLayout::NCHW;
         }
 
+        _memory_group.manage(&_accumulator);
         _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32, output->info()->quantization_info()));
         _accumulator.info()->set_data_layout(accum_layout);
         zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
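+        // The border is filled with the quantization offset, i.e. the quantized
+        // representation of the real value 0.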
     }
 
-    if(_is_optimized)
+    if(!_is_nchw)
     {
-        ITensor *optimized_output = (_is_quantized) ? &_accumulator : output;
-        if(_is_nchw)
-        {
-            // Configure the function to transform the input tensor from NCHW -> NHWC
-            _permute_input.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
-            _permuted_input.info()->set_data_layout(DataLayout::NHWC);
+        _memory_group.manage(&_permuted_input);
+        _memory_group.manage(&_permuted_output);
 
-            // Configure the function to transform the weights tensor from IHW -> HWI
-            _permute_weights.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
-            _permuted_weights.info()->set_data_layout(DataLayout::NHWC);
+        // Configure the function to transform the input tensor from NHWC -> NCHW
+        _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+        _permuted_input.info()->set_data_layout(DataLayout::NCHW);
 
-            // Configure optimized depthwise
-            _dwc_kernel.configure(&_permuted_input, &_permuted_weights, &_permuted_output, conv_info, depth_multiplier, DataLayout::NHWC);
+        // Configure the function to transform the weights tensor from HWI -> IHW
+        _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
+        _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
 
-            // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
-            _permuted_output.info()->set_data_layout(DataLayout::NHWC);
-            _permute_output.configure(&_permuted_output, optimized_output, PermutationVector(1U, 2U, 0U));
+        // Configure depthwise
+        _dwc_kernel.configure(&_permuted_input, &_permuted_weights, (_is_quantized) ? &_accumulator : &_permuted_output, conv_info, depth_multiplier, dilation);
 
-            // Allocate tensors
-            _permuted_input.allocator()->allocate();
-            _permuted_weights.allocator()->allocate();
-            _permuted_output.allocator()->allocate();
-        }
-        else
-        {
-            _dwc_kernel.configure(input, weights, optimized_output, conv_info, depth_multiplier, DataLayout::NHWC);
-        }
+        // Configure border handler
+        _border_handler.configure(&_permuted_input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
+
+        // Allocate tensors
+        _permuted_input.allocator()->allocate();
     }
     else
     {
-        if(!_is_nchw)
-        {
-            // Configure the function to transform the input tensor from NHWC -> NCHW
-            _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
-            _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+        // Configure depthwise convolution kernel
+        _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier, dilation);
 
-            // Configure the function to transform the weights tensor from HWI -> IHW
-            _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
-            _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
-
-            // Configure optimized depthwise
-            _dwc_kernel.configure(&_permuted_input, &_permuted_weights, (_is_quantized) ? &_accumulator : &_permuted_output, conv_info, depth_multiplier);
-
-            // Configure border handler
-            _border_handler.configure(&_permuted_input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
-
-            // Allocate tensors
-            _permuted_input.allocator()->allocate();
-            _permuted_weights.allocator()->allocate();
-        }
-        else
-        {
-            // Configure depthwise convolution kernel
-            _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier);
-
-            // Configure border handler
-            _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
-        }
+        // Configure border handler
+        _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
     }
 
     // Configure biases accumulation
@@ -145,37 +112,138 @@
         const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
 
         float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
-        int   output_multiplier, output_shift;
+        int   output_multiplier;
+        int   output_shift;
         quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
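+        // calculate_quantized_multiplier_less_than_one() expresses the real multiplier as
+        // output_multiplier * 2^(-output_shift), where output_multiplier is a Q0.31
+        // fixed-point value. E.g. a multiplier of 0.25 yields output_multiplier =
+        // 1073741824 (0.5 in Q0.31) and output_shift = 1.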
-        _output_stage_kernel.configure(&_accumulator, biases, (_is_nchw || _is_optimized) ? output : &_permuted_output, output_multiplier, output_shift, output_quant_info.offset);
+        _output_stage_kernel.configure(&_accumulator, biases, _is_nchw ? output : &_permuted_output, output_multiplier, output_shift, output_quant_info.offset);
         _accumulator.allocator()->allocate();
     }
     else if(_has_bias)
     {
-        _output_stage_kernel.configure((_is_nchw || _is_optimized) ? output : &_permuted_output, biases);
+        _output_stage_kernel.configure(_is_nchw ? output : &_permuted_output, biases);
     }
 
-    if(!_is_optimized && !_is_nchw)
+    // Permute output
+    if(!_is_nchw)
     {
         // Configure the function to transform the convoluted output to NHWC
         _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
         _permuted_output.allocator()->allocate();
     }
+}
 
-    //Configure Activation Layer
+void NEDepthwiseConvolutionLayer3x3::configure_optimized(const ITensor             *input,
+                                                         const ITensor             *weights,
+                                                         const ITensor             *biases,
+                                                         ITensor                   *output,
+                                                         const PadStrideInfo       &conv_info,
+                                                         unsigned int               depth_multiplier,
+                                                         const ActivationLayerInfo &act_info)
+{
+    ActivationLayerInfo act_info_to_use = ActivationLayerInfo();
+    const bool          is_relu         = arm_compute::utils::info_helpers::is_relu(act_info);
+    const bool          is_relu6        = arm_compute::utils::info_helpers::is_relu6(act_info);
+    _is_activationlayer_enabled         = act_info.enabled() && !(is_relu || is_relu6);
+    if(!_is_activationlayer_enabled)
+    {
+        act_info_to_use = act_info;
+    }
+
+    if(_is_nchw)
+    {
+        _memory_group.manage(&_permuted_input);
+        _memory_group.manage(&_permuted_output);
+
+        // Configure the function to transform the input tensor from NCHW -> NHWC
+        _permute_input.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
+        _permuted_input.info()->set_data_layout(DataLayout::NHWC);
+
+        // Configure the function to transform the weights tensor from IHW -> HWI
+        _permute_weights.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
+        _permuted_weights.info()->set_data_layout(DataLayout::NHWC);
+
+        // Configure optimized depthwise
+        _dwc_optimized_func.configure(&_permuted_input, &_permuted_weights, biases, &_permuted_output, conv_info, depth_multiplier, act_info_to_use);
+
+        // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
+        _permuted_output.info()->set_data_layout(DataLayout::NHWC);
+        _permute_output.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U));
+
+        // Allocate tensors
+        _permuted_input.allocator()->allocate();
+        _permuted_output.allocator()->allocate();
+    }
+    else
+    {
+        _dwc_optimized_func.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info_to_use);
+    }
+}
+
+void NEDepthwiseConvolutionLayer3x3::configure(ITensor       *input,
+                                               const ITensor *weights,
+                                               const ITensor *biases,
+                                               ITensor *output, const PadStrideInfo &conv_info,
+                                               unsigned int               depth_multiplier,
+                                               const ActivationLayerInfo &act_info,
+                                               const Size2D              &dilation)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+
+    // idx_w and idx_h are only used for validation
+    const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+    ARM_COMPUTE_UNUSED(idx_w);
+    ARM_COMPUTE_UNUSED(idx_h);
+
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_w) + (weights->info()->dimension(idx_w) - 1) * (dilation.x() - 1) > input->info()->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_h) + (weights->info()->dimension(idx_h) - 1) * (dilation.y() - 1) > input->info()->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
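+
+    // The two checks above bound the effective (dilated) kernel extent,
+    // k_eff = k + (k - 1) * (dilation - 1), by the padded input extent. For example, a
+    // 3x3 kernel with dilation (2, 2) has k_eff = 3 + 2 * 1 = 5 per dimension, so the
+    // padded input must be at least 5 elements wide and 5 elements high.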
+
+    _original_weights = weights;
+    _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _has_bias         = biases != nullptr;
+    _is_optimized     = NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input->info(),
+                                                                                       weights->info(),
+                                                                                       conv_info,
+                                                                                       depth_multiplier, dilation);
+    _is_nchw                    = input->info()->data_layout() == DataLayout::NCHW;
+    _permute                    = _is_optimized == _is_nchw;
+    _is_prepared                = false;
     _is_activationlayer_enabled = act_info.enabled();
 
+    // Configure appropriate pipeline
+    if(_is_optimized)
+    {
+        configure_optimized(input, weights, biases, output, conv_info, depth_multiplier, act_info);
+    }
+    else
+    {
+        configure_generic(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+    }
+
+    // Configure activation
     if(_is_activationlayer_enabled)
     {
         _activationlayer_function.configure(output, nullptr, act_info);
     }
 }
 
-Status NEDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                                unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
+Status NEDepthwiseConvolutionLayer3x3::validate(const ITensorInfo         *input,
+                                                const ITensorInfo         *weights,
+                                                const ITensorInfo         *biases,
+                                                const ITensorInfo         *output,
+                                                const PadStrideInfo       &conv_info,
+                                                unsigned int               depth_multiplier,
+                                                const ActivationLayerInfo &act_info,
+                                                const Size2D              &dilation)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
+    ARM_COMPUTE_RETURN_ERROR_ON(dilation.x() < 1 || dilation.y() < 1);
+    const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
 
     if(biases != nullptr)
     {
@@ -184,14 +252,20 @@
         ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx));
     }
 
-    const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
-    TensorInfo accumulator  = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
-
-    ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionLayer3x3Kernel::validate(input, weights, is_quantized ? &accumulator : output, conv_info, depth_multiplier));
-
-    if(is_quantized)
+    if(!NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input, weights, conv_info, depth_multiplier, dilation))
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&accumulator, biases, output));
+        const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+        TensorInfo accumulator  = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionLayer3x3Kernel::validate(input, weights, is_quantized ? &accumulator : output, conv_info, depth_multiplier));
+
+        if(is_quantized)
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&accumulator, biases, output));
+        }
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionAssemblyDispatch::validate(input, weights, biases, output, conv_info, depth_multiplier));
     }
 
     // Validate Activation Layer
@@ -203,43 +277,14 @@
     return Status{};
 }
 
-void NEDepthwiseConvolutionLayer3x3::run()
+void NEDepthwiseConvolutionLayer3x3::run_generic()
 {
-    if(_is_first_run && _is_optimized)
-    {
-        _is_first_run = false;
-        // Create convolver (deferred)
-        _dwc_kernel.generate_convolver();
-    }
-
-    // Permute weights
-    if(_permute)
-    {
-        if(!_are_weights_reshaped)
-        {
-            _are_weights_reshaped = true;
-            _permute_weights.run();
-        }
-
-        _permute_input.run();
-    }
-
-    // Handle input
-    if(!_is_optimized)
-    {
-        // Fill border
-        NEScheduler::get().schedule(&_border_handler, Window::DimX);
-    }
+    // Fill border
+    NEScheduler::get().schedule(&_border_handler, Window::DimX);
 
     // Execute depthwise convolution
     NEScheduler::get().schedule(&_dwc_kernel, Window::DimX);
 
-    // Permute output
-    if(_is_optimized && _is_nchw)
-    {
-        _permute_output.run();
-    }
-
     // Add biases
     if(_has_bias || _is_quantized)
     {
@@ -247,17 +292,71 @@
     }
 
     // Permute output
-    if(!_is_optimized && !_is_nchw)
+    if(!_is_nchw)
     {
         _permute_output.run();
     }
+}
 
+void NEDepthwiseConvolutionLayer3x3::run_optimized()
+{
+    // Run assembly function
+    _dwc_optimized_func.run();
+
+    // Permute output
+    if(_is_nchw)
+    {
+        _permute_output.run();
+    }
+}
+
+void NEDepthwiseConvolutionLayer3x3::run()
+{
+    prepare();
+
+    MemoryGroupResourceScope scope_mg(_memory_group);
+
+    // Permute input
+    if(_permute)
+    {
+        _permute_input.run();
+    }
+
+    _is_optimized ? run_optimized() : run_generic();
+
+    // Run activation
     if(_is_activationlayer_enabled)
     {
         _activationlayer_function.run();
     }
 }
 
+void NEDepthwiseConvolutionLayer3x3::prepare()
+{
+    if(!_is_prepared)
+    {
+        // Permute weights
+        if(_permute)
+        {
+            _permuted_weights.allocator()->allocate();
+            _permute_weights.run();
+            _original_weights->mark_as_unused();
+        }
+
+        // Prepare optimized function
+        if(_is_optimized)
+        {
+            _dwc_optimized_func.prepare();
+            if(!_permuted_weights.is_used())
+            {
+                _permuted_weights.allocator()->free();
+            }
+        }
+
+        _is_prepared = true;
+    }
+}
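+
+// A minimal usage sketch (illustrative only; tensor setup and allocation are elided):
+//
+//   NEDepthwiseConvolutionLayer3x3 dwc;
+//   dwc.configure(&input, &weights, &biases, &output,
+//                 PadStrideInfo(1, 1, 1, 1), 1 /* depth_multiplier */,
+//                 ActivationLayerInfo(), Size2D(1, 1) /* dilation */);
+//   dwc.run(); // the first run() calls prepare(), which permutes the weights once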
+
 NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer()
     : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _permute_input(),
       _permute_weights(), _permute_output(), _activationlayer_function(), _input_reshaped(), _weights_reshaped(), _v2mm_output(), _output_reshaped(), _permuted_input(), _permuted_weights(),
@@ -266,14 +365,21 @@
 }
 
 void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
-                                            unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
+                                            unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
 {
     const unsigned int channel_idx = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
     ARM_COMPUTE_UNUSED(channel_idx);
-
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
     ARM_COMPUTE_ERROR_ON((input->info()->dimension(channel_idx) * depth_multiplier) != weights->info()->dimension(channel_idx));
+    // idx_w and idx_h are only used for validation
+    const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+    ARM_COMPUTE_UNUSED(idx_w);
+    ARM_COMPUTE_UNUSED(idx_h);
+
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_w) + (weights->info()->dimension(idx_w) - 1) * (dilation.x() - 1) > input->info()->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_h) + (weights->info()->dimension(idx_h) - 1) * (dilation.y() - 1) > input->info()->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
 
     _is_nhwc = input->info()->data_layout() == DataLayout::NHWC;
 
@@ -304,7 +410,7 @@
     bool append_bias = (biases != nullptr) && !_is_quantized;
 
     // Calculate output shape
-    TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
+    TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier, dilation);
 
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
@@ -332,7 +438,7 @@
     shape_im2col.set(1, conv_size);
     shape_im2col.set(2, weights_z);
     _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col).set_data_layout(DataLayout::NCHW));
-    _im2col_kernel.configure(input_to_use, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
+    _im2col_kernel.configure(input_to_use, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier, dilation);
 
     // Weights reshape configuration
     const TensorShape shape_weights_reshape(patch_size, weights_z);
@@ -356,7 +462,8 @@
         const QuantizationInfo output_quant_info = output->info()->quantization_info();
 
         float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
-        int   output_multiplier, output_shift;
+        int   output_multiplier;
+        int   output_shift;
         quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
         _output_stage_kernel.configure(&_output_reshaped, biases, output_to_use, output_multiplier, output_shift, output_quant_info.offset);
         _output_reshaped.allocator()->allocate();
@@ -399,14 +506,17 @@
 }
 
 Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                             unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
+                                             unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
+    ARM_COMPUTE_RETURN_ERROR_ON(dilation.x() < 1 || dilation.y() < 1);
 
     const unsigned int width_idx  = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
     const unsigned int height_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
 
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) + (weights->dimension(width_idx) - 1) * (dilation.x() - 1) > input->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right());
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(height_idx) + (weights->dimension(height_idx) - 1) * (dilation.y() - 1) > input->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom());
     // Clone output to use auto init
     auto output_clone = output->clone();
 
@@ -433,7 +543,7 @@
 
     const bool         is_quantized = is_data_type_quantized_asymmetric(input->data_type());
     const bool         append_bias  = (biases != nullptr) && !is_quantized;
-    TensorShape        output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+    TensorShape        output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
     const size_t       weights_w    = weights_to_use->dimension(0);
     const size_t       weights_h    = weights_to_use->dimension(1);
     const size_t       weights_z    = weights_to_use->dimension(2);
@@ -460,7 +570,7 @@
     shape_im2col.set(1, conv_size);
     shape_im2col.set(2, weights_z);
     TensorInfo input_reshaped(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col).set_data_layout(DataLayout::NCHW));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseIm2ColKernel::validate(input_to_use, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseIm2ColKernel::validate(input_to_use, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier, dilation));
 
     // Weights reshape configuration
     const TensorShape shape_weights_reshape(patch_size, weights_z);
@@ -542,3 +652,4 @@
         _is_prepared = true;
     }
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEDequantizationLayer.cpp b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
index 0627977..e92b4bf 100644
--- a/src/runtime/NEON/functions/NEDequantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,34 +24,20 @@
 
 #include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
 
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h"
+#include "support/ToolchainSupport.h"
 
-using namespace arm_compute;
-
-NEDequantizationLayer::NEDequantizationLayer()
-    : _dequantize_kernel()
+namespace arm_compute
 {
+void NEDequantizationLayer::configure(const ITensor *input, ITensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEDequantizationLayerKernel>();
+    k->configure(input, output);
+    _kernel = std::move(k);
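+    // The function is now a thin wrapper owning a single kernel; run() is inherited from
+    // the simple-function base class, which schedules _kernel, so no scheduler call is
+    // needed here.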
 }
 
-Status NEDequantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+Status NEDequantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, min_max);
-    ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayerKernel::validate(input, output, min_max));
-
-    return Status{};
+    return NEDequantizationLayerKernel::validate(input, output);
 }
-
-void NEDequantizationLayer::configure(const ITensor *input, ITensor *output, const ITensor *min_max)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, min_max);
-
-    // Configure kernel
-    _dequantize_kernel.configure(input, output, min_max);
-}
-
-void NEDequantizationLayer::run()
-{
-    NEScheduler::get().schedule(&_dequantize_kernel, Window::DimY);
-}
\ No newline at end of file
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index 40e40c8..322bb2c 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -105,7 +105,7 @@
 {
     NEScheduler::get().schedule(&_input_border_handler, Window::DimZ);
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
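+    // RAII scope: the memory group's backing memory is acquired here and released
+    // automatically when run() returns, replacing the explicit acquire()/release() pair.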
 
     NEScheduler::get().schedule(&_conv_kernel, _dim_split);
     if(_has_bias)
@@ -117,5 +117,4 @@
     {
         _activationlayer_function.run();
     }
-    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEFFT1D.cpp b/src/runtime/NEON/functions/NEFFT1D.cpp
new file mode 100644
index 0000000..25ba1c8
--- /dev/null
+++ b/src/runtime/NEON/functions/NEFFT1D.cpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEFFT1D.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/helpers/fft.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NEFFT1D::NEFFT1D(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _digit_reverse_kernel(), _fft_kernels(), _scale_kernel(), _digit_reversed_input(), _digit_reverse_indices(), _num_ffts(0), _axis(0), _run_scale(false)
+{
+}
+
+void NEFFT1D::configure(const ITensor *input, ITensor *output, const FFT1DInfo &config)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(NEFFT1D::validate(input->info(), output->info(), config));
+
+    // Decompose size to radix factors
+    const auto         supported_radix   = NEFFTRadixStageKernel::supported_radix();
+    const unsigned int N                 = input->info()->tensor_shape()[config.axis];
+    const auto         decomposed_vector = arm_compute::helpers::fft::decompose_stages(N, supported_radix);
+    ARM_COMPUTE_ERROR_ON(decomposed_vector.empty());
+
+    // Flags
+    _run_scale = config.direction == FFTDirection::Inverse;
+
+    const bool is_c2r = input->info()->num_channels() == 2 && output->info()->num_channels() == 1;
+
+    // Configure digit reverse
+    FFTDigitReverseKernelInfo digit_reverse_config;
+    digit_reverse_config.axis      = config.axis;
+    digit_reverse_config.conjugate = config.direction == FFTDirection::Inverse;
+    TensorInfo digit_reverse_indices_info(TensorShape(input->info()->tensor_shape()[config.axis]), 1, DataType::U32);
+    _digit_reverse_indices.allocator()->init(digit_reverse_indices_info);
+    _memory_group.manage(&_digit_reversed_input);
+    _digit_reverse_kernel.configure(input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config);
+
+    // Create and configure FFT kernels
+    unsigned int Nx = 1;
+    _num_ffts       = decomposed_vector.size();
+    _fft_kernels.resize(_num_ffts);
+    _axis = config.axis;
+
+    for(unsigned int i = 0; i < _num_ffts; ++i)
+    {
+        const unsigned int radix_for_stage = decomposed_vector.at(i);
+
+        FFTRadixStageKernelInfo fft_kernel_info;
+        fft_kernel_info.axis           = config.axis;
+        fft_kernel_info.radix          = radix_for_stage;
+        fft_kernel_info.Nx             = Nx;
+        fft_kernel_info.is_first_stage = (i == 0);
+        _fft_kernels[i].configure(&_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info);
+
+        Nx *= radix_for_stage;
+    }
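+
+    // E.g. if N = 12 decomposes as { 4, 3 }, two stage kernels are configured: radix 4
+    // with Nx = 1 (the first stage), then radix 3 with Nx = 4. The factorisation used is
+    // whatever decompose_stages() returns for the supported radix set.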
+
+    // Configure scale kernel
+    if(_run_scale)
+    {
+        FFTScaleKernelInfo scale_config;
+        scale_config.scale     = static_cast<float>(N);
+        scale_config.conjugate = config.direction == FFTDirection::Inverse;
+        is_c2r ? _scale_kernel.configure(&_digit_reversed_input, output, scale_config) : _scale_kernel.configure(output, nullptr, scale_config);
+    }
+
+    // Allocate tensors
+    _digit_reversed_input.allocator()->allocate();
+    _digit_reverse_indices.allocator()->allocate();
+
+    // Init digit reverse indices
+    const auto digit_reverse_cpu = arm_compute::helpers::fft::digit_reverse_indices(N, decomposed_vector);
+    std::copy_n(digit_reverse_cpu.data(), N, reinterpret_cast<unsigned int *>(_digit_reverse_indices.buffer()));
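+    // For example, if N = 8 is decomposed into three radix-2 stages, the indices are the
+    // classic bit reversal { 0, 4, 2, 6, 1, 5, 3, 7 }; other factorisations (e.g. a
+    // single radix-8 stage) yield a different permutation.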
+}
+
+Status NEFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, const FFT1DInfo &config)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+
+    // Check if FFT is decomposable
+    const auto         supported_radix   = NEFFTRadixStageKernel::supported_radix();
+    const unsigned int N                 = input->tensor_shape()[config.axis];
+    const auto         decomposed_vector = arm_compute::helpers::fft::decompose_stages(N, supported_radix);
+    ARM_COMPUTE_RETURN_ERROR_ON(decomposed_vector.empty());
+
+    // Checks performed when output is configured
+    if((output != nullptr) && (output->total_size() != 0))
+    {
+        // All combinations are supported except real input with real output (i.e., both input and output with a single channel)
+        ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() == 1 && input->num_channels() == 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() > 2);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+void NEFFT1D::run()
+{
+    MemoryGroupResourceScope scope_mg(_memory_group);
+
+    NEScheduler::get().schedule(&_digit_reverse_kernel, (_axis == 0 ? Window::DimY : Window::DimZ));
+
+    for(unsigned int i = 0; i < _num_ffts; ++i)
+    {
+        NEScheduler::get().schedule(&_fft_kernels[i], (_axis == 0 ? Window::DimY : Window::DimX));
+    }
+
+    // Run output scaling
+    if(_run_scale)
+    {
+        NEScheduler::get().schedule(&_scale_kernel, Window::DimY);
+    }
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEFFT2D.cpp b/src/runtime/NEON/functions/NEFFT2D.cpp
new file mode 100644
index 0000000..9210ecf
--- /dev/null
+++ b/src/runtime/NEON/functions/NEFFT2D.cpp
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEFFT2D.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/Scheduler.h"
+
+namespace arm_compute
+{
+NEFFT2D::NEFFT2D(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(memory_manager), _first_pass_func(memory_manager), _second_pass_func(memory_manager), _first_pass_tensor()
+{
+}
+
+void NEFFT2D::configure(const ITensor *input, ITensor *output, const FFT2DInfo &config)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(NEFFT2D::validate(input->info(), output->info(), config));
+
+    // Setup first pass
+    FFT1DInfo first_pass_config;
+    first_pass_config.axis      = config.axes.first;
+    first_pass_config.direction = config.direction;
+    _memory_group.manage(&_first_pass_tensor);
+    _first_pass_func.configure(input, &_first_pass_tensor, first_pass_config);
+
+    // Setup second pass
+    FFT1DInfo second_pass_config;
+    second_pass_config.axis      = config.axes.second;
+    second_pass_config.direction = config.direction;
+    _second_pass_func.configure(&_first_pass_tensor, output, second_pass_config);
+    _first_pass_tensor.allocator()->allocate();
+}
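+
+// Note: a 2D FFT is computed as two chained 1D passes. With config.axes = { 0, 1 } the
+// first pass transforms axis 0 into the managed _first_pass_tensor and the second pass
+// transforms axis 1 of that intermediate into the user-provided output.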
+
+Status NEFFT2D::validate(const ITensorInfo *input, const ITensorInfo *output, const FFT2DInfo &config)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+    // Create intermediate tensor info
+    TensorInfo first_pass_tensor(input->clone()->set_is_resizable(true).reset_padding().set_num_channels(2));
+
+    // Validate first pass
+    FFT1DInfo first_pass_config;
+    first_pass_config.axis      = config.axes.first;
+    first_pass_config.direction = config.direction;
+    ARM_COMPUTE_RETURN_ON_ERROR(NEFFT1D::validate(input, &first_pass_tensor, first_pass_config));
+
+    // Validate second pass
+    FFT1DInfo second_pass_config;
+    second_pass_config.axis      = config.axes.second;
+    second_pass_config.direction = config.direction;
+    ARM_COMPUTE_RETURN_ON_ERROR(NEFFT1D::validate(&first_pass_tensor, output, second_pass_config));
+
+    // Checks performed when output is configured
+    if((output != nullptr) && (output->total_size() != 0))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+void NEFFT2D::run()
+{
+    MemoryGroupResourceScope scope_mg(_memory_group);
+
+    _first_pass_func.run();
+    _second_pass_func.run();
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
new file mode 100644
index 0000000..0823007
--- /dev/null
+++ b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
@@ -0,0 +1,384 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/helpers/fft.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+namespace arm_compute
+{
+namespace
+{
+int pad_decomposable(int N)
+{
+    const auto supported_radix = NEFFTRadixStageKernel::supported_radix();
+
+    int  pad           = 0;
+    bool is_decomposed = false;
+    while(!is_decomposed)
+    {
+        const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N++, supported_radix);
+        is_decomposed                = !decomposed_vector.empty();
+        if(!is_decomposed)
+        {
+            ++pad;
+        }
+    }
+    return pad;
+}
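+
+// Example: if the supported radix set is { 2, 3, 4, 5, 7, 8 }, then N = 11 cannot be
+// decomposed (11 is prime and not a supported radix) while 12 = 4 * 3 can, so
+// pad_decomposable(11) returns 1.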
+} // namespace
+
+NEFFTConvolutionLayer::NEFFTConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(memory_manager),
+      _flip_weights_func(),
+      _permute_input_func(),
+      _permute_output_func(),
+      _permute_weights_func(),
+      _permute_bias_func(),
+      _pad_input_func(),
+      _pad_weights_func(),
+      _transform_input_func(memory_manager),
+      _transform_weights_func(),
+      _itransform_output_func(memory_manager),
+      _prod_func(),
+      _reduce_func(),
+      _extract_output_func(),
+      _bias_add_func(),
+      _activation_layer_func(),
+      _permuted_input(),
+      _permuted_weights(),
+      _permuted_bias(),
+      _permuted_output(),
+      _padded_input(),
+      _padded_weights(),
+      _flip_axis(),
+      _flipped_weights(),
+      _transformed_input(),
+      _transformed_weights(),
+      _input_weights_product(),
+      _output_product(),
+      _output_reduced(),
+      _itransformed_output(),
+      _reshaped_output(),
+      _bias_output(),
+      _original_weights(nullptr),
+      _original_bias(nullptr),
+      _is_activationlayer_enabled(false),
+      _needs_permute(false),
+      _has_bias(false),
+      _is_prepared(false)
+{
+}
+
+void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
+                                      const ActivationLayerInfo &act_info)
+{
+    _original_weights = weights;
+    _original_bias    = biases;
+
+    // Flag if bias addition is required
+    _has_bias = biases != nullptr;
+
+    // Get indices for the width and height
+    const size_t idx_width  = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+
+    // Input shape, kernel size and output tile
+    const Size2D input_dims  = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]);
+    const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]);
+    const Size2D pad_valid   = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1),
+                                      pad_decomposable(input_dims.y() + kernel_size.y() - 1));
+    // Tensors to use
+    ITensor       *input_to_use   = input;
+    const ITensor *weights_to_use = weights;
+    ITensor       *output_to_use  = _has_bias ? &_bias_output : output;
+
+    // Permute bias
+    if(biases != nullptr)
+    {
+        _permute_bias_func.configure(biases, &_permuted_bias, PermutationVector(1U, 2U, 0U));
+        _permuted_bias.info()->set_data_layout(DataLayout::NCHW);
+    }
+
+    // Permute input if needed
+    _needs_permute = input->info()->data_layout() == DataLayout::NHWC;
+    if(_needs_permute)
+    {
+        _memory_group.manage(&_permuted_input);
+        // Configure the function to transform the input tensor from NHWC -> NCHW
+        _permute_input_func.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+        _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+
+        // Configure the function to transform the weights tensor from HWI -> IHW
+        _permute_weights_func.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
+        _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
+
+        input_to_use   = &_permuted_input;
+        weights_to_use = &_permuted_weights;
+    }
+
+    // Flip weights
+    _flipped_weights.allocator()->init(weights_to_use->info()->clone()->set_is_resizable(true).reset_padding());
+    _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
+    _flip_weights_func.configure(weights_to_use, &_flipped_weights, &_flip_axis);
+
+    // Pad weights
+    const PaddingList padding_w = { { 0, input_dims.x() + pad_valid.x() - 1 }, { 0, input_dims.y() + pad_valid.y() - 1 } };
+    _pad_weights_func.configure(&_flipped_weights, &_padded_weights, padding_w);
+
+    // Transform weights
+    _transform_weights_func = support::cpp14::make_unique<NEFFT2D>();
+    _transform_weights_func->configure(&_padded_weights, &_transformed_weights, FFT2DInfo());
+
+    // Pad input
+    const PaddingList padding_in = { { 0, kernel_size.x() + pad_valid.x() - 1 }, { 0, kernel_size.y() + pad_valid.y() - 1 } };
+    _memory_group.manage(&_padded_input);
+    _pad_input_func.configure(input_to_use, &_padded_input, padding_in);
+    if(_needs_permute)
+    {
+        _permuted_input.allocator()->allocate();
+    }
+
+    // Transform input
+    _memory_group.manage(&_transformed_input);
+    _transform_input_func.configure(&_padded_input, &_transformed_input, FFT2DInfo());
+    _padded_input.allocator()->allocate();
+
+    // Perform product
+    _memory_group.manage(&_output_product);
+    _prod_func.configure(&_transformed_input, &_transformed_weights, &_output_product);
+    _transformed_input.allocator()->allocate();
+
+    // Perform reduction
+    _memory_group.manage(&_output_reduced);
+    _reduce_func.configure(&_output_product, &_output_reduced, 2, ReductionOperation::SUM);
+    _output_product.allocator()->allocate();
+
+    // Transform output
+    _memory_group.manage(&_itransformed_output);
+    FFT2DInfo itransform_info;
+    itransform_info.direction = FFTDirection::Inverse;
+    _itransformed_output.allocator()->init(_output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding());
+    _itransform_output_func.configure(&_output_reduced, &_itransformed_output, itransform_info);
+    _output_reduced.allocator()->allocate();
+
+    // Reshape output
+    TensorShape reshaped_shape = _itransformed_output.info()->tensor_shape();
+    reshaped_shape.remove_dimension(2);
+    _reshaped_output.allocator()->init(_itransformed_output.info()->clone()->set_tensor_shape(reshaped_shape));
+
+    // Extract correct region
+    const int start_left = kernel_size.x() - conv_info.pad_left() - 1;
+    const int start_top  = kernel_size.y() - conv_info.pad_top() - 1;
+    const int end_right  = _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x();
+    const int end_bottom = _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y();
+    if(_has_bias)
+    {
+        _memory_group.manage(&_bias_output);
+    }
+    else if(_needs_permute)
+    {
+        output_to_use = &_permuted_output;
+        _memory_group.manage(&_permuted_output);
+    }
+    _extract_output_func.configure(&_reshaped_output, output_to_use, Coordinates(start_left, start_top), Coordinates(end_right, end_bottom));
+    _reshaped_output.allocator()->allocate();
+    _itransformed_output.allocator()->allocate();
+
+    // Add bias
+    if(biases != nullptr)
+    {
+        output_to_use = output;
+        if(_needs_permute)
+        {
+            output_to_use = &_permuted_output;
+            _memory_group.manage(&_permuted_output);
+        }
+        auto_init_if_empty(*output_to_use->info(), *_bias_output.info());
+        _bias_add_func.configure(&_bias_output, &_permuted_bias, output_to_use, ConvertPolicy::WRAP);
+        _bias_output.allocator()->allocate();
+    }
+
+    // Permute output
+    if(_needs_permute)
+    {
+        // Configure the function to transform the convoluted output from NCHW back to NHWC
+        _permuted_output.info()->set_data_layout(DataLayout::NCHW);
+        _permute_output_func.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
+
+        // Allocate tensors
+        _permuted_output.allocator()->allocate();
+    }
+
+    // Configure Activation Layer
+    _is_activationlayer_enabled = act_info.enabled();
+    if(_is_activationlayer_enabled)
+    {
+        _activation_layer_func.configure(output, nullptr, act_info);
+    }
+
+    // Setup flip axis data
+    _flip_axis.allocator()->allocate();
+
+    auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer());
+    axis_data[0]   = 0;
+    axis_data[1]   = 1;
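+    // Flipping along both axis 0 and axis 1 rotates each kernel by 180 degrees; this
+    // turns the cross-correlation a convolution layer computes into a true convolution,
+    // as required for the frequency-domain product.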
+}
+
+Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                       const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+
+    // Get indices for the width and height
+    const size_t idx_width  = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+
+    // Input shape, kernel size and output tile
+    const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]);
+
+    // Strides
+    const auto strides = conv_info.stride();
+    ARM_COMPUTE_RETURN_ERROR_ON(strides.first != strides.second && strides.first != 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(kernel_size.x() != kernel_size.y());
+    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || conv_info.pad_right() != (kernel_size.x() / 2));
+    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || conv_info.pad_bottom() != (kernel_size.y() / 2));
+
+    // Validate biases
+    if(biases != nullptr)
+    {
+        const size_t idx_channels = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+        ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channels] != biases->tensor_shape().x());
+    }
+
+    // Checks performed when output is configured
+    if((output != nullptr) && (output->total_size() != 0))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width]));
+
+        // Validate Activation Layer
+        if(act_info.enabled())
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
+        }
+    }
+
+    return Status{};
+}
+
+void NEFFTConvolutionLayer::run()
+{
+    prepare();
+
+    MemoryGroupResourceScope scope_mg(_memory_group);
+
+    // Transform input
+    if(_needs_permute)
+    {
+        _permute_input_func.run();
+    }
+    _pad_input_func.run();
+    _transform_input_func.run();
+
+    // Perform operations in the frequency domain
+    _prod_func.run();
+
+    _reduce_func.run();
+
+    // Transform output
+    _itransform_output_func.run();
+    _reshaped_output.allocator()->import_memory(_itransformed_output.buffer());
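+    // No copy here: _reshaped_output reinterprets the itransformed output buffer with
+    // the redundant channel dimension dropped.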
+    _extract_output_func.run();
+
+    // Add bias
+    if(_has_bias)
+    {
+        _bias_add_func.run();
+    }
+    if(_needs_permute)
+    {
+        _permute_output_func.run();
+    }
+
+    // Run activation layer
+    if(_is_activationlayer_enabled)
+    {
+        _activation_layer_func.run();
+    }
+}
+
+void NEFFTConvolutionLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        // Permute bias to NCHW
+        if(_original_bias != nullptr)
+        {
+            _permuted_bias.allocator()->allocate();
+            _permute_bias_func.run();
+            _original_bias->mark_as_unused();
+        }
+
+        const ITensor *cur_weights = _original_weights;
+
+        // Permute weights
+        if(_needs_permute)
+        {
+            ARM_COMPUTE_ERROR_ON(!cur_weights->is_used());
+
+            _permuted_weights.allocator()->allocate();
+            _permute_weights_func.run();
+            cur_weights->mark_as_unused();
+            cur_weights = &_permuted_weights;
+        }
+
+        // Flip weights
+        _flipped_weights.allocator()->allocate();
+        _flip_weights_func.run();
+        cur_weights->mark_as_unused();
+
+        // Pad weights
+        _padded_weights.allocator()->allocate();
+        _pad_weights_func.run();
+        _flipped_weights.mark_as_unused();
+        _flipped_weights.allocator()->free();
+
+        // Transform weights to frequency domain
+        _transformed_weights.allocator()->allocate();
+        _transform_weights_func->run();
+        _transform_weights_func.reset();
+
+        _padded_weights.mark_as_unused();
+        _padded_weights.allocator()->free();
+
+        _is_prepared = true;
+    }
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEFastCorners.cpp b/src/runtime/NEON/functions/NEFastCorners.cpp
index 4137b1d..af35301 100644
--- a/src/runtime/NEON/functions/NEFastCorners.cpp
+++ b/src/runtime/NEON/functions/NEFastCorners.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -93,7 +93,7 @@
 {
     NEScheduler::get().schedule(&_border_handler, Window::DimZ);
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     NEScheduler::get().schedule(&_fast_corners_kernel, Window::DimY);
 
@@ -103,6 +103,4 @@
     }
 
     NEScheduler::get().schedule(&_fill_kernel, Window::DimY);
-
-    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index 45e21b5..e1a17db 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -228,7 +228,8 @@
     if(_is_quantized)
     {
         float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output->info()->quantization_info().scale;
-        int   output_multiplier, output_shift;
+        int   output_multiplier;
+        int   output_shift;
         quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
         _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier, output_shift, output->info()->quantization_info().offset);
         _gemmlowp_output.allocator()->allocate();
@@ -333,7 +334,7 @@
 {
     prepare();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Linearize input if it comes from a convolutional layer
     if(_is_fc_after_conv)
@@ -363,8 +364,6 @@
             NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY);
         }
     }
-
-    _memory_group.release();
 }
 
 void NEFullyConnectedLayer::prepare()
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 914f088..55bcc45 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -238,16 +238,14 @@
 {
     prepare();
 
+    MemoryGroupResourceScope scope_mg(_memory_group);
+
     if(_asm_glue.is_configured())
     {
-        _memory_group.acquire();
         _asm_glue.run();
-        _memory_group.release();
     }
     else
     {
-        _memory_group.acquire();
-
         if(!_run_vector_matrix_multiplication)
         {
             // Run interleave kernel
@@ -262,8 +260,6 @@
 
         NEScheduler::get().schedule(&_mm_kernel, _run_vector_matrix_multiplication ? Window::DimX : Window::DimY);
 
-        _memory_group.release();
-
         // Run matrix addition kernel
         if(_run_addition)
         {
diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
index 470e922..55e067f 100644
--- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
@@ -35,7 +35,7 @@
 {
 namespace
 {
-std::unique_ptr<IFunction> create_function_all_types(arm_gemm::KernelDescription gemm_kernel_info,
+std::unique_ptr<IFunction> create_function_all_types(const arm_gemm::KernelDescription &gemm_kernel_info,
                                                      const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
                                                      std::shared_ptr<IMemoryManager> memory_manager)
 
@@ -375,7 +375,7 @@
 
 void NEGEMMAssemblyDispatch::run()
 {
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
     if(_function != nullptr)
     {
         _function->run();
@@ -385,6 +385,5 @@
         ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
         _arm_gemm->run();
     }
-    _memory_group.release();
 }
 } //namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index be7cc2d..a2c4e8a 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -90,16 +90,17 @@
 }
 
 NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager)
-    : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _activationlayer_function(),
-      _add_bias_kernel(), _reshape_layer(), _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _data_layout(DataLayout::NCHW), _append_bias(false),
-      _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false)
+    : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _col2im_kernel(), _activationlayer_function(), _add_bias_kernel(),
+      _reshape_layer(), _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _data_layout(DataLayout::NCHW), _append_bias(false), _skip_im2col(false),
+      _skip_col2im(false), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false)
 {
 }
 
-void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *weights, ITensor *output, int gemm_3d_depth)
+void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act_info, int gemm_3d_depth)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), output->info(), gemm_3d_depth, _skip_im2col));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output == nullptr ? nullptr : output->info(), act_info, gemm_3d_depth,
+                                           _skip_im2col));
 
     const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
                                          gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */);
@@ -114,7 +115,41 @@
         input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset));
         weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
 
-        _mm_gemmlowp.configure(input, weights, nullptr, output, gemm_info);
+        const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input_quantization_info : output->info()->quantization_info();
+
+        float multiplier = input_quantization_info.scale * weights->info()->quantization_info().scale / output_quant_info.scale;
+        int   output_multiplier;
+        int   output_shift;
+        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+
+        // Merge activation with output stage
+        int min_activation = 0;
+        int max_activation = 0;
+
+        const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
+                                                                                   ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+                                                                                   ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
+                                                                                 };
+        if(_is_activationlayer_enabled && supported_acts.count(act_info.activation()) != 0)
+        {
+            const int a_const_int = output_quant_info.quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
+            const int b_const_int = output_quant_info.quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
+
+            min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? output_quant_info.offset : b_const_int;
+            max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
+
+            _is_activationlayer_enabled = false;
+        }
+
+        GEMMLowpOutputStageInfo output_info;
+        output_info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+        output_info.gemmlowp_offset     = output_quant_info.offset;
+        output_info.gemmlowp_multiplier = output_multiplier;
+        output_info.gemmlowp_shift      = output_shift;
+        output_info.gemmlowp_min_bound  = min_activation;
+        output_info.gemmlowp_max_bound  = max_activation;
+
+        _mm_gemmlowp.configure(input, weights, biases, output, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info));
 
         // Restore the original QuantizationInfo, as input and weights could be used in other convolution layers
         input->info()->set_quantization_info(input_quantization_info);
@@ -127,9 +162,11 @@
     }
 }
 
-Status NEGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, int gemm_3d_depth, bool skip_im2col)
+Status NEGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ActivationLayerInfo &act_info,
+                                           int gemm_3d_depth, bool skip_im2col)
 {
-    const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+    const bool is_quantized          = is_data_type_quantized_asymmetric(input->data_type());
+    const bool is_activation_enabled = act_info.enabled();
 
     const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
                                          gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */);
@@ -145,8 +182,40 @@
         input_qa->set_quantization_info(QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset));
         weights_qa->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
 
+        const QuantizationInfo output_quant_info = (output->total_size() == 0) ? input_quantization_info : output->quantization_info();
+
+        float multiplier = input_quantization_info.scale * weights->quantization_info().scale / output_quant_info.scale;
+        int   output_multiplier;
+        int   output_shift;
+        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+
+        // Merge activation with output stage
+        int min_activation = 0;
+        int max_activation = 0;
+
+        const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
+                                                                                   ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+                                                                                   ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
+                                                                                 };
+        if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0)
+        {
+            const int a_const_int = output_quant_info.quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
+            const int b_const_int = output_quant_info.quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
+
+            min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? output_quant_info.offset : b_const_int;
+            max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
+        }
+
+        GEMMLowpOutputStageInfo output_info;
+        output_info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+        output_info.gemmlowp_offset     = output_quant_info.offset;
+        output_info.gemmlowp_multiplier = output_multiplier;
+        output_info.gemmlowp_shift      = output_shift;
+        output_info.gemmlowp_min_bound  = min_activation;
+        output_info.gemmlowp_max_bound  = max_activation;
+
         // Perform validation step on GEMMLowp
-        return NEGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), nullptr, output, gemm_info);
+        return NEGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, output, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info));
     }
     else
     {
@@ -155,19 +224,18 @@
     }
 }
 
-Status NEGEMMConvolutionLayer::validate_gemm3d(DataType data_type, int gemm_3d_depth, bool skip_im2col)
+Status NEGEMMConvolutionLayer::validate_gemm3d(const ITensorInfo *input_info, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col)
 {
-    const bool         is_quantized          = is_data_type_quantized_asymmetric(data_type);
-    const DataType     output_gemm_data_type = is_quantized ? DataType::S32 : data_type;
-    const unsigned int mult_y                = skip_im2col ? 1U : gemm_3d_depth;
-    const unsigned int mult_z                = skip_im2col ? gemm_3d_depth : 1U;
+    const DataType     data_type = input_info->data_type();
+    const unsigned int mult_y    = skip_im2col ? 1U : gemm_3d_depth;
+    const unsigned int mult_z    = skip_im2col ? gemm_3d_depth : 1U;
 
     // Set dummy tensor shapes for the validation
-    const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type);
+    const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type, input_info->quantization_info());
     const TensorInfo dummy_weights_info(TensorShape(4U, 4U), 1, data_type);
-    const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, output_gemm_data_type);
+    const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, data_type, input_info->quantization_info());
 
-    return validate_mm(&dummy_input_info, &dummy_weights_info, &dummy_output_info, gemm_3d_depth, skip_im2col);
+    return validate_mm(&dummy_input_info, &dummy_weights_info, nullptr, &dummy_output_info, act_info, gemm_3d_depth, skip_im2col);
 }
 
 void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
@@ -202,9 +270,8 @@
     _append_bias                = (biases != nullptr) && (!_is_quantized);
     _is_activationlayer_enabled = act_info.enabled();
 
-    const ITensor *gemm_input_to_use         = input;
-    ITensor       *gemm_output_to_use        = output;
-    ITensor       *gemm_output_staged_to_use = output;
+    const ITensor *gemm_input_to_use  = input;
+    ITensor       *gemm_output_to_use = output;
 
     // Get convolved dimensions
     unsigned int conv_w = 0;
@@ -219,7 +286,7 @@
     // Check if GEMM3D is supported
     if(data_layout == DataLayout::NHWC)
     {
-        _skip_col2im = bool(validate_gemm3d(input->info()->data_type(), conv_h, true));
+        _skip_col2im = bool(validate_gemm3d(input->info(), act_info, conv_h, true));
         // If not supported, we need to perform im2col and col2im (or reshape layer)
         if(!_skip_col2im)
         {
@@ -262,26 +329,17 @@
     }
 
     // Create temporary GEMM output tensor in case we cannot skip col2im
-    if(!_skip_col2im || _is_quantized)
+    if(!_skip_col2im)
     {
-        // GEMM output should be S32 for acquiring raw integer accumulator without quantized postprocessing for quantized asymmetric input.
-        const DataType gemm_data_type = _is_quantized ? DataType::S32 : data_type;
-        TensorShape    shape_gemm;
+        TensorShape shape_gemm;
 
-        if(_is_quantized && _skip_col2im)
-        {
-            shape_gemm = output->info()->tensor_shape();
-        }
-        else
-        {
-            // Calculate GEMM output shape
-            shape_gemm = _im2col_output.info()->tensor_shape();
-            shape_gemm.set(0, mat_weights_cols);
-            shape_gemm.set(1, conv_w * conv_h);
-        }
+        // Calculate GEMM output shape
+        shape_gemm = _im2col_output.info()->tensor_shape();
+        shape_gemm.set(0, mat_weights_cols);
+        shape_gemm.set(1, conv_w * conv_h);
 
         // FIXME: input->clone() doesn't work with subtensors for grouped convolutions.
-        TensorInfo info_gemm(shape_gemm, 1, gemm_data_type);
+        TensorInfo info_gemm(shape_gemm, 1, data_type);
         info_gemm.set_quantization_info(output->info()->quantization_info()).set_data_layout(input->info()->data_layout());
         _gemm_output.allocator()->init(info_gemm);
         _memory_group.manage(&_gemm_output);
@@ -293,62 +351,24 @@
     // Configure GEMM
     // In case we need to skip col2im, GEMM3D (gemm_3d_depth != 0) must be called in order to avoid reshaping the output matrix
     const unsigned int gemm_3d_depth = _skip_col2im ? conv_h : 0;
-    configure_mm(gemm_input_to_use, &_weights_reshaped, gemm_output_to_use, gemm_3d_depth);
+    configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, act_info, gemm_3d_depth);
 
     if(!_skip_im2col)
     {
         _im2col_output.allocator()->allocate();
     }
 
-    // Configure output stage for quantized case
-    if(_is_quantized)
-    {
-        const QuantizationInfo input_quant_info  = input->info()->quantization_info();
-        const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input_quant_info : output->info()->quantization_info();
-
-        float multiplier = input_quant_info.scale * weights->info()->quantization_info().scale / output_quant_info.scale;
-        int   output_multiplier, output_shift;
-        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
-
-        if(!_skip_col2im)
-        {
-            _memory_group.manage(&_tmp_output);
-            gemm_output_staged_to_use = &_tmp_output;
-        }
-
-        // Merge activation with output stage
-        int min_activation = 0;
-        int max_activation = 0;
-
-        const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
-                                                                                   ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
-                                                                                   ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
-                                                                                 };
-        if(_is_activationlayer_enabled && supported_acts.count(act_info.activation()) != 0)
-        {
-            const int a_const_int = output_quant_info.quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
-            const int b_const_int = output_quant_info.quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
-
-            min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? output_quant_info.offset : b_const_int;
-            max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
-
-            _is_activationlayer_enabled = false;
-        }
-
-        _gemmlowp_output_stage.configure(gemm_output_to_use, biases, gemm_output_staged_to_use, output_multiplier, output_shift, output_quant_info.offset, min_activation, max_activation);
-    }
-
     if(!_skip_col2im)
     {
         if(_data_layout == DataLayout::NCHW)
         {
             // Configure col2im
-            _col2im_kernel.configure(_is_quantized ? gemm_output_staged_to_use : gemm_output_to_use, output, Size2D(conv_w, conv_h));
+            _col2im_kernel.configure(gemm_output_to_use, output, Size2D(conv_w, conv_h));
         }
         else
         {
             // Configure reshape layer
-            _reshape_layer.configure(_is_quantized ? gemm_output_staged_to_use : gemm_output_to_use, output);
+            _reshape_layer.configure(gemm_output_to_use, output);
         }
     }
 
@@ -394,11 +414,13 @@
     const unsigned int kernel_width  = weights->dimension(idx_width);
     const unsigned int kernel_height = weights->dimension(idx_height);
 
-    TensorInfo         im2col_reshaped_info, info_gemm, tmp_info, weights_reshaped_info;
-    const ITensorInfo *gemm_input_to_use         = input;
-    const ITensorInfo *gemm_output_to_use        = output;
-    const ITensorInfo *gemm_output_staged_to_use = output;
-    const ITensorInfo *weights_to_use            = weights;
+    TensorInfo         im2col_reshaped_info{};
+    TensorInfo         info_gemm{};
+    TensorInfo         tmp_info{};
+    TensorInfo         weights_reshaped_info{};
+    const ITensorInfo *gemm_input_to_use  = input;
+    const ITensorInfo *gemm_output_to_use = output;
+    const ITensorInfo *weights_to_use     = weights;
 
     const bool is_quantized          = is_data_type_quantized_asymmetric(data_type);
     const bool append_bias           = (biases != nullptr) && (!is_quantized);
@@ -420,7 +442,7 @@
     bool skip_col2im = false;
     if(data_layout == DataLayout::NHWC)
     {
-        skip_col2im = bool(validate_gemm3d(input->data_type(), conv_h, true));
+        skip_col2im = bool(validate_gemm3d(input, act_info, conv_h, true));
         // If not supported, we need to perform im2col and col2im (or reshape layer)
         if(!skip_col2im)
         {
@@ -431,7 +453,7 @@
     if(skip_col2im)
     {
         // If not supported, we need to perform im2col and col2im (or reshape layer)
-        if(!bool(validate_gemm3d(input->data_type(), conv_h, skip_im2col)))
+        if(!bool(validate_gemm3d(input, act_info, conv_h, skip_im2col)))
         {
             skip_im2col = false;
             skip_col2im = false;
@@ -495,68 +517,25 @@
     }
 
     // Create temporary GEMM output tensor in case we cannot skip col2im
-    const DataType gemm_data_type = is_quantized ? DataType::S32 : data_type;
     if(!skip_col2im)
     {
         TensorShape shape_gemm = gemm_input_to_use->tensor_shape();
         shape_gemm.set(0, mat_weights_cols);
         shape_gemm.set(1, conv_w * conv_h);
-        info_gemm = TensorInfo(shape_gemm, 1, gemm_data_type);
+        info_gemm = TensorInfo(shape_gemm, 1, data_type);
     }
     else
     {
-        info_gemm = TensorInfo(output->tensor_shape(), 1, gemm_data_type);
+        info_gemm = TensorInfo(output->tensor_shape(), 1, data_type);
     }
     info_gemm.set_quantization_info(output->quantization_info()).set_data_layout(input->data_layout());
     gemm_output_to_use = &info_gemm;
-
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, gemm_output_to_use, skip_col2im ? conv_h : 0, skip_im2col));
-
-    if(is_quantized)
-    {
-        const QuantizationInfo input_quant_info  = input->quantization_info();
-        const QuantizationInfo output_quant_info = (output->total_size() == 0) ? input_quant_info : output->quantization_info();
-        const float            multiplier        = input_quant_info.scale * weights_to_use->quantization_info().scale / output_quant_info.scale;
-        int                    output_multiplier, output_shift;
-        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
-
-        if(!skip_col2im)
-        {
-            tmp_info = TensorInfo(gemm_output_to_use->tensor_shape(), 1, DataType::QASYMM8);
-            tmp_info.set_quantization_info(output->quantization_info()).set_data_layout(data_layout);
-            gemm_output_staged_to_use = &tmp_info;
-        }
-
-        // Merge activation with output stage
-        int min_activation = 0;
-        int max_activation = 0;
-
-        const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
-                                                                                   ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
-                                                                                   ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
-                                                                                 };
-
-        if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0)
-        {
-            const int a_const_int = output_quant_info.quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
-            const int b_const_int = output_quant_info.quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
-
-            min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? output_quant_info.offset : b_const_int;
-            max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
-
-            is_activation_enabled = false;
-        }
-
-        // Validate output stage for quantized case
-        NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(gemm_output_to_use, biases, gemm_output_staged_to_use, min_activation, max_activation);
-    }
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, skip_col2im ? conv_h : 0, skip_im2col));
 
     // Validate Col2Im/ReshapeLayer
     if(!skip_col2im && (data_layout == DataLayout::NCHW))
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(is_quantized ? gemm_output_staged_to_use : gemm_output_to_use,
-                                                             output,
-                                                             Size2D(conv_w, conv_h)));
+        ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(gemm_output_to_use, output, Size2D(conv_w, conv_h)));
     }
 
     // Validate Activation Layer
@@ -572,7 +551,7 @@
 {
     prepare();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     if(!_skip_im2col)
     {
@@ -586,9 +565,6 @@
     {
         // Run gemmlowp
         _mm_gemmlowp.run();
-
-        // Run output stage
-        _gemmlowp_output_stage.run();
     }
     else
     {
@@ -618,8 +594,6 @@
     {
         _activationlayer_function.run();
     }
-
-    _memory_group.release();
 }
 
 void NEGEMMConvolutionLayer::prepare()
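
The quantized path above folds bias addition, requantization, and a supported activation into the GEMMLowp output stage, which is why the separate _gemmlowp_output_stage and its staging tensor disappear. calculate_quantized_multiplier_less_than_one decomposes the real rescale factor input_scale * weights_scale / output_scale into a Q31 integer multiplier plus a right shift. A self-contained sketch of that arithmetic with a simplified rounding scheme; the function names here are illustrative, not the library's:

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <iostream>

// Decompose a real multiplier in (0, 1) as m ~= m0 * 2^(-shift), where m0 is
// a Q31 fixed-point value in [2^30, 2^31). Simplified analogue of
// calculate_quantized_multiplier_less_than_one.
void quantize_multiplier_less_than_one(double m, int32_t *m0, int *shift)
{
    assert(m > 0.0 && m < 1.0);
    const double q = std::frexp(m, shift); // m = q * 2^exp, q in [0.5, 1)
    *shift         = -*shift;              // we want a right shift
    int64_t q31    = static_cast<int64_t>(std::round(q * (1ll << 31)));
    if(q31 == (1ll << 31))                 // rounding pushed q up to exactly 1.0
    {
        q31 /= 2;
        --*shift;
    }
    *m0 = static_cast<int32_t>(q31);
}

// Fused output stage for one int32 accumulator: requantize, add the output
// zero-point, then clamp with the activation bounds that replaced a separate
// activation layer. Rounding is simplified (nearest, non-negative products).
int32_t output_stage(int32_t acc, int32_t m0, int shift, int32_t offset,
                     int32_t min_bound, int32_t max_bound)
{
    const int64_t prod = (static_cast<int64_t>(acc) * m0 + (1ll << 30)) >> 31; // Q31 multiply
    const int32_t down = static_cast<int32_t>(prod >> shift) + offset;
    return std::min(std::max(down, min_bound), max_bound);
}

int main()
{
    int32_t m0    = 0;
    int     shift = 0;
    quantize_multiplier_less_than_one(0.0072, &m0, &shift); // input_scale * weights_scale / output_scale
    std::cout << output_stage(5000, m0, shift, /*offset=*/128, /*min=*/0, /*max=*/255) << '\n'; // prints 164
}
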
diff --git a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
index 47c3358..ede89bf 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
@@ -1,4 +1,5 @@
-/* Copyright (c) 2017-2018 ARM Limited.
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -116,7 +117,7 @@
 
 void NEGEMMLowpAssemblyMatrixMultiplyCore::run()
 {
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
     if(_mtx_a_reshape_kernel)
     {
         NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
@@ -135,6 +136,4 @@
     {
         NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
     }
-
-    _memory_group.release();
 }
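
Each of these run() bodies hands kernels to NEScheduler::get().schedule(kernel, dim), which splits the kernel's execution window along the chosen dimension (e.g. Window::DimY) across worker threads. A toy sketch of that splitting, assuming a simple contiguous partition; the real scheduler is more sophisticated and this stand-in is illustrative only:

#include <algorithm>
#include <functional>
#include <iostream>
#include <thread>
#include <vector>

// Toy scheduler: split an iteration space of dim_size steps along one
// dimension across num_threads workers, in the spirit of
// NEScheduler::schedule(kernel, dim).
void schedule(const std::function<void(int, int)> &kernel_slice, int dim_size, int num_threads)
{
    std::vector<std::thread> workers;
    const int step = (dim_size + num_threads - 1) / num_threads;
    for(int t = 0; t < num_threads; ++t)
    {
        const int begin = t * step;
        const int end   = std::min(dim_size, begin + step);
        if(begin < end)
        {
            workers.emplace_back(kernel_slice, begin, end);
        }
    }
    for(auto &w : workers)
    {
        w.join(); // one schedule() call completes before the next kernel runs
    }
}

int main()
{
    schedule([](int begin, int end)
    {
        for(int y = begin; y < end; ++y)
        {
            // process row y of the kernel's window
        }
    },
    /*dim_size=*/64, /*num_threads=*/4);
    std::cout << "done\n";
}
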
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 5286f11..54f49a6 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -42,8 +42,8 @@
 
 NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(),
-      _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _original_b(nullptr), _a_offset(0), _b_offset(0), _run_vector_matrix_multiplication(false),
-      _dot_product_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
+      _offset_contribution_kernel(), _offset_contribution_output_stage_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _mm_result_s32(), _original_b(nullptr), _a_offset(0), _b_offset(0),
+      _run_vector_matrix_multiplication(false), _dot_product_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false), _fuse_output_stage(false)
 {
 }
 
@@ -53,6 +53,9 @@
     ARM_COMPUTE_UNUSED(c);
     ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));
 
+    const ITensor *matrix_a = a;
+    const ITensor *matrix_b = b;
+
     // Clear state
     _mtx_a_reshape_kernel = nullptr;
     _mtx_b_reshape_kernel = nullptr;
@@ -65,6 +68,18 @@
     _is_prepared                      = false;
     _original_b                       = b;
 
+    // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
+    if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
+    {
+        _fuse_output_stage = true;
+
+        _memory_group.manage(&_mm_result_s32);
+
+        TensorInfo info_mm_result_s32(output->info()->tensor_shape(), 1, DataType::S32);
+
+        _mm_result_s32.allocator()->init(info_mm_result_s32);
+    }
+
 #ifdef __aarch64__
     switch(a->info()->data_type())
     {
@@ -72,7 +87,7 @@
         case DataType::U8:
         case DataType::S8:
         {
-            _asm_glue.configure(a, b, output, 1.f, 0.f, _reshape_b_only_on_first_run);
+            _asm_glue.configure(a, b, _fuse_output_stage ? &_mm_result_s32 : output, 1.f, 0.f, _reshape_b_only_on_first_run);
             _dot_product_path = _asm_glue.is_configured();
             break;
         }
@@ -83,51 +98,35 @@
         }
     }
 #endif /* __aarch64__ */
-    if(!_dot_product_path)
+    if(!(_dot_product_path || _run_vector_matrix_multiplication))
     {
-        if(_run_vector_matrix_multiplication)
+        matrix_a = &_tmp_a;
+        matrix_b = &_tmp_b;
+
+        // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
+        TensorInfo a_info(compute_interleaved_shape(*a->info()), 1, a->info()->data_type(), a->info()->quantization_info());
+        // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+        TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(), b->info()->quantization_info());
+        _tmp_a.allocator()->init(a_info);
+        _tmp_b.allocator()->init(b_info);
+        _memory_group.manage(&_tmp_a);
+        if(!_reshape_b_only_on_first_run)
         {
-            // Configure matrix multiply kernel
-            {
-                auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
-                k->configure(a, b, output);
-                _mm_kernel = std::move(k);
-            }
+            _memory_group.manage(&_tmp_b);
         }
-        else
+
+        // Configure interleave kernel
         {
-            // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
-            TensorInfo info_a = a->info()->clone()->set_tensor_shape(compute_interleaved_shape(*a->info())).set_is_resizable(true);
-            // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
-            TensorInfo info_b = b->info()->clone()->set_tensor_shape(compute_transpose1xW_shape(*b->info())).set_is_resizable(true);
-            _tmp_a.allocator()->init(info_a);
-            _tmp_b.allocator()->init(info_b);
-            _memory_group.manage(&_tmp_a);
-            if(!_reshape_b_only_on_first_run)
-            {
-                _memory_group.manage(&_tmp_b);
-            }
+            auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
+            k->configure(a, &_tmp_a);
+            _mtx_a_reshape_kernel = std::move(k);
+        }
 
-            // Configure interleave kernel
-            {
-                auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
-                k->configure(a, &_tmp_a);
-                _mtx_a_reshape_kernel = std::move(k);
-            }
-
-            // Configure transpose kernel
-            {
-                auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
-                k->configure(b, &_tmp_b);
-                _mtx_b_reshape_kernel = std::move(k);
-            }
-
-            // Configure matrix multiply kernel
-            {
-                auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
-                k->configure(&_tmp_a, &_tmp_b, output);
-                _mm_kernel = std::move(k);
-            }
+        // Configure transpose kernel
+        {
+            auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
+            k->configure(b, &_tmp_b);
+            _mtx_b_reshape_kernel = std::move(k);
         }
     }
 
@@ -158,8 +157,33 @@
         _mtx_a_reduction_kernel.configure(a, &_vector_sum_row, a->info()->dimension(0), false);
     }
 
-    // Configure offset contribution kernel
-    _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset);
+    if(_fuse_output_stage)
+    {
+        // Configure matrix multiply kernel
+        if(!_dot_product_path)
+        {
+            auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+            k->configure(matrix_a, matrix_b, &_mm_result_s32);
+            _mm_kernel = std::move(k);
+        }
+
+        _offset_contribution_output_stage_kernel.configure(&_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, a->info()->dimension(0),
+                                                           _a_offset, _b_offset, gemm_info.gemmlowp_output_stage());
+
+        _mm_result_s32.allocator()->allocate();
+    }
+    else
+    {
+        // Configure matrix multiply kernel
+        if(!_dot_product_path)
+        {
+            auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+            k->configure(matrix_a, matrix_b, output);
+            _mm_kernel = std::move(k);
+        }
+        // Configure offset contribution kernel
+        _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset);
+    }
 
     // Allocate tensors
     if(!_dot_product_path && !_run_vector_matrix_multiplication)
@@ -185,43 +209,53 @@
 Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
                                     "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
 
+    const ITensorInfo *matrix_a_info = a;
+    const ITensorInfo *matrix_b_info = b;
+
+    TensorInfo tmp_a_info{};
+    TensorInfo tmp_b_info{};
+    TensorInfo mm_result_s32_info{};
+
     int32_t    a_offset                    = a->quantization_info().offset;
     int32_t    b_offset                    = b->quantization_info().offset;
     const bool reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
 
+    bool fuse_output_stage = gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
+    if(fuse_output_stage)
+    {
+        auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
+    }
+
     // Check if we need to run the optimized assembly kernel
-    const bool run_optimised = bool(NEGEMMAssemblyDispatch::validate(a, b, output, 1.f, 0.f, reshape_b_only_on_first_run));
+    const bool run_optimised = bool(NEGEMMAssemblyDispatch::validate(a, b, fuse_output_stage ? &mm_result_s32_info : output, 1.f, 0.f, reshape_b_only_on_first_run));
 
     if(run_optimised)
     {
-        if(output->total_size() != 0)
+        ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
+        if(gemm_info.depth_output_gemm3d() != 0)
         {
-            ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
-            if(gemm_info.depth_output_gemm3d() != 0)
+            if(gemm_info.reinterpret_input_as_3d())
             {
-                if(gemm_info.reinterpret_input_as_3d())
-                {
-                    ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
-                    ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
-                }
-                else
-                {
-                    ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
-                }
+                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
             }
             else
             {
-                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
             }
         }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+        }
     }
     else
     {
@@ -231,6 +265,9 @@
         const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
         if(!run_vector_matrix_multiplication)
         {
+            matrix_a_info = &tmp_a_info;
+            matrix_b_info = &tmp_b_info;
+
             // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
             TensorShape shape_tmp_a = a->tensor_shape();
             shape_tmp_a.set(0, a->dimension(0) * 4);
@@ -241,20 +278,17 @@
             shape_tmp_b.set(0, b->dimension(1) * 16);
             shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
 
-            TensorInfo info_a = a->clone()->set_tensor_shape(shape_tmp_a).set_is_resizable(true);
-            TensorInfo info_b = b->clone()->set_tensor_shape(shape_tmp_b).set_is_resizable(true);
+            // Validate interleave kernel
+            auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(shape_tmp_a));
+            auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));
 
-            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
-            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
-            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
-        }
-        else
-        {
-            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(a, b, output));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &tmp_a_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info));
         }
     }
 
-    TensorInfo info_vector_sum_col, info_vector_sum_row;
+    TensorInfo info_vector_sum_col{};
+    TensorInfo info_vector_sum_row{};
 
     // Validate matrix B reduction kernel only if _a_offset is not equal to 0
     if(a_offset != 0)
@@ -274,12 +308,32 @@
         ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, a->dimension(0), false));
     }
 
-    // Validate offset contribution kernel
-    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(output,
-                                                                             a_offset == 0 ? nullptr : &info_vector_sum_col,
-                                                                             b_offset == 0 ? nullptr : &info_vector_sum_row,
-                                                                             a_offset, b_offset));
+    if(fuse_output_stage)
+    {
+        if(!run_optimised)
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info));
+        }
 
+        // Validate offset contribution kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
+                                                                                            a_offset == 0 ? nullptr : &info_vector_sum_col,
+                                                                                            b_offset == 0 ? nullptr : &info_vector_sum_row,
+                                                                                            c, output, a_offset, b_offset,
+                                                                                            gemm_info.gemmlowp_output_stage()));
+    }
+    else
+    {
+        if(!run_optimised)
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
+        }
+        // Validate offset contribution kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(output,
+                                                                                 a_offset == 0 ? nullptr : &info_vector_sum_col,
+                                                                                 b_offset == 0 ? nullptr : &info_vector_sum_row,
+                                                                                 a_offset, b_offset));
+    }
     return Status{};
 }
 
@@ -287,7 +341,7 @@
 {
     prepare();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Reshape inputs
     if(_mtx_a_reshape_kernel)
@@ -321,10 +375,16 @@
         NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
     }
 
-    // Run offset contribution kernel
-    NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
-
-    _memory_group.release();
+    if(_fuse_output_stage)
+    {
+        // Run offset contribution kernel
+        NEScheduler::get().schedule(&_offset_contribution_output_stage_kernel, Window::DimY);
+    }
+    else
+    {
+        // Run offset contribution kernel
+        NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
+    }
 }
 
 void NEGEMMLowpMatrixMultiplyCore::prepare()
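
The _vector_sum_col / _vector_sum_row reductions exist because, with asymmetric quantization, the product of offset-corrected values expands into the raw integer GEMM plus terms that depend only on row and column sums; the offset-contribution kernel (now optionally fused with the output stage) adds those terms afterwards. A scalar sketch of the identity in plain C++, with hypothetical names:

#include <cstdint>
#include <iostream>
#include <vector>

// For one output element: sum_k (a[k] - a_off) * (b[k] - b_off)
//   = sum_k a[k]*b[k] - b_off * sum_k a[k] - a_off * sum_k b[k] + K * a_off * b_off
// The matrix-multiply kernel produces the first term; the offset-contribution
// kernel adds the remaining three using the row/column reductions.
int32_t offset_contribution(const std::vector<uint8_t> &a_row,
                            const std::vector<uint8_t> &b_col,
                            int32_t a_off, int32_t b_off)
{
    int32_t       raw = 0, sum_a = 0, sum_b = 0;
    const int32_t k = static_cast<int32_t>(a_row.size());
    for(int32_t i = 0; i < k; ++i)
    {
        raw   += a_row[i] * b_col[i]; // what the matrix-multiply kernel computes
        sum_a += a_row[i];            // vector_sum_row entry
        sum_b += b_col[i];            // vector_sum_col entry
    }
    return raw - b_off * sum_a - a_off * sum_b + k * a_off * b_off;
}

int main()
{
    // Matches the direct computation sum (a - a_off) * (b - b_off).
    std::cout << offset_contribution({10, 20, 30}, {3, 2, 1}, 5, 2) << '\n'; // prints -20
}
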
diff --git a/src/runtime/NEON/functions/NEGaussian5x5.cpp b/src/runtime/NEON/functions/NEGaussian5x5.cpp
index b010ca0..3c7411e 100644
--- a/src/runtime/NEON/functions/NEGaussian5x5.cpp
+++ b/src/runtime/NEON/functions/NEGaussian5x5.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -59,10 +59,8 @@
 {
     NEScheduler::get().schedule(&_border_handler, Window::DimZ);
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     NEScheduler::get().schedule(&_kernel_hor, Window::DimY);
     NEScheduler::get().schedule(&_kernel_vert, Window::DimY);
-
-    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEGaussianPyramid.cpp b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
index 8a85bba..0dbcb12 100644
--- a/src/runtime/NEON/functions/NEGaussianPyramid.cpp
+++ b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -74,11 +74,6 @@
 
     if(num_levels > 1)
     {
-        _horizontal_border_handler = arm_compute::support::cpp14::make_unique<NEFillBorderKernel[]>(num_levels - 1);
-        _vertical_border_handler   = arm_compute::support::cpp14::make_unique<NEFillBorderKernel[]>(num_levels - 1);
-        _horizontal_reduction      = arm_compute::support::cpp14::make_unique<NEGaussianPyramidHorKernel[]>(num_levels - 1);
-        _vertical_reduction        = arm_compute::support::cpp14::make_unique<NEGaussianPyramidVertKernel[]>(num_levels - 1);
-
         // Apply half scale to the X dimension of the tensor shape
         TensorShape tensor_shape = pyramid->info()->tensor_shape();
         tensor_shape.set(0, (pyramid->info()->width() + 1) * SCALE_PYRAMID_HALF);
@@ -86,19 +81,33 @@
         PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_HALF, tensor_shape, Format::S16);
         _tmp.init(pyramid_info);
 
+        _horizontal_reduction.reserve(num_levels);
+        _vertical_reduction.reserve(num_levels);
+        _horizontal_border_handler.reserve(num_levels);
+        _vertical_border_handler.reserve(num_levels);
+
         for(unsigned int i = 0; i < num_levels - 1; ++i)
         {
             /* Configure horizontal kernel */
-            _horizontal_reduction[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i));
+            auto horizontal_kernel = support::cpp14::make_unique<NEGaussianPyramidHorKernel>();
+            horizontal_kernel->configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i));
 
             /* Configure vertical kernel */
-            _vertical_reduction[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1));
+            auto vertical_kernel = support::cpp14::make_unique<NEGaussianPyramidVertKernel>();
+            vertical_kernel->configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1));
 
             /* Configure border */
-            _horizontal_border_handler[i].configure(_pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value));
+            auto horizontal_border_kernel = support::cpp14::make_unique<NEFillBorderKernel>();
+            horizontal_border_kernel->configure(_pyramid->get_pyramid_level(i), horizontal_kernel->border_size(), border_mode, PixelValue(constant_border_value));
 
             /* Configure border */
-            _vertical_border_handler[i].configure(_tmp.get_pyramid_level(i), _vertical_reduction[i].border_size(), border_mode, PixelValue(pixel_value_u16));
+            auto vertical_border_kernel = support::cpp14::make_unique<NEFillBorderKernel>();
+            vertical_border_kernel->configure(_tmp.get_pyramid_level(i), vertical_kernel->border_size(), border_mode, PixelValue(pixel_value_u16));
+
+            _vertical_border_handler.emplace_back(std::move(vertical_border_kernel));
+            _horizontal_border_handler.emplace_back(std::move(horizontal_border_kernel));
+            _vertical_reduction.emplace_back(std::move(vertical_kernel));
+            _horizontal_reduction.emplace_back(std::move(horizontal_kernel));
         }
 
         _tmp.allocate();
@@ -117,10 +126,10 @@
 
     for(unsigned int i = 0; i < num_levels - 1; ++i)
     {
-        NEScheduler::get().schedule(_horizontal_border_handler.get() + i, Window::DimZ);
-        NEScheduler::get().schedule(_horizontal_reduction.get() + i, Window::DimY);
-        NEScheduler::get().schedule(_vertical_border_handler.get() + i, Window::DimZ);
-        NEScheduler::get().schedule(_vertical_reduction.get() + i, Window::DimY);
+        NEScheduler::get().schedule(_horizontal_border_handler[i].get(), Window::DimZ);
+        NEScheduler::get().schedule(_horizontal_reduction[i].get(), Window::DimY);
+        NEScheduler::get().schedule(_vertical_border_handler[i].get(), Window::DimZ);
+        NEScheduler::get().schedule(_vertical_reduction[i].get(), Window::DimY);
     }
 }
 
@@ -147,19 +156,20 @@
 
     if(num_levels > 1)
     {
-        _gaus5x5       = arm_compute::support::cpp14::make_unique<NEGaussian5x5[]>(num_levels - 1);
-        _scale_nearest = arm_compute::support::cpp14::make_unique<NEScale[]>(num_levels - 1);
-
         PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_ORB, pyramid->info()->tensor_shape(), Format::U8);
         _tmp.init(pyramid_info);
 
         for(unsigned int i = 0; i < num_levels - 1; ++i)
         {
             /* Configure gaussian 5x5 */
-            _gaus5x5[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value);
+            auto gaus5x5_kernel = support::cpp14::make_unique<NEGaussian5x5>();
+            gaus5x5_kernel->configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value);
+            _gaus5x5.emplace_back(std::move(gaus5x5_kernel));
 
             /* Configure scale */
-            _scale_nearest[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED);
+            auto scale_kernel = support::cpp14::make_unique<NEScale>();
+            scale_kernel->configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED);
+            _scale_nearest.emplace_back(std::move(scale_kernel));
         }
 
         _tmp.allocate();
@@ -178,7 +188,7 @@
 
     for(unsigned int i = 0; i < num_levels - 1; ++i)
     {
-        _gaus5x5[i].run();
-        _scale_nearest[i].run();
+        _gaus5x5[i]->run();
+        _scale_nearest[i]->run();
     }
 }
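
The pyramid and HOG hunks in this region migrate from make_unique<Kernel[]> arrays to std::vector<std::unique_ptr<Kernel>>, which allows per-element construction and keeps each kernel's address stable once emplaced. A minimal sketch of the reserve/emplace_back pattern, with a stand-in Kernel type (illustrative, not the library's):

#include <iostream>
#include <memory>
#include <utility>
#include <vector>

struct Kernel // stand-in for e.g. NEGaussianPyramidHorKernel
{
    void configure(int level) { _level = level; }
    void run() const { std::cout << "level " << _level << '\n'; }
    int _level{ 0 };
};

int main()
{
    const unsigned int num_levels = 4;

    std::vector<std::unique_ptr<Kernel>> kernels;
    kernels.reserve(num_levels - 1); // one kernel per reduction step

    for(unsigned int i = 0; i < num_levels - 1; ++i)
    {
        auto k = std::make_unique<Kernel>();
        k->configure(i);                    // configure before ownership moves
        kernels.emplace_back(std::move(k));
    }

    for(auto &k : kernels)                  // range-for replaces pointer arithmetic
    {
        k->run();
    }
}
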
diff --git a/src/runtime/NEON/functions/NEHOGDescriptor.cpp b/src/runtime/NEON/functions/NEHOGDescriptor.cpp
index 5e98269..8efc091 100644
--- a/src/runtime/NEON/functions/NEHOGDescriptor.cpp
+++ b/src/runtime/NEON/functions/NEHOGDescriptor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -95,7 +95,7 @@
 
 void NEHOGDescriptor::run()
 {
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Run gradient
     _gradient.run();
@@ -105,6 +105,4 @@
 
     // Run block normalization kernel
     NEScheduler::get().schedule(&_block_norm, Window::DimY);
-
-    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEHOGGradient.cpp b/src/runtime/NEON/functions/NEHOGGradient.cpp
index efc8690..90785fe 100644
--- a/src/runtime/NEON/functions/NEHOGGradient.cpp
+++ b/src/runtime/NEON/functions/NEHOGGradient.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -80,13 +80,11 @@
 
 void NEHOGGradient::run()
 {
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Run derivative
     _derivative.run();
 
     // Run magnitude/phase kernel
     NEScheduler::get().schedule(_mag_phase.get(), Window::DimY);
-
-    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
index 8c834e2..26abc9d 100644
--- a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
+++ b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -126,12 +126,12 @@
     _num_block_norm_kernel  = input_block_norm.size(); // Number of NEHOGBlockNormalizationKernel kernels to compute
     _num_hog_detect_kernel  = input_hog_detect.size(); // Number of NEHOGDetector functions to compute
 
-    _orient_bin_kernel = arm_compute::support::cpp14::make_unique<NEHOGOrientationBinningKernel[]>(_num_orient_bin_kernel);
-    _block_norm_kernel = arm_compute::support::cpp14::make_unique<NEHOGBlockNormalizationKernel[]>(_num_block_norm_kernel);
-    _hog_detect_kernel = arm_compute::support::cpp14::make_unique<NEHOGDetector[]>(_num_hog_detect_kernel);
+    _orient_bin_kernel.reserve(_num_orient_bin_kernel);
+    _block_norm_kernel.reserve(_num_block_norm_kernel);
+    _hog_detect_kernel.reserve(_num_hog_detect_kernel);
+    _hog_space.reserve(_num_orient_bin_kernel);
+    _hog_norm_space.reserve(_num_block_norm_kernel);
     _non_maxima_kernel = arm_compute::support::cpp14::make_unique<CPPDetectionWindowNonMaximaSuppressionKernel>();
-    _hog_space         = arm_compute::support::cpp14::make_unique<Tensor[]>(_num_orient_bin_kernel);
-    _hog_norm_space    = arm_compute::support::cpp14::make_unique<Tensor[]>(_num_block_norm_kernel);
 
     // Allocate tensors for magnitude and phase
     TensorInfo info_mag(shape_img, Format::S16);
@@ -167,13 +167,17 @@
 
         // Allocate HOG space
         TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
-        _hog_space[i].allocator()->init(info_space);
+        auto       hog_space_tensor = support::cpp14::make_unique<Tensor>();
+        hog_space_tensor->allocator()->init(info_space);
 
         // Manage intermediate buffers
-        _memory_group.manage(_hog_space.get() + i);
+        _memory_group.manage(hog_space_tensor.get());
 
         // Initialise orientation binning kernel
-        _orient_bin_kernel[i].configure(&_mag, &_phase, _hog_space.get() + i, multi_hog->model(idx_multi_hog)->info());
+        auto orient_bin_kernel = support::cpp14::make_unique<NEHOGOrientationBinningKernel>();
+        orient_bin_kernel->configure(&_mag, &_phase, hog_space_tensor.get(), multi_hog->model(idx_multi_hog)->info());
+        _orient_bin_kernel.emplace_back(std::move(orient_bin_kernel));
+        _hog_space.emplace_back(std::move(hog_space_tensor));
     }
 
     // Allocate intermediate tensors
@@ -188,19 +192,23 @@
 
         // Allocate normalized HOG space
         TensorInfo tensor_info(*(multi_hog->model(idx_multi_hog)->info()), width, height);
-        _hog_norm_space[i].allocator()->init(tensor_info);
+        auto       hog_norm_space_tensor = support::cpp14::make_unique<Tensor>();
+        hog_norm_space_tensor->allocator()->init(tensor_info);
 
         // Manage intermediate buffers
-        _memory_group.manage(_hog_norm_space.get() + i);
+        _memory_group.manage(hog_norm_space_tensor.get());
 
         // Initialize block normalization kernel
-        _block_norm_kernel[i].configure(_hog_space.get() + idx_orient_bin, _hog_norm_space.get() + i, multi_hog->model(idx_multi_hog)->info());
+        auto block_norm_kernel = support::cpp14::make_unique<NEHOGBlockNormalizationKernel>();
+        block_norm_kernel->configure(_hog_space[idx_orient_bin].get(), hog_norm_space_tensor.get(), multi_hog->model(idx_multi_hog)->info());
+        _block_norm_kernel.emplace_back(std::move(block_norm_kernel));
+        _hog_norm_space.emplace_back(std::move(hog_norm_space_tensor));
     }
 
     // Allocate intermediate tensors
     for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
     {
-        _hog_space[i].allocator()->allocate();
+        _hog_space[i]->allocator()->allocate();
     }
 
     // Configure HOG detector kernel
@@ -208,7 +216,9 @@
     {
         const size_t idx_block_norm = input_hog_detect[i];
 
-        _hog_detect_kernel[i].configure(_hog_norm_space.get() + idx_block_norm, multi_hog->model(i), detection_windows, detection_window_strides->at(i), threshold, i);
+        auto hog_detect_kernel = support::cpp14::make_unique<NEHOGDetector>();
+        hog_detect_kernel->configure(_hog_norm_space[idx_block_norm].get(), multi_hog->model(i), detection_windows, detection_window_strides->at(i), threshold, i);
+        _hog_detect_kernel.emplace_back(std::move(hog_detect_kernel));
     }
 
     // Configure non maxima suppression kernel
@@ -217,7 +227,7 @@
     // Allocate intermediate tensors
     for(size_t i = 0; i < _num_block_norm_kernel; ++i)
     {
-        _hog_norm_space[i].allocator()->allocate();
+        _hog_norm_space[i]->allocator()->allocate();
     }
 }
 
@@ -225,7 +235,7 @@
 {
     ARM_COMPUTE_ERROR_ON_MSG(_detection_windows == nullptr, "Unconfigured function");
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Reset detection window
     _detection_windows->clear();
@@ -234,21 +244,21 @@
     _gradient_kernel.run();
 
     // Run orientation binning kernel
-    for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+    for(auto &kernel : _orient_bin_kernel)
     {
-        NEScheduler::get().schedule(_orient_bin_kernel.get() + i, Window::DimY);
+        NEScheduler::get().schedule(kernel.get(), Window::DimY);
     }
 
     // Run block normalization kernel
-    for(size_t i = 0; i < _num_block_norm_kernel; ++i)
+    for(auto &kernel : _block_norm_kernel)
     {
-        NEScheduler::get().schedule(_block_norm_kernel.get() + i, Window::DimY);
+        NEScheduler::get().schedule(kernel.get(), Window::DimY);
     }
 
     // Run HOG detector kernel
-    for(size_t i = 0; i < _num_hog_detect_kernel; ++i)
+    for(auto &kernel : _hog_detect_kernel)
     {
-        _hog_detect_kernel[i].run();
+        kernel->run();
     }
 
     // Run non-maxima suppression kernel if enabled
@@ -256,6 +266,4 @@
     {
         NEScheduler::get().schedule(_non_maxima_kernel.get(), Window::DimY);
     }
-
-    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEHarrisCorners.cpp b/src/runtime/NEON/functions/NEHarrisCorners.cpp
index db5e926..3eadbee 100644
--- a/src/runtime/NEON/functions/NEHarrisCorners.cpp
+++ b/src/runtime/NEON/functions/NEHarrisCorners.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -90,7 +90,7 @@
     _score.allocator()->init(tensor_info_score);
     _nonmax.allocator()->init(tensor_info_score);
 
-    _corners_list = arm_compute::support::cpp14::make_unique<InternalKeypoint[]>(shape.x() * shape.y());
+    _corners_list.resize(shape.x() * shape.y());
 
     // Set/init Sobel kernel accordingly with gradient_size
     switch(gradient_size)
@@ -171,20 +171,20 @@
     _score.allocator()->allocate();
 
     // Init corner candidates kernel
-    _candidates.configure(&_nonmax, _corners_list.get(), &_num_corner_candidates);
+    _candidates.configure(&_nonmax, _corners_list.data(), &_num_corner_candidates);
 
     // Allocate once all the configure methods have been called
     _nonmax.allocator()->allocate();
 
     // Init euclidean distance
-    _sort_euclidean.configure(_corners_list.get(), corners, &_num_corner_candidates, min_dist);
+    _sort_euclidean.configure(_corners_list.data(), corners, &_num_corner_candidates, min_dist);
 }
 
 void NEHarrisCorners::run()
 {
     ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Init to 0 number of corner candidates
     _num_corner_candidates = 0;
@@ -207,6 +207,4 @@
 
     // Run sort & euclidean distance
     NEScheduler::get().schedule(&_sort_euclidean, Window::DimY);
-
-    _memory_group.release();
 }
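
NEHarrisCorners (and NEHistogram below) swap make_unique<T[]> buffers for std::vector<T>, passing vector.data() where the kernels expect a raw pointer. The vector owns the storage, so there is no manual delete[] and the size travels with the buffer. A short sketch of that interop, assuming a C-style kernel API (the configure function here is hypothetical):

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Stand-in for a kernel configure() that keeps a raw pointer into
// caller-owned storage, as the corner-list and histogram kernels do.
void configure_kernel(uint32_t *buffer, size_t size)
{
    for(size_t i = 0; i < size; ++i)
    {
        buffer[i] = static_cast<uint32_t>(i);
    }
}

int main()
{
    std::vector<uint32_t> local_hist;
    local_hist.resize(256); // replaces make_unique<uint32_t[]>(256)
    configure_kernel(local_hist.data(), local_hist.size());

    std::cout << local_hist[255] << '\n'; // prints 255
}
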
diff --git a/src/runtime/NEON/functions/NEHistogram.cpp b/src/runtime/NEON/functions/NEHistogram.cpp
index f333ecb..d56bd7c 100644
--- a/src/runtime/NEON/functions/NEHistogram.cpp
+++ b/src/runtime/NEON/functions/NEHistogram.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,7 +34,7 @@
 using namespace arm_compute;
 
 NEHistogram::NEHistogram()
-    : _histogram_kernel(), _local_hist(), _window_lut(arm_compute::support::cpp14::make_unique<uint32_t[]>(window_lut_default_size)), _local_hist_size(0)
+    : _histogram_kernel(), _local_hist(), _window_lut(window_lut_default_size), _local_hist_size(0)
 {
 }
 
@@ -45,10 +45,10 @@
 
     // Allocate space for threads local histograms
     _local_hist_size = output->num_bins() * NEScheduler::get().num_threads();
-    _local_hist      = arm_compute::support::cpp14::make_unique<uint32_t[]>(_local_hist_size);
+    _local_hist.resize(_local_hist_size);
 
     // Configure kernel
-    _histogram_kernel.configure(input, output, _local_hist.get(), _window_lut.get());
+    _histogram_kernel.configure(input, output, _local_hist.data(), _window_lut.data());
 }
 
 void NEHistogram::run()
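
The _local_hist buffer is sized num_bins * num_threads so that each scheduler thread can accumulate into its own slice without synchronization before the slices are merged. A sketch of that accumulation pattern (serial stand-in for the threaded loop; names are illustrative, not the kernel's actual code):

#include <cstdint>
#include <vector>

// Illustrative only: one histogram slice per thread, merged at the end.
std::vector<uint32_t> histogram(const std::vector<uint8_t> &data, size_t num_bins, size_t num_threads)
{
    std::vector<uint32_t> local_hist(num_bins * num_threads, 0);

    // Each thread t would process its chunk and bump only
    // local_hist[t * num_bins + bin], so no atomics are needed.
    for(size_t t = 0; t < num_threads; ++t)
    {
        const size_t begin = t * data.size() / num_threads;
        const size_t end   = (t + 1) * data.size() / num_threads;
        for(size_t i = begin; i < end; ++i)
        {
            ++local_hist[t * num_bins + data[i] % num_bins];
        }
    }

    // Merge the per-thread slices into the final histogram.
    std::vector<uint32_t> hist(num_bins, 0);
    for(size_t t = 0; t < num_threads; ++t)
    {
        for(size_t b = 0; b < num_bins; ++b)
        {
            hist[b] += local_hist[t * num_bins + b];
        }
    }
    return hist;
}
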
diff --git a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
index 56da966..c9ab5c9 100644
--- a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
+++ b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -68,11 +68,9 @@
 
 void NEL2NormalizeLayer::run()
 {
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     _reduce_func.run();
     NEScheduler::get().schedule(&_normalize_kernel, Window::DimY);
-
-    _memory_group.release();
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NELSTMLayer.cpp b/src/runtime/NEON/functions/NELSTMLayer.cpp
index 9e7a713..3d3c6a1 100644
--- a/src/runtime/NEON/functions/NELSTMLayer.cpp
+++ b/src/runtime/NEON/functions/NELSTMLayer.cpp
@@ -43,10 +43,10 @@
       _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _gemm_cell_state2(), _transpose_cell_state(), _accum_cell_state1(), _accum_cell_state2(),
       _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(), _gemm_output(), _pixelwise_mul_output_state1(), _transpose_output(),
       _accum_output1(), _accum_output2(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), _fully_connected_output_state(), _gemm_output_state(), _accum_output_state(),
-      _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _input_gate_out5(),
-      _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(),
-      _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _output5(), _cell_state_activation(), _output_state1(), _ones(), _run_peephole_opt(false), _run_cifg_opt(false),
-      _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false)
+      _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(), _concat_weights_input_gate(), _concat_weights_output(),
+      _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(),
+      _forget_gate_out6(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _cell_state_activation(),
+      _output_state1(), _ones(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false), _is_prepared(false)
 {
 }
 
@@ -96,22 +96,32 @@
 
     // Configure block that calculates the forget gate
     // forget_gate = Activation(input * input_to_forget_weights + output_state_in * recurrent_to_forget_weights + PixelWiseMul(cell_state, cell_to_forget_weights) + forget_gate_bias)
-    TensorShape forget_gate1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+    // We optimize this as follows:
+    // forget_gate = Activation( (input,output_state_in) * (input_to_forget_weights,recurrent_to_forget_weights) + PixelWiseMul(cell_state, cell_to_forget_weights) + forget_gate_bias)
     _forget_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
-    _forget_gate_out2.allocator()->init(TensorInfo(forget_gate1_shape, 1, input->info()->data_type()));
     _forget_gate_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
     _forget_gate_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
 
-    _memory_group.manage(&_forget_gate_out1);
-    _fully_connected_forget_gate.configure(input, input_to_forget_weights, forget_gate_bias, &_forget_gate_out1);
+    std::vector<const ITensor *> inputs_vector;
+    inputs_vector.emplace_back(input);
+    inputs_vector.emplace_back(output_state_in);
+
     _memory_group.manage(&_forget_gate_out2);
-    _transpose_forget_gate.configure(recurrent_to_forget_weights, &_forget_gate_out2);
-    _memory_group.manage(&_forget_gate_out3);
-    _gemm_forget_gate.configure(output_state_in, &_forget_gate_out2, nullptr, &_forget_gate_out3, 1.f, 0.f);
-    _forget_gate_out2.allocator()->allocate();
+    _concat_inputs_forget_gate.configure(inputs_vector, &_forget_gate_out2);
+
+    std::vector<const ITensor *> weights_vector;
+
+    weights_vector.emplace_back(input_to_forget_weights);
+    weights_vector.emplace_back(recurrent_to_forget_weights);
+
+    _concat_weights_forget_gate.configure(weights_vector, &_forget_gate_out6);
+
     _memory_group.manage(&_forget_gate_out5);
-    _accum_forget_gate1.configure(&_forget_gate_out1, &_forget_gate_out3, &_forget_gate_out5, ConvertPolicy::SATURATE);
-    _forget_gate_out1.allocator()->allocate();
+    _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6, forget_gate_bias, &_forget_gate_out5);
+    _memory_group.manage(&_forget_gate_out1);
+    _memory_group.manage(&_forget_gate_out3);
+    _forget_gate_out6.allocator()->allocate();
+
     Tensor *forget_gate_out = &_forget_gate_out5;
     if(lstm_params.has_peephole_opt())
     {
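
The optimization noted in the comments above applies the block-matrix identity (x,h) * (Wx,Wh)^T = x*Wx^T + h*Wh^T: concatenating the two inputs along width and the two weight matrices along the matching dimension lets a single fully connected call replace the previous fully connected + transpose + GEMM + accumulate chain. A numeric sketch of the identity with plain loops (the units-by-inputs weight layout here is an illustrative assumption, not arm_compute's actual layout):

#include <cassert>
#include <vector>

// out[u] = sum_j in[j] * W[u][j]  (weights stored as units x inputs, as a
// fully connected layer would transpose them internally).
std::vector<float> fc(const std::vector<float> &in, const std::vector<std::vector<float>> &W)
{
    std::vector<float> out(W.size(), 0.f);
    for(size_t u = 0; u < W.size(); ++u)
        for(size_t j = 0; j < in.size(); ++j)
            out[u] += in[j] * W[u][j];
    return out;
}

int main()
{
    const std::vector<float> x = { 1.f, 2.f }; // "input"
    const std::vector<float> h = { 3.f };      // "output_state_in"
    const std::vector<std::vector<float>> Wx = { { 0.5f, -1.f }, { 2.f, 0.f } };
    const std::vector<std::vector<float>> Wh = { { 4.f }, { -0.5f } };

    // Unfused: two matrix products plus an accumulation.
    std::vector<float>       y1 = fc(x, Wx);
    const std::vector<float> yh = fc(h, Wh);
    for(size_t u = 0; u < y1.size(); ++u) y1[u] += yh[u];

    // Fused: concatenate inputs and weights, then a single product.
    std::vector<float> xh = x;
    xh.insert(xh.end(), h.begin(), h.end());
    std::vector<std::vector<float>> Wxh = Wx;
    for(size_t u = 0; u < Wxh.size(); ++u) Wxh[u].insert(Wxh[u].end(), Wh[u].begin(), Wh[u].end());

    const std::vector<float> y2 = fc(xh, Wxh);
    for(size_t u = 0; u < y1.size(); ++u) assert(y1[u] == y2[u]);
    return 0;
}
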
@@ -134,6 +144,8 @@
     // Configure block that calculates the input gate
     // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
     // input_gate = 1 - forget_gate, with CIFG
+    // We optimize this as follows:
+    // input_gate = Activation((input,output_state) * (input_to_input_weights,recurrent_to_input_weights) + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
     _input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
     Tensor *input_gate_out = &_input_gate_out1;
     if(lstm_params.has_cifg_opt())
@@ -146,31 +158,29 @@
     }
     else
     {
-        TensorShape input_gate_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
-
-        _input_gate_out2.allocator()->init(TensorInfo(input_gate_shape, 1, input->info()->data_type()));
         _input_gate_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
         _input_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
-        _input_gate_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+        std::vector<const ITensor *> lstm_weights;
+        lstm_weights.emplace_back(lstm_params.input_to_input_weights());
+        lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
+
+        _concat_weights_input_gate.configure(lstm_weights, &_input_gate_out2);
 
         _memory_group.manage(&_input_gate_out1);
-        _fully_connected_input_gate.configure(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), &_input_gate_out1);
-        _memory_group.manage(&_input_gate_out2);
-        _transpose_input_gate.configure(lstm_params.recurrent_to_input_weights(), &_input_gate_out2);
-        _memory_group.manage(&_input_gate_out3);
-        _gemm_input_gate.configure(output_state_in, &_input_gate_out2, nullptr, &_input_gate_out3, 1.f, 0.f);
-        _input_gate_out2.allocator()->allocate();
         _memory_group.manage(&_input_gate_out4);
-        _accum_input_gate1.configure(&_input_gate_out1, &_input_gate_out3, &_input_gate_out4, ConvertPolicy::SATURATE);
-        _input_gate_out3.allocator()->allocate();
-        input_gate_out = &_input_gate_out4;
+
+        _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2, lstm_params.input_gate_bias(), &_input_gate_out3);
+        _input_gate_out2.allocator()->allocate();
+        input_gate_out = &_input_gate_out3;
+
         if(_run_peephole_opt)
         {
-            _memory_group.manage(&_input_gate_out5);
-            _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
-            _accum_input_gate2.configure(&_input_gate_out4, &_input_gate_out5, &_input_gate_out1, ConvertPolicy::SATURATE);
+            _memory_group.manage(&_input_gate_out4);
+            _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+            _accum_input_gate2.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE);
+            _input_gate_out3.allocator()->allocate();
             _input_gate_out4.allocator()->allocate();
-            _input_gate_out5.allocator()->allocate();
             input_gate_out = &_input_gate_out1;
         }
         else
@@ -215,35 +225,37 @@
 
     // Configure block that calculates the output
     // output_state_out = Activation(input * input_to_output_weights + output_state_in * recurrent_to_output_weights + PixelWiseMul(cell_state, cell_to_output_weights) + output_gate_bias)
-    TensorShape output1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+    // We optimize this as follows:
+    // output_state_out = Activation( (input,output_state_in) * (input_to_output_weights, recurrent_to_output_weights) + PixelWiseMul(cell_state, cell_to_output_weights) + output_gate_bias)
     _output1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
-    _output2.allocator()->init(TensorInfo(output1_shape, 1, input->info()->data_type()));
-    _output3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
-    _output5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    _output4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
 
+    std::vector<const ITensor *> in_out_weights;
+    in_out_weights.emplace_back(input_to_output_weights);
+    in_out_weights.emplace_back(recurrent_to_output_weights);
+
+    _concat_weights_output.configure(in_out_weights, &_output2);
     _memory_group.manage(&_output1);
-    _fully_connected_output.configure(input, input_to_output_weights, output_gate_bias, &_output1);
-    _memory_group.manage(&_output2);
-    _transpose_output.configure(recurrent_to_output_weights, &_output2);
-    _memory_group.manage(&_output3);
-    _gemm_output.configure(output_state_in, &_output2, nullptr, &_output3, 1.f, 0.f);
+    _memory_group.manage(&_output4);
+
+    _fully_connected_output.configure(&_forget_gate_out2, &_output2, output_gate_bias, &_output4);
+
     _output2.allocator()->allocate();
-    _memory_group.manage(&_output5);
-    _accum_output1.configure(&_output1, &_output3, &_output5, ConvertPolicy::SATURATE);
-    _output3.allocator()->allocate();
-    Tensor *output_gate_out = &_output5;
+    _forget_gate_out2.allocator()->allocate();
+
+    Tensor *output_gate_out = &_output4;
     if(lstm_params.has_peephole_opt())
     {
-        _output4.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type()));
+        _output3.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type()));
 
-        _memory_group.manage(&_output4);
-        _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
-        _accum_output2.configure(&_output5, &_output4, &_output1, ConvertPolicy::SATURATE);
-        _output5.allocator()->allocate();
+        _memory_group.manage(&_output3);
+        _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+        _accum_output2.configure(&_output4, &_output3, &_output1, ConvertPolicy::SATURATE);
+        _output4.allocator()->allocate();
         output_gate_out = &_output1;
 
         // Allocate intermediate buffers
-        _output4.allocator()->allocate();
+        _output3.allocator()->allocate();
     }
     else
     {
@@ -368,10 +380,15 @@
     TensorInfo output_gate_tmp = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type());
     TensorInfo cell_state_tmp  = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type());
 
+    std::vector<const ITensorInfo *> inputs_vector;
+    inputs_vector.emplace_back(input);
+    inputs_vector.emplace_back(output_state_in);
+    TensorInfo forget_gate_concat;
+    ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayer::validate(inputs_vector, &forget_gate_concat));
+
     // Validate forget gate
     ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_forget_weights, forget_gate_bias, &forget_gate));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &forget_gate, 1.f, 0.f, GEMMInfo()));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+
     if(lstm_params.has_peephole_opt())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
@@ -389,9 +406,13 @@
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() > 2);
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() > 1);
 
+        std::vector<const ITensorInfo *> lstm_weights;
+        lstm_weights.emplace_back(lstm_params.input_to_input_weights());
+        lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
+        TensorInfo lstm_gate_concat;
+        ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayer::validate(lstm_weights, &lstm_gate_concat));
         ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), &input_gate));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &input_gate, 1.f, 0.f, GEMMInfo()));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
+
         if(lstm_params.has_peephole_opt())
         {
             ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
@@ -421,9 +442,14 @@
     }
 
     // Validate output gate tmp
+    std::vector<const ITensorInfo *> in_out_weights;
+    in_out_weights.emplace_back(input_to_output_weights);
+    in_out_weights.emplace_back(recurrent_to_output_weights);
+    TensorInfo in_out_gate_concat;
+    ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayer::validate(in_out_weights, &in_out_gate_concat));
+
     ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_output_weights, output_gate_bias, &output_gate_tmp));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &output_gate_tmp, 1.f, 0.f, GEMMInfo()));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE));
+
     if(lstm_params.has_peephole_opt())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
@@ -465,12 +491,12 @@
 
 void NELSTMLayer::run()
 {
-    _memory_group.acquire();
+    prepare();
 
+    MemoryGroupResourceScope scope_mg(_memory_group);
+
+    _concat_inputs_forget_gate.run();
     _fully_connected_forget_gate.run();
-    NEScheduler::get().schedule(&_transpose_forget_gate, Window::DimY);
-    _gemm_forget_gate.run();
-    NEScheduler::get().schedule(&_accum_forget_gate1, Window::DimY);
 
     if(_run_peephole_opt)
     {
@@ -494,9 +520,7 @@
     else
     {
         _fully_connected_input_gate.run();
-        NEScheduler::get().schedule(&_transpose_input_gate, Window::DimY);
-        _gemm_input_gate.run();
-        NEScheduler::get().schedule(&_accum_input_gate1, Window::DimY);
+
         if(_run_peephole_opt)
         {
             NEScheduler::get().schedule(&_pixelwise_mul_input_gate, Window::DimY);
@@ -520,10 +544,6 @@
     }
 
     _fully_connected_output.run();
-    NEScheduler::get().schedule(&_transpose_output, Window::DimY);
-    _gemm_output.run();
-    NEScheduler::get().schedule(&_accum_output1, Window::DimY);
-
     if(_run_peephole_opt)
     {
         NEScheduler::get().schedule(&_pixelwise_mul_output_state1, Window::DimY);
@@ -547,6 +567,18 @@
     NEScheduler::get().schedule(&_copy_output, Window::DimY);
 
     _concat_scratch_buffer.run();
+}
 
-    _memory_group.release();
-}
\ No newline at end of file
+void NELSTMLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        _concat_weights_forget_gate.run();
+        if(!_run_cifg_opt)
+        {
+            _concat_weights_input_gate.run();
+        }
+        _concat_weights_output.run();
+        _is_prepared = true;
+    }
+}
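
NELSTMLayer now follows the library's prepare() convention: work that depends only on constant tensors, here the weight concatenations, runs once behind the _is_prepared flag, and run() triggers it lazily on first use (CIFG skips the input-gate weights, as above). A stripped-down sketch of the idiom (hypothetical function, not the real class):

#include <iostream>

class ExampleFunction
{
public:
    void run()
    {
        prepare(); // cheap no-op after the first call
        std::cout << "run per-invocation kernels\n";
    }

    void prepare()
    {
        if(!_is_prepared)
        {
            // One-time work on constant inputs, e.g. concatenating
            // or reshaping weight tensors.
            std::cout << "concatenate weights once\n";
            _is_prepared = true;
        }
    }

private:
    bool _is_prepared{ false };
};

int main()
{
    ExampleFunction f;
    f.run(); // prepares, then runs
    f.run(); // runs only
    return 0;
}
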
diff --git a/src/runtime/NEON/functions/NELaplacianPyramid.cpp b/src/runtime/NEON/functions/NELaplacianPyramid.cpp
index 0e149d4..5174a13 100644
--- a/src/runtime/NEON/functions/NELaplacianPyramid.cpp
+++ b/src/runtime/NEON/functions/NELaplacianPyramid.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -92,8 +92,8 @@
     // Create Gaussian Pyramid function
     _gaussian_pyr_function.configure(input, &_gauss_pyr, border_mode, constant_border_value);
 
-    _convf = arm_compute::support::cpp14::make_unique<NEGaussian5x5[]>(_num_levels);
-    _subf  = arm_compute::support::cpp14::make_unique<NEArithmeticSubtraction[]>(_num_levels);
+    _convf.resize(_num_levels);
+    _subf.resize(_num_levels);
 
     for(unsigned int i = 0; i < _num_levels; ++i)
     {
diff --git a/src/runtime/NEON/functions/NELaplacianReconstruct.cpp b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
index 9ad9689..b2d889b 100644
--- a/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
+++ b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -64,8 +64,8 @@
     _tmp_pyr.init(pyramid_info);
 
     // Allocate add and scale functions. Level 0 does not need to be scaled.
-    _addf   = arm_compute::support::cpp14::make_unique<NEArithmeticAddition[]>(num_levels);
-    _scalef = arm_compute::support::cpp14::make_unique<NEScale[]>(num_levels - 1);
+    _addf.resize(num_levels);
+    _scalef.resize(num_levels - 1);
 
     const size_t last_level = num_levels - 1;
 
@@ -86,7 +86,7 @@
 
 void NELaplacianReconstruct::run()
 {
-    ARM_COMPUTE_ERROR_ON_MSG(_addf == nullptr, "Unconfigured function");
+    ARM_COMPUTE_ERROR_ON_MSG(_addf.empty(), "Unconfigured function");
 
     const size_t last_level = _tmp_pyr.info()->num_levels() - 1;
 
diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
index 80a2541..d08202d 100644
--- a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -168,7 +168,7 @@
 {
     prepare();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Run input reshaping
     NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY);
@@ -178,8 +178,6 @@
 
     // Reshape output matrix
     NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY);
-
-    _memory_group.release();
 }
 
 void NELocallyConnectedLayer::prepare()
diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp
index f00114f..d52e928 100644
--- a/src/runtime/NEON/functions/NENormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -69,11 +69,9 @@
 
 void NENormalizationLayer::run()
 {
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     NEScheduler::get().schedule(&_multiply_kernel, Window::DimY);
     NEScheduler::get().schedule(&_border_handler, Window::DimY);
     NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
-
-    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEOpticalFlow.cpp b/src/runtime/NEON/functions/NEOpticalFlow.cpp
index e90d8f6..0df01c6 100644
--- a/src/runtime/NEON/functions/NEOpticalFlow.cpp
+++ b/src/runtime/NEON/functions/NEOpticalFlow.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -74,10 +74,10 @@
 
     const float pyr_scale = old_pyramid->info()->scale();
 
-    _func_scharr    = arm_compute::support::cpp14::make_unique<NEScharr3x3[]>(_num_levels);
-    _kernel_tracker = arm_compute::support::cpp14::make_unique<NELKTrackerKernel[]>(_num_levels);
-    _scharr_gx      = arm_compute::support::cpp14::make_unique<Tensor[]>(_num_levels);
-    _scharr_gy      = arm_compute::support::cpp14::make_unique<Tensor[]>(_num_levels);
+    _func_scharr.reserve(_num_levels);
+    _kernel_tracker.reserve(_num_levels);
+    _scharr_gx.reserve(_num_levels);
+    _scharr_gy.reserve(_num_levels);
 
     _old_points_internal = LKInternalKeypointArray(old_points->num_values());
     _new_points_internal = LKInternalKeypointArray(old_points->num_values());
@@ -95,25 +95,34 @@
 
         TensorInfo tensor_info(TensorShape(width_ith, height_ith), Format::S16);
 
-        _scharr_gx[i].allocator()->init(tensor_info);
-        _scharr_gy[i].allocator()->init(tensor_info);
+        auto scharr_gx = support::cpp14::make_unique<Tensor>();
+        auto scharr_gy = support::cpp14::make_unique<Tensor>();
+        scharr_gx->allocator()->init(tensor_info);
+        scharr_gy->allocator()->init(tensor_info);
 
         // Manage intermediate buffers
-        _memory_group.manage(_scharr_gx.get() + i);
-        _memory_group.manage(_scharr_gy.get() + i);
+        _memory_group.manage(scharr_gx.get());
+        _memory_group.manage(scharr_gy.get());
 
         // Init Scharr kernel
-        _func_scharr[i].configure(old_ith_input, _scharr_gx.get() + i, _scharr_gy.get() + i, border_mode, constant_border_value);
+        auto func_scharr = support::cpp14::make_unique<NEScharr3x3>();
+        func_scharr->configure(old_ith_input, scharr_gx.get(), scharr_gy.get(), border_mode, constant_border_value);
 
         // Init Lucas-Kanade kernel
-        _kernel_tracker[i].configure(old_ith_input, new_ith_input, _scharr_gx.get() + i, _scharr_gy.get() + i,
-                                     old_points, new_points_estimates, new_points,
-                                     &_old_points_internal, &_new_points_internal,
-                                     termination, use_initial_estimate, epsilon, num_iterations, window_dimension,
-                                     i, _num_levels, pyr_scale);
+        auto kernel_tracker = support::cpp14::make_unique<NELKTrackerKernel>();
+        kernel_tracker->configure(old_ith_input, new_ith_input, scharr_gx.get(), scharr_gy.get(),
+                                  old_points, new_points_estimates, new_points,
+                                  &_old_points_internal, &_new_points_internal,
+                                  termination, use_initial_estimate, epsilon, num_iterations, window_dimension,
+                                  i, _num_levels, pyr_scale);
 
-        _scharr_gx[i].allocator()->allocate();
-        _scharr_gy[i].allocator()->allocate();
+        scharr_gx->allocator()->allocate();
+        scharr_gy->allocator()->allocate();
+
+        _func_scharr.emplace_back(std::move(func_scharr));
+        _kernel_tracker.emplace_back(std::move(kernel_tracker));
+        _scharr_gx.emplace_back(std::move(scharr_gx));
+        _scharr_gy.emplace_back(std::move(scharr_gy));
     }
 }
 
@@ -121,16 +130,14 @@
 {
     ARM_COMPUTE_ERROR_ON_MSG(_num_levels == 0, "Unconfigured function");
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     for(unsigned int level = _num_levels; level > 0; --level)
     {
         // Run Scharr kernel
-        _func_scharr[level - 1].run();
+        _func_scharr[level - 1]->run();
 
         // Run Lucas-Kanade kernel
-        NEScheduler::get().schedule(_kernel_tracker.get() + level - 1, Window::DimX);
+        NEScheduler::get().schedule(_kernel_tracker[level - 1].get(), Window::DimX);
     }
-
-    _memory_group.release();
 }
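
The optical-flow containers become vectors of std::unique_ptr rather than vectors of plain objects: _memory_group.manage() and the kernel configuration capture raw pointers to these tensors, and individually heap-allocated elements keep those addresses stable even if the vector's buffer later moves. A minimal illustration of the difference (toy Tensor type, not the real one):

#include <memory>
#include <vector>

struct Tensor { int id; };

int main()
{
    // By value: push_back may reallocate and move elements, invalidating
    // any previously stored element pointers.
    std::vector<Tensor> by_value;
    by_value.push_back({ 0 });
    Tensor *unstable = &by_value[0];
    by_value.push_back({ 1 }); // may invalidate 'unstable'
    (void)unstable;            // must not be dereferenced after this point

    // By unique_ptr: the vector's buffer may move, but each Tensor stays
    // where it was allocated, so raw pointers handed out earlier stay valid.
    std::vector<std::unique_ptr<Tensor>> by_ptr;
    by_ptr.emplace_back(std::make_unique<Tensor>(Tensor{ 0 }));
    Tensor *stable = by_ptr[0].get();
    by_ptr.emplace_back(std::make_unique<Tensor>(Tensor{ 1 }));
    return stable->id; // still safe to use
}
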
diff --git a/src/runtime/NEON/functions/NEPadLayer.cpp b/src/runtime/NEON/functions/NEPadLayer.cpp
index f5c2718..c608edf 100644
--- a/src/runtime/NEON/functions/NEPadLayer.cpp
+++ b/src/runtime/NEON/functions/NEPadLayer.cpp
@@ -25,7 +25,6 @@
 
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
-#include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
@@ -61,18 +60,28 @@
 
     return coords;
 }
+
+uint32_t last_padding_dimension(const PaddingList &padding)
+{
+    int last_padding_dim = padding.size() - 1;
+    for(; last_padding_dim >= 0; --last_padding_dim)
+    {
+        if(padding[last_padding_dim].first > 0 || padding[last_padding_dim].second > 0)
+        {
+            break;
+        }
+    }
+    return static_cast<uint32_t>(last_padding_dim);
+}
 } // namespace
 
 NEPadLayer::NEPadLayer()
-    : _memset_kernel(), _copy_kernel(), _output_subtensor()
+    : _copy_kernel(), _mode(), _padding(), _memset_kernel(), _num_dimensions(0), _slice_functions(), _concat_functions(), _slice_results(), _concat_results(), _output_subtensor()
 {
 }
 
-void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &padding, PixelValue constant_value)
+void NEPadLayer::configure_constant_mode(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_THROW_ON_ERROR(NEPadLayer::validate(input->info(), output->info(), padding, constant_value));
-
     // Auto-init
     auto_init_if_empty(*output->info(), get_expected_output_tensorinfo(*input->info(), padding));
 
@@ -86,23 +95,235 @@
     _copy_kernel.configure(input, &_output_subtensor);
 }
 
-Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value)
+void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *output)
+{
+    // Reflecting can be performed by effectively unfolding the input as follows:
+    // For each dimension starting at DimX:
+    //      For before and after:
+    //          Use strided slice to extract and reverse the part of the
+    //          input / previously produced tensor required for the padding.
+    //      Concatenate the before and after padding with the input / previously
+    //      produced tensor along the current dimension.
+
+    // Each padded dimension requires two strided slice functions, a concatenate function,
+    // and tensors to hold the temporary results.
+    _slice_functions.resize(2 * _num_dimensions);
+    _slice_results.resize(2 * _num_dimensions);
+    _concat_functions.resize(_num_dimensions);
+    _concat_results.resize(_num_dimensions - 1);
+
+    Coordinates starts_before{};
+    Coordinates ends_before{};
+    Coordinates starts_after{};
+    Coordinates ends_after{};
+    Coordinates strides{};
+    ITensor    *prev = input;
+    for(uint32_t i = 0; i < _num_dimensions; ++i)
+    {
+        // Values in strides from the previous dimensions need to be set to 1 to avoid reversing again.
+        if(i > 0)
+        {
+            strides.set(i - 1, 1);
+        }
+
+        if(_padding[i].first > 0 || _padding[i].second > 0)
+        {
+            // Set the starts, ends, and strides values for the current dimension.
+            // Due to the bit masks passed to strided slice, the values below the current dimension in
+            // starts and ends will be ignored, so they do not need to be modified.
+            if(_mode == PaddingMode::REFLECT)
+            {
+                starts_before.set(i, _padding[i].first);
+                ends_before.set(i, 0);
+                starts_after.set(i, input->info()->dimension(i) - 2);
+                ends_after.set(i, input->info()->dimension(i) - _padding[i].second - 2);
+                strides.set(i, -1);
+            }
+            else
+            {
+                starts_before.set(i, _padding[i].first - 1);
+                ends_before.set(i, -1);
+                starts_after.set(i, input->info()->dimension(i) - 1);
+                ends_after.set(i, input->info()->dimension(i) - _padding[i].second - 1);
+                strides.set(i, -1);
+            }
+
+            // Strided slice wraps negative indices around to the end of the range;
+            // here a negative value should select the full range instead, so the corresponding bit mask is adjusted.
+            const int32_t begin_mask_before = starts_before[i] < 0 ? ~0 : ~(1u << i);
+            const int32_t end_mask_before   = ends_before[i] < 0 ? ~0 : ~(1u << i);
+            const int32_t begin_mask_after  = starts_after[i] < 0 ? ~0 : ~(1u << i);
+            const int32_t end_mask_after    = ends_after[i] < 0 ? ~0 : ~(1u << i);
+
+            // Reflect the input values for the padding before and after the input.
+            std::vector<ITensor *> concat_vector;
+            if(_padding[i].first > 0)
+            {
+                if(i < prev->info()->num_dimensions())
+                {
+                    _slice_functions[2 * i].configure(prev, &_slice_results[2 * i], starts_before, ends_before, strides, begin_mask_before, end_mask_before);
+                    concat_vector.emplace_back(&_slice_results[2 * i]);
+                }
+                else
+                {
+                    // Performing the slice is unnecessary if the result would simply be a copy of the tensor.
+                    concat_vector.push_back(prev);
+                }
+            }
+            concat_vector.push_back(prev);
+            if(_padding[i].second > 0)
+            {
+                if(i < prev->info()->num_dimensions())
+                {
+                    _slice_functions[2 * i + 1].configure(prev, &_slice_results[2 * i + 1], starts_after, ends_after, strides, begin_mask_after, end_mask_after);
+                    concat_vector.emplace_back(&_slice_results[2 * i + 1]);
+                }
+                else
+                {
+                    // Performing the slice is unnecessary if the result would simply be a copy of the tensor.
+                    concat_vector.push_back(prev);
+                }
+            }
+            // Concatenate the padding before and after with the input.
+            ITensor *out = (i == _num_dimensions - 1) ? output : &_concat_results[i];
+            _concat_functions[i].configure(concat_vector, out, i);
+            if(i != _num_dimensions - 1)
+            {
+                _concat_results[i].allocator()->allocate();
+            }
+            prev = out;
+        }
+        _slice_results[2 * i].allocator()->allocate();
+        _slice_results[2 * i + 1].allocator()->allocate();
+    }
+}
+
+void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode)
+{
+    ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), padding, constant_value, mode));
+
+    _padding = padding;
+    _mode    = mode;
+
+    const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), _padding);
+
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(padded_shape));
+
+    // Find the last dimension requiring padding so that it is known when to write to output and whether any padding is applied.
+    _num_dimensions = last_padding_dimension(padding) + 1;
+    if(_num_dimensions > 0)
+    {
+        switch(_mode)
+        {
+            case PaddingMode::CONSTANT:
+            {
+                configure_constant_mode(input, output, padding, constant_value);
+                break;
+            }
+            case PaddingMode::REFLECT:
+            case PaddingMode::SYMMETRIC:
+            {
+                configure_reflect_symmetric_mode(input, output);
+                break;
+            }
+            default:
+                ARM_COMPUTE_ERROR("Padding mode not supported.");
+        }
+    }
+    else
+    {
+        // Copy the input to the whole output if no padding is applied
+        _copy_kernel.configure(input, output);
+    }
+}
+
+Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode)
 {
     ARM_COMPUTE_UNUSED(constant_value);
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
 
-    auto output_clone = output->clone();
+    const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding);
 
-    SubTensorInfo output_subtensor_info(output_clone.get(), input->tensor_shape(), get_subtensor_coords(padding), true);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *output_clone, padding));
-    ARM_COMPUTE_RETURN_ON_ERROR(NECopyKernel::validate(input, &output_subtensor_info));
+    if(output->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), padded_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
 
+    switch(mode)
+    {
+        case PaddingMode::CONSTANT:
+        {
+            auto          output_clone = output->clone();
+            SubTensorInfo output_subtensor_info(output_clone.get(), input->tensor_shape(), get_subtensor_coords(padding), true);
+            ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *output_clone, padding));
+            ARM_COMPUTE_RETURN_ON_ERROR(NECopyKernel::validate(input, &output_subtensor_info));
+            break;
+        }
+        case PaddingMode::REFLECT:
+        case PaddingMode::SYMMETRIC:
+        {
+            for(uint32_t i = 0; i < padding.size(); ++i)
+            {
+                if(mode == PaddingMode::REFLECT)
+                {
+                    ARM_COMPUTE_RETURN_ERROR_ON(padding[i].first >= input->dimension(i));
+                    ARM_COMPUTE_RETURN_ERROR_ON(padding[i].second >= input->dimension(i));
+                }
+                else
+                {
+                    ARM_COMPUTE_RETURN_ERROR_ON(padding[i].first > input->dimension(i));
+                    ARM_COMPUTE_RETURN_ERROR_ON(padding[i].second > input->dimension(i));
+                }
+            }
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("Invalid mode");
+        }
+    }
     return Status{};
 }
 
 void NEPadLayer::run()
 {
-    NEScheduler::get().schedule(&_memset_kernel, Window::DimY);
-    NEScheduler::get().schedule(&_copy_kernel, Window::DimY);
+    if(_num_dimensions > 0)
+    {
+        switch(_mode)
+        {
+            case PaddingMode::CONSTANT:
+            {
+                NEScheduler::get().schedule(&_memset_kernel, Window::DimY);
+                NEScheduler::get().schedule(&_copy_kernel, Window::DimY);
+                break;
+            }
+            case PaddingMode::REFLECT:
+            case PaddingMode::SYMMETRIC:
+            {
+                for(uint32_t i = 0; i < _num_dimensions; ++i)
+                {
+                    if(_padding[i].first > 0 || _padding[i].second > 0)
+                    {
+                        if(_padding[i].first > 0 && _slice_results[2 * i].info()->total_size() > 0)
+                        {
+                            _slice_functions[2 * i].run();
+                        }
+                        if(_padding[i].second > 0 && _slice_results[2 * i + 1].info()->total_size() > 0)
+                        {
+                            _slice_functions[2 * i + 1].run();
+                        }
+                        _concat_functions[i].run();
+                    }
+                }
+                break;
+            }
+            default:
+                ARM_COMPUTE_ERROR("Padding mode not supported.");
+        }
+    }
+    else
+    {
+        NEScheduler::get().schedule(&_copy_kernel, Window::DimY);
+    }
 }
 } // namespace arm_compute
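
The difference between the two non-constant modes is whether the border element itself is repeated: padding the sequence 1 2 3 4 by (2, 2) gives 3 2 | 1 2 3 4 | 3 2 under REFLECT and 2 1 | 1 2 3 4 | 4 3 under SYMMETRIC, which is also why validate() requires the padding to be strictly smaller than the dimension for REFLECT but only no larger for SYMMETRIC. A 1-D reference sketch of the behaviour the layer assembles per dimension from strided slices and concatenations:

#include <vector>

enum class PaddingMode { REFLECT, SYMMETRIC };

std::vector<int> pad_1d(const std::vector<int> &in, int before, int after, PaddingMode mode)
{
    const int n = static_cast<int>(in.size());
    // REFLECT mirrors around the edge element (it is not repeated);
    // SYMMETRIC mirrors around the edge itself (it is repeated).
    const int off = (mode == PaddingMode::REFLECT) ? 1 : 0;

    std::vector<int> out;
    out.reserve(before + n + after);
    for(int i = before - 1 + off; i >= off; --i)           // reversed head slice
        out.push_back(in[i]);
    out.insert(out.end(), in.begin(), in.end());           // the input itself
    for(int i = n - 1 - off; i > n - 1 - off - after; --i) // reversed tail slice
        out.push_back(in[i]);
    return out;
}
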
diff --git a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
index cf6b984..ef28fe9 100644
--- a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
+++ b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,8 +29,8 @@
 
 #include <utility>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 void NEPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEPixelWiseMultiplicationKernel>();
@@ -51,3 +51,27 @@
 {
     return NEPixelWiseMultiplicationKernel::validate(input1, input2, output, scale, overflow_policy, rounding_policy);
 }
+
+void NEComplexPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEComplexPixelWiseMultiplicationKernel>();
+    k->configure(input1, input2, output);
+    _kernel = std::move(k);
+
+    if(output->info()->dimension(0) > 1)
+    {
+        ITensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+        if(broadcasted_info->info()->dimension(0) == 1)
+        {
+            _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+        }
+    }
+}
+
+Status NEComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    return NEComplexPixelWiseMultiplicationKernel::validate(input1, input2, output);
+}
+
+} // namespace arm_compute
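
NEComplexPixelWiseMultiplication multiplies tensors of complex values elementwise; assuming the usual interleaved real/imaginary two-channel layout, each output element is the complex product (a+bi)(c+di) = (ac-bd) + (ad+bc)i. A reference loop over interleaved float pairs (an illustrative sketch, not the NEON kernel):

#include <cstddef>

// in1, in2, out: interleaved complex buffers { re0, im0, re1, im1, ... },
// each holding 'num' complex values.
void complex_multiply(const float *in1, const float *in2, float *out, size_t num)
{
    for(size_t i = 0; i < num; ++i)
    {
        const float a = in1[2 * i];     // real part of first operand
        const float b = in1[2 * i + 1]; // imaginary part of first operand
        const float c = in2[2 * i];
        const float d = in2[2 * i + 1];

        out[2 * i]     = a * c - b * d; // real part of the product
        out[2 * i + 1] = a * d + b * c; // imaginary part of the product
    }
}
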
diff --git a/src/runtime/NEON/functions/NEQuantizationLayer.cpp b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
index 8f7db96..65873b1 100644
--- a/src/runtime/NEON/functions/NEQuantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,22 +26,13 @@
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
 
 using namespace arm_compute;
 
-NEQuantizationLayer::NEQuantizationLayer()
-    : _quantize_kernel(), _min_max_kernel(), _min_max()
-{
-}
-
 Status NEQuantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-
-    TensorInfo min_max{ input->num_channels(), input->data_type() };
-    ARM_COMPUTE_RETURN_ON_ERROR(NEMinMaxLayerKernel::validate(input, &min_max));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayerKernel::validate(input, output, &min_max));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayerKernel::validate(input, output));
 
     return Status{};
 }
@@ -50,24 +41,8 @@
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    // Configure min-max kernel. _min_max tensor will be auto-configured within the kernel
-    _min_max_kernel.configure(input, &_min_max);
-
     // Configure quantize kernel
-    _quantize_kernel.configure(input, output, &_min_max);
-
-    // Allocate min_max tensor
-    _min_max.allocator()->allocate();
-}
-
-void NEQuantizationLayer::run()
-{
-    // Reset min and max
-    _min_max_kernel.reset();
-
-    // Run min and max kernel
-    NEScheduler::get().schedule(&_min_max_kernel, Window::DimY);
-
-    // Run quantize kernel
-    NEScheduler::get().schedule(&_quantize_kernel, Window::DimY);
+    auto k = arm_compute::support::cpp14::make_unique<NEQuantizationLayerKernel>();
+    k->configure(input, output);
+    _kernel = std::move(k);
 }
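
The per-run min/max pass and its intermediate tensor are gone: NEQuantizationLayerKernel now maps values with a fixed affine scheme instead of statistics gathered on every invocation. A sketch of the standard QASYMM8 mapping, with hypothetical scale and zero-point values:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Affine quantization to 8 bits: q = clamp(round(x / scale) + zero_point, 0, 255).
uint8_t quantize_qasymm8(float x, float scale, int32_t zero_point)
{
    const int32_t q = static_cast<int32_t>(std::lround(x / scale)) + zero_point;
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}

// Example with illustrative parameters: scale = 0.1f and zero_point = 128
// map 0.0f -> 128 and 1.0f -> 138.
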
diff --git a/src/runtime/NEON/functions/NERNNLayer.cpp b/src/runtime/NEON/functions/NERNNLayer.cpp
index 995d5ee..9ca7ded 100644
--- a/src/runtime/NEON/functions/NERNNLayer.cpp
+++ b/src/runtime/NEON/functions/NERNNLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -104,7 +104,7 @@
 {
     prepare();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     _fully_connected_kernel.run();
 
@@ -115,8 +115,6 @@
 
     // copy hidden out to output
     NEScheduler::get().schedule(&_copy_kernel, Window::DimY);
-
-    _memory_group.release();
 }
 
 void NERNNLayer::prepare()
diff --git a/src/runtime/NEON/functions/NEReduceMean.cpp b/src/runtime/NEON/functions/NEReduceMean.cpp
index 014895f..0b145f0 100644
--- a/src/runtime/NEON/functions/NEReduceMean.cpp
+++ b/src/runtime/NEON/functions/NEReduceMean.cpp
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEReduceMean.h"
 
+#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
@@ -37,6 +38,8 @@
 {
     ARM_COMPUTE_UNUSED(keep_dims);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
 
     TensorShape        out_shape     = input->tensor_shape();
@@ -78,10 +81,10 @@
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
 
-    _reduction_ops     = reduction_axis.num_dimensions();
-    _reduction_kernels = arm_compute::support::cpp14::make_unique<NEReductionOperation[]>(_reduction_ops);
-    _reduced_outs      = arm_compute::support::cpp14::make_unique<Tensor[]>(_reduction_ops - (keep_dims ? 1 : 0));
-    _keep_dims         = keep_dims;
+    _reduction_ops = reduction_axis.num_dimensions();
+    _reduction_kernels.resize(_reduction_ops);
+    _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
+    _keep_dims = keep_dims;
 
     Coordinates        axis_local    = reduction_axis;
     const int          input_dims    = input->info()->num_dimensions();
@@ -96,9 +99,9 @@
     // Perform reduction for every axis
     for(unsigned int i = 0; i < _reduction_ops; ++i)
     {
-        TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (_reduced_outs.get() + i - 1)->info()->tensor_shape();
+        TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : _reduced_outs[i - 1].info()->tensor_shape();
         out_shape.set(axis_local[i], 1);
-        auto in = (i == 0) ? input : (_reduced_outs.get() + i - 1);
+        auto in = (i == 0) ? input : (&_reduced_outs[i - 1]);
 
         if(i == _reduction_ops - 1 && keep_dims)
         {
@@ -107,8 +110,8 @@
         else
         {
             _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type(), input->info()->quantization_info()));
-            _memory_group.manage(_reduced_outs.get() + i);
-            _reduction_kernels[i].configure(in, _reduced_outs.get() + i, axis_local[i], ReductionOperation::MEAN_SUM);
+            _memory_group.manage(&_reduced_outs[i]);
+            _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM);
         }
     }
 
@@ -131,13 +134,13 @@
             out_shape.remove_dimension(axis_local[i] - i);
         }
         auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
-        _reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output);
+        _reshape.configure(&_reduced_outs[_reduction_ops - 1], output);
     }
 }
 
 void NEReduceMean::run()
 {
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     for(unsigned int i = 0; i < _reduction_ops; ++i)
     {
@@ -148,5 +151,4 @@
     {
         _reshape.run();
     }
-    _memory_group.release();
 }
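
NEReduceMean lowers a multi-axis mean onto a chain of single-axis MEAN_SUM reductions, then (when keep_dims is false) removes the reduced dimensions; each removal shifts the later axes down by one, which is what the axis_local[i] - i adjustment accounts for. A shape-only sketch of that bookkeeping, assuming the axes are in ascending order:

#include <vector>

// Remove already-reduced dimensions from a shape. With ascending axes, the
// i-th removal must be offset by i because earlier removals shifted the
// remaining dimensions down.
std::vector<int> collapse_reduced_dims(std::vector<int> shape, const std::vector<int> &axes)
{
    for(size_t i = 0; i < axes.size(); ++i)
    {
        shape.erase(shape.begin() + (axes[i] - static_cast<int>(i)));
    }
    return shape;
}

// Example: shape {2, 3, 4, 5} with axes {1, 3}: first erase index 1 (the 3),
// then erase index 3 - 1 = 2 (the 5, now shifted left), leaving {2, 4}.
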
diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp
index 9f81a40..a0aed96 100644
--- a/src/runtime/NEON/functions/NEReductionOperation.cpp
+++ b/src/runtime/NEON/functions/NEReductionOperation.cpp
@@ -66,7 +66,8 @@
 
 void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(NEReductionOperation::validate(input->info(), output->info(), axis, op));
 
     // Configure reduction kernel
     _reduction_kernel.configure(input, output, axis, op);
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index 483aa4c..425ee6c 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -97,14 +97,17 @@
       _dx(),
       _dy(),
       _scale_kernel(),
-      _border_handler()
+      _border_handler(),
+      _use_padding(true)
 {
 }
 
-void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy)
+void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, bool use_padding)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(NEScale::validate(input->info(), output->info(), policy, border_mode, constant_border_value, sampling_policy));
+    ARM_COMPUTE_ERROR_THROW_ON(NEScale::validate(input->info(), output->info(), policy, border_mode, constant_border_value, sampling_policy, use_padding));
+
+    _use_padding = use_padding;
 
     // Get data layout and width/height indices
     const DataLayout data_layout = input->info()->data_layout();
@@ -134,7 +137,7 @@
             TensorInfo tensor_info_offsets(shape, Format::S32);
             _offsets.allocator()->init(tensor_info_offsets);
 
-            _scale_kernel.configure(input, nullptr, nullptr, &_offsets, output, policy, border_mode, sampling_policy);
+            _scale_kernel.configure(input, nullptr, nullptr, &_offsets, output, policy, border_mode, constant_border_value, sampling_policy, use_padding);
 
             // Allocate once the configure methods have been called
             _offsets.allocator()->allocate();
@@ -152,7 +155,7 @@
             _dx.allocator()->init(tensor_info_dxdy);
             _dy.allocator()->init(tensor_info_dxdy);
 
-            _scale_kernel.configure(input, &_dx, &_dy, &_offsets, output, policy, border_mode, sampling_policy);
+            _scale_kernel.configure(input, &_dx, &_dy, &_offsets, output, policy, border_mode, constant_border_value, sampling_policy, use_padding);
 
             // Allocate once the configure methods have been called
             _offsets.allocator()->allocate();
@@ -165,18 +168,20 @@
         }
         case InterpolationPolicy::AREA:
         {
-            _scale_kernel.configure(input, nullptr, nullptr, nullptr, output, policy, border_mode);
+            _scale_kernel.configure(input, nullptr, nullptr, nullptr, output, policy, border_mode, constant_border_value);
             break;
         }
         default:
             ARM_COMPUTE_ERROR("Unsupported interpolation mode");
     }
-
-    _border_handler.configure(input, _scale_kernel.border_size(), border_mode, constant_border_value);
+    if(use_padding)
+    {
+        _border_handler.configure(input, _scale_kernel.border_size(), border_mode, constant_border_value);
+    }
 }
 
 Status NEScale::validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy,
-                         BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy)
+                         BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, bool use_padding)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON(sampling_policy != SamplingPolicy::CENTER && sampling_policy != SamplingPolicy::TOP_LEFT);
@@ -213,12 +218,15 @@
     }
 
     ARM_COMPUTE_RETURN_ON_ERROR(NEScaleKernel::validate(input->clone().get(), dx, dy, offsets, output->clone().get(),
-                                                        policy, border_mode, sampling_policy));
+                                                        policy, border_mode, constant_border_value, sampling_policy, use_padding));
     return Status{};
 }
 
 void NEScale::run()
 {
-    NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+    if(_use_padding)
+    {
+        NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+    }
     NEScheduler::get().schedule(&_scale_kernel, Window::DimY);
 }
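
For the bilinear path, NEScale does the coordinate arithmetic once at configure time: _offsets caches each output pixel's integer source coordinate and _dx/_dy the fractional remainders, leaving only the blend for run(). A 1-D sketch of interpolating from such precomputed tables (illustrative, not the NEON kernel):

#include <algorithm>
#include <cmath>
#include <vector>

// Precompute, for every output sample, the left source index and the
// fractional distance to it (TOP_LEFT-style sampling for simplicity).
void precompute(size_t in_len, size_t out_len, std::vector<int> &offsets, std::vector<float> &dx)
{
    const float ratio = static_cast<float>(in_len) / static_cast<float>(out_len);
    offsets.resize(out_len);
    dx.resize(out_len);
    for(size_t x = 0; x < out_len; ++x)
    {
        const float src = x * ratio;
        offsets[x] = static_cast<int>(std::floor(src));
        dx[x]      = src - offsets[x];
    }
}

// The per-run work is then a pure blend using the cached values.
std::vector<float> scale_bilinear_1d(const std::vector<float> &in, const std::vector<int> &offsets, const std::vector<float> &dx)
{
    std::vector<float> out(offsets.size());
    for(size_t x = 0; x < out.size(); ++x)
    {
        const int i0 = offsets[x];
        const int i1 = std::min<int>(i0 + 1, static_cast<int>(in.size()) - 1); // clamp at the border
        out[x] = in[i0] * (1.f - dx[x]) + in[i1] * dx[x];
    }
    return out;
}
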
diff --git a/src/runtime/NEON/functions/NESobel5x5.cpp b/src/runtime/NEON/functions/NESobel5x5.cpp
index d8f4eda..2ddfee5 100644
--- a/src/runtime/NEON/functions/NESobel5x5.cpp
+++ b/src/runtime/NEON/functions/NESobel5x5.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -81,10 +81,8 @@
 {
     NEScheduler::get().schedule(&_border_handler, Window::DimZ);
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     NEScheduler::get().schedule(&_sobel_hor, Window::DimY);
     NEScheduler::get().schedule(&_sobel_vert, Window::DimY);
-
-    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NESobel7x7.cpp b/src/runtime/NEON/functions/NESobel7x7.cpp
index 5b6f60b..b47a37a 100644
--- a/src/runtime/NEON/functions/NESobel7x7.cpp
+++ b/src/runtime/NEON/functions/NESobel7x7.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -81,10 +81,8 @@
 {
     NEScheduler::get().schedule(&_border_handler, Window::DimZ);
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     NEScheduler::get().schedule(&_sobel_hor, Window::DimY);
     NEScheduler::get().schedule(&_sobel_vert, Window::DimY);
-
-    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index 36b7d47..79a9496 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -180,7 +180,7 @@
 
 void NESoftmaxLayer::run()
 {
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     if(_needs_flattening)
     {
@@ -195,7 +195,5 @@
     {
         NEScheduler::get().schedule(&_reshape_kernel, Window::DimY);
     }
-
-    _memory_group.release();
 }
 } // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp
new file mode 100644
index 0000000..46c28ad
--- /dev/null
+++ b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NESpaceToBatchLayer::NESpaceToBatchLayer()
+    : _space_to_batch_kernel(), _memset_kernel(), _has_padding(false)
+{
+}
+
+void NESpaceToBatchLayer::configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
+
+    if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
+    {
+        _has_padding = true;
+        _memset_kernel.configure(output, PixelValue());
+    }
+    _space_to_batch_kernel.configure(input, block_shape, paddings, output);
+}
+
+void NESpaceToBatchLayer::configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
+    {
+        _has_padding = true;
+        _memset_kernel.configure(output, PixelValue());
+    }
+    _space_to_batch_kernel.configure(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+}
+
+Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate(input, block_shape, paddings, output));
+
+    return Status{};
+}
+
+Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+                                     const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+
+    return Status{};
+}
+
+void NESpaceToBatchLayer::run()
+{
+    // Zero out the output only if there is padding
+    if(_has_padding)
+    {
+        NEScheduler::get().schedule(&_memset_kernel, Window::DimY);
+    }
+    NEScheduler::get().schedule(&_space_to_batch_kernel, Window::DimY);
+}
+} // namespace arm_compute
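
The memset kernel is scheduled only when the output is larger than the input, since the space-to-batch kernel itself writes every element that comes from the input and only the padding region needs the zero fill. A hedged usage sketch of the static block-shape overload (shapes, NCHW layout, and allocation flow are illustrative assumptions):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // Illustrative shapes: a 4x4x1 input, 2x2 block, one element of padding
    // on each side -> padded 6x6 splits into 3x3 tiles over 4 batches, and
    // the padding region is the part the memset must zero.
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(4U, 4U, 1U, 1U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(3U, 3U, 1U, 4U), 1, DataType::F32));

    NESpaceToBatchLayer s2b;
    s2b.configure(&src, 2, 2, Size2D(1, 1), Size2D(1, 1), &dst);

    src.allocator()->allocate();
    dst.allocator()->allocate();

    s2b.run();
    return 0;
}
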
diff --git a/src/runtime/NEON/functions/NESplit.cpp b/src/runtime/NEON/functions/NESplit.cpp
index e947657..0373ab6 100644
--- a/src/runtime/NEON/functions/NESplit.cpp
+++ b/src/runtime/NEON/functions/NESplit.cpp
@@ -42,8 +42,8 @@
 void NESplit::configure(const ITensor *input, const std::vector<ITensor *> &outputs, unsigned int axis)
 {
     // Create Slice functions
-    _num_outputs     = outputs.size();
-    _slice_functions = arm_compute::support::cpp14::make_unique<NESlice[]>(_num_outputs);
+    _num_outputs = outputs.size();
+    _slice_functions.resize(_num_outputs);
 
     // Get output shape
     const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_split_shape(input->info(), axis, _num_outputs);
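
This hunk, and the NEStackLayer, NEUnstack, NEWidthConcatenateLayer and Pyramid hunks below, all make the same change: a `unique_ptr<T[]>` buffer paired with a separate count becomes a `std::vector` that owns both. A minimal sketch of the pattern, with hypothetical names:

#include <cstddef>
#include <vector>

struct SliceRunner // hypothetical stand-in for a function holding per-output kernels
{
    std::vector<int> slice_functions; // int stands in for NESlice; size() replaces a count member

    void configure(std::size_t num_outputs)
    {
        // resize() value-initializes each element, matching make_unique<T[]>(n), and the
        // elements stay addressable as &slice_functions[i] as long as no further resize happens.
        slice_functions.resize(num_outputs);
    }
};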
diff --git a/src/runtime/NEON/functions/NEStackLayer.cpp b/src/runtime/NEON/functions/NEStackLayer.cpp
index 2f49c22..32350b0 100644
--- a/src/runtime/NEON/functions/NEStackLayer.cpp
+++ b/src/runtime/NEON/functions/NEStackLayer.cpp
@@ -43,8 +43,8 @@
 
 void NEStackLayer::configure(const std::vector<ITensor *> &input, int axis, ITensor *output)
 {
-    _num_inputs    = input.size();
-    _stack_kernels = arm_compute::support::cpp14::make_unique<NEStackLayerKernel[]>(_num_inputs);
+    _num_inputs = input.size();
+    _stack_kernels.resize(_num_inputs);
 
     // Wrap around negative values
     const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1));
diff --git a/src/runtime/NEON/functions/NEUnstack.cpp b/src/runtime/NEON/functions/NEUnstack.cpp
index 7532020..21f35f8 100644
--- a/src/runtime/NEON/functions/NEUnstack.cpp
+++ b/src/runtime/NEON/functions/NEUnstack.cpp
@@ -74,7 +74,7 @@
     // Wrap around negative values
     const unsigned int axis_u = wrap_axis(axis, input->info());
     _num_slices               = std::min(outputs_vector_info.size(), input->info()->dimension(axis_u));
-    _strided_slice_vector     = arm_compute::support::cpp14::make_unique<NEStridedSlice[]>(_num_slices);
+    _strided_slice_vector.resize(_num_slices);
 
     Coordinates slice_start;
     int32_t     slice_end_mask;
diff --git a/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp b/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp
index 7e435c3..25b5216 100644
--- a/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp
+++ b/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp
@@ -40,14 +40,15 @@
 {
 }
 
-Status NEWidthConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output)
+template <typename TensorInfoType, typename>
+inline Status NEWidthConcatenateLayer::validate_internal(const std::vector<TensorInfoType *> &inputs_vector, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
     ARM_COMPUTE_RETURN_ERROR_ON(inputs_vector.size() < 2);
 
     // Output auto initialization if not yet initialized
     TensorInfo  tmp_output_info = *output->clone();
-    TensorShape output_shape    = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+    TensorShape output_shape    = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimX);
     auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
 
     unsigned int width_offset = 0;
@@ -60,8 +61,8 @@
 
     return Status{};
 }
-
-void NEWidthConcatenateLayer::configure(std::vector<ITensor *> inputs_vector, ITensor *output)
+template <typename TensorType, typename>
+inline void NEWidthConcatenateLayer::configure_internal(std::vector<TensorType *> &&inputs_vector, ITensor *output)
 {
     _num_inputs = inputs_vector.size();
 
@@ -70,7 +71,7 @@
     {
         inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
     }
-    TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+    TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimX);
 
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
@@ -78,7 +79,7 @@
 
     unsigned int width_offset = 0;
 
-    _concat_kernels_vector = arm_compute::support::cpp14::make_unique<NEWidthConcatenateLayerKernel[]>(_num_inputs);
+    _concat_kernels_vector.resize(_num_inputs);
 
     for(unsigned int i = 0; i < _num_inputs; ++i)
     {
@@ -87,10 +88,30 @@
     }
 }
 
+void NEWidthConcatenateLayer::configure(std::vector<ITensor *> inputs_vector, ITensor *output)
+{
+    configure_internal(std::move(inputs_vector), output);
+}
+
+void NEWidthConcatenateLayer::configure(std::vector<const ITensor *> inputs_vector, ITensor *output)
+{
+    configure_internal(std::move(inputs_vector), output);
+}
+
+Status NEWidthConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output)
+{
+    return validate_internal(inputs_vector, output);
+}
+
+Status NEWidthConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output)
+{
+    return validate_internal(inputs_vector, output);
+}
+
 void NEWidthConcatenateLayer::run()
 {
     for(unsigned i = 0; i < _num_inputs; ++i)
     {
-        NEScheduler::get().schedule(_concat_kernels_vector.get() + i, Window::DimY);
+        NEScheduler::get().schedule(&_concat_kernels_vector[i], Window::DimY);
     }
 }
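
The `configure`/`validate` pairs above forward both the `ITensor *` and the `const ITensor *` vector overloads to a single template; the unnamed second template parameter is presumably an SFINAE constraint declared in the header. A reduced sketch of the idiom, with hypothetical names:

#include <type_traits>
#include <utility>
#include <vector>

struct Tensor {}; // stand-in for ITensor

template <typename TensorType,
          typename = typename std::enable_if<
              std::is_same<typename std::remove_const<TensorType>::type, Tensor>::value>::type>
void configure_internal(std::vector<TensorType *> &&tensors)
{
    // ... one copy of the configuration logic serves both overloads ...
    static_cast<void>(tensors);
}

void configure(std::vector<Tensor *> tensors)       { configure_internal(std::move(tensors)); }
void configure(std::vector<const Tensor *> tensors) { configure_internal(std::move(tensors)); }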
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
index e37f8ab..1513786 100644
--- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,7 +33,7 @@
 #include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
 #include "support/ToolchainSupport.h"
 
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd.hpp"
 
 namespace arm_compute
 {
@@ -162,7 +162,7 @@
     const int        in_channels = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
     const int        in_batches  = input->info()->dimension(3);
 
-    return Tensor4DShape({ in_batches, in_height, in_width, in_channels });
+    return Tensor4DShape{ in_batches, in_height, in_width, in_channels };
 }
 
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
@@ -234,12 +234,12 @@
 
 } // namespace
 
-NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager)
     : _memory_group(memory_manager), _gemm_function(memory_manager), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr), _activationlayer_function(),
-      _permute_input(), _permute_weights(), _permute_output(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(), _weights_hwio(), _input(), _weights(), _output(),
-      _is_prepared(false), _is_activationlayer_enabled(false)
+      _permute_input(), _permute_weights(), _permute_output(), _input_transformed(), _output_transformed(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(),
+      _weights_hwio(), _input(), _weights(), _output(), _is_prepared(false), _is_activationlayer_enabled(false)
 {
-} /* arm_compute */
+}
 
 void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info,
                                            bool enable_fast_math)
@@ -380,20 +380,17 @@
     // Kernel Storage
     const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels,
                                                                                          in_channels)
-                                       * data_type_size
-                                       + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */
+                                       * data_type_size;
 
     // Input storage
     const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols,
                                                                                      use_same_padding)
-                                      * data_type_size
-                                      + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */
+                                      * data_type_size;
 
     // Output storage
     const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels,
                                                                                         use_same_padding)
-                                       * data_type_size
-                                       + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */
+                                       * data_type_size;
     const KernelShape kernel_shape({ out_channels, static_cast<int>(kernel_size.height), static_cast<int>(kernel_size.width), in_channels });
     const int         kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(kernel_shape);
@@ -431,14 +428,16 @@
     d_strides.set(2, 0);
     d_strides.set(3, data_type_size * output_matrix_stride);
 
-    TensorInfo a_info, b_info, d_info;
+    TensorInfo a_info{};
+    TensorInfo b_info{};
+    TensorInfo d_info{};
     a_info.init(a_shape, 1, data_type, a_strides, 0, input_storage_size);
     b_info.init(b_shape, 1, data_type, b_strides, 0, kernel_storage_size);
     d_info.init(d_shape, 1, data_type, d_strides, 0, output_storage_size);
 
-    _input_workspace.allocator()->init(a_info, storage_alignment);
+    _input_transformed.allocator()->init(a_info, storage_alignment);
     _kernel_storage.allocator()->init(b_info, storage_alignment);
-    _output_workspace.allocator()->init(d_info, storage_alignment);
+    _output_transformed.allocator()->init(d_info, storage_alignment);
 
     // configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output()
     TensorInfo info(TensorShape(_output->info()->dimension(2), _output->info()->dimension(0),
@@ -446,47 +445,58 @@
                     1, _output->info()->data_type());
     _output_nhwc.allocator()->init(info);
 
-    // Configure the InputTransform
-    _memory_group.manage(&_input_workspace);
-    _memory_group.manage(&_output_workspace);
+    const ITensor     *input_to_use  = _input;
+    ITensor           *output_to_use = _output;
+    PermutationVector  weights_permutation_vector(3U, 0U, 1U, 2U);
+    const unsigned int max_num_threads = NEScheduler::get().num_threads();
 
+    // Configure the kernel to transform the input tensor from NCHW -> NHWC
     if(data_layout == DataLayout::NCHW)
     {
-        // configure the kernel to transform the input tensor from NCHW -> NHWC
+        _memory_group.manage(&_input_nhwc);
         _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
-        _input_nhwc.allocator()->allocate();
-        transform_input_kernel->configure(&_input_nhwc, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
-                                          &_input_workspace, input_matrix_stride);
-
-        // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
-        _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 2U, 0U, 1U));
-
-        transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels);
-
-        //The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method
-        _memory_group.manage(&_output_nhwc);
-        transform_output_kernel->configure(biases, &_output_workspace,
-                                           output_matrix_stride, &_output_nhwc,
-                                           in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
-    }
-    else
-    {
-        transform_input_kernel->configure(_input, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
-                                          &_input_workspace, input_matrix_stride);
-
-        // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
-        _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 0U, 1U, 2U));
-
-        transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels);
-
-        transform_output_kernel->configure(biases, &_output_workspace,
-                                           output_matrix_stride, _output,
-                                           in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
+        input_to_use               = &_input_nhwc;
+        weights_permutation_vector = PermutationVector(3U, 2U, 0U, 1U);
     }
 
-    _gemm_function.configure(&_input_workspace, &_kernel_storage, nullptr, &_output_workspace, 1.0f, 0.f);
+    // Configure input transform kernel
+    _memory_group.manage(&_input_transformed);
+    _memory_group.manage(&_input_workspace);
+    transform_input_kernel->configure(input_to_use, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
+                                      &_input_transformed, input_matrix_stride, &_input_workspace);
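+    // Workspace sizing can only happen after configure(): the required size depends on
+    // the chosen transform and on how many scheduler threads may run it concurrently.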
+    const size_t input_workspace_size = transform_input_kernel->get_working_space_size(max_num_threads);
+    TensorInfo   input_workspace_info(TensorShape(input_workspace_size), 1, _input->info()->data_type());
+    _input_workspace.allocator()->init(input_workspace_info);
     _input_workspace.allocator()->allocate();
+    if(data_layout == DataLayout::NCHW)
+    {
+        _input_nhwc.allocator()->allocate();
+    }
+
+    // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
+    _permute_weights.configure(weights, &_weights_hwio, weights_permutation_vector);
+    transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels);
+
+    // Configure GEMM function
+    _memory_group.manage(&_output_transformed);
+    _gemm_function.configure(&_input_transformed, &_kernel_storage, nullptr, &_output_transformed, 1.0f, 0.f);
+    _input_transformed.allocator()->allocate();
+
+    // Configure output transform function
+    // The biases tensor has not been allocated at this point in time; the output transform will add the biases to the final result in the run() method
+    if(data_layout == DataLayout::NCHW)
+    {
+        _memory_group.manage(&_output_nhwc);
+        output_to_use = &_output_nhwc;
+    }
+    transform_output_kernel->configure(biases, &_output_transformed,
+                                       output_matrix_stride, output_to_use,
+                                       in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels, &_output_workspace);
+    const size_t output_workspace_size = transform_output_kernel->get_working_space_size(max_num_threads);
+    TensorInfo   output_workspace_info(TensorShape(output_workspace_size), 1, _output->info()->data_type());
+    _output_workspace.allocator()->init(output_workspace_info);
     _output_workspace.allocator()->allocate();
+    _output_transformed.allocator()->allocate();
 
     // Reorder the convolved output to ACL's NCHW ordering
     if(data_layout == DataLayout::NCHW)
@@ -513,7 +523,7 @@
 
     prepare();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     if(data_layout == DataLayout::NCHW)
     {
@@ -526,6 +536,7 @@
 
     // Run 16 GEMMs in multiple threads; each kernel runs one or more GEMMs
     _gemm_function.run();
+
     // Transform output tensor to the spatial domain
     NEScheduler::get().schedule(_transform_output_kernel.get(), Window::DimX);
 
@@ -539,8 +550,6 @@
     {
         _activationlayer_function.run();
     }
-
-    _memory_group.release();
 }
 
 Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
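
The `acquire()`/`release()` pair in `run()` is replaced by `MemoryGroupResourceScope` here and in the functions below, so the group is released on every exit path. A minimal sketch of the RAII shape of such a scope guard, assuming only that it acquires on construction and releases on destruction:

struct MemoryGroup // stand-in exposing the two calls the guard needs
{
    void acquire() { /* map backing memory for managed tensors */ }
    void release() { /* unmap it again */ }
};

class MemoryGroupResourceScope
{
public:
    explicit MemoryGroupResourceScope(MemoryGroup &group) : _group(group) { _group.acquire(); }
    ~MemoryGroupResourceScope() { _group.release(); }
    MemoryGroupResourceScope(const MemoryGroupResourceScope &) = delete;
    MemoryGroupResourceScope &operator=(const MemoryGroupResourceScope &) = delete;

private:
    MemoryGroup &_group;
};

void run(MemoryGroup &mg)
{
    MemoryGroupResourceScope scope_mg(mg); // acquire
    // ... schedule kernels; early returns and exceptions no longer skip the release ...
}                                          // release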
diff --git a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp b/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp
new file mode 100644
index 0000000..049bf66
--- /dev/null
+++ b/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/InfoHelpers.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+namespace
+{
+std::unique_ptr<depthwise::IDepthwiseConvolution> create_convolver(const ITensor      *input,
+                                                                   const ITensor      *weights,
+                                                                   ITensor            *output,
+                                                                   PadStrideInfo       conv_info,
+                                                                   ActivationLayerInfo act_info)
+{
+    const DataType    data_type = input->info()->data_type();
+    const TensorShape shape     = input->info()->tensor_shape();
+
+    const int n_batches      = shape[3];
+    const int in_rows        = shape.z();
+    const int in_cols        = shape.y();
+    const int n_channels     = shape.x();
+    const int padding_top    = conv_info.pad_top();
+    const int padding_left   = conv_info.pad_left();
+    const int padding_bottom = conv_info.pad_bottom();
+    const int padding_right  = conv_info.pad_right();
+
+    const unsigned int stride_x = conv_info.stride().first;
+
+    // Map activation function
+    neon_convolution_kernels::ActivationFunction activation = neon_convolution_kernels::ActivationFunction::None;
+    if(arm_compute::utils::info_helpers::is_relu(act_info))
+    {
+        activation = neon_convolution_kernels::ActivationFunction::ReLU;
+    }
+    else if(arm_compute::utils::info_helpers::is_relu6(act_info))
+    {
+        activation = neon_convolution_kernels::ActivationFunction::ReLU6;
+    }
+
+    // Create quantized convolver
+    if(data_type == DataType::QASYMM8)
+    {
+        const QuantizationInfo &input_qinfo   = input->info()->quantization_info();
+        const QuantizationInfo &weights_qinfo = weights->info()->quantization_info();
+        const QuantizationInfo &output_qinfo  = output->info()->quantization_info();
+
+        // Check that the quantization offsets are in the range [0, 255]
+        ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255);
+        ARM_COMPUTE_ERROR_ON(weights_qinfo.offset < 0 || weights_qinfo.offset > 255);
+        ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255);
+        const qasymm8::QAsymm8Params iqinfo{ static_cast<uint8_t>(input_qinfo.offset), input_qinfo.scale };
+        const qasymm8::QAsymm8Params wqinfo{ static_cast<uint8_t>(weights_qinfo.offset), weights_qinfo.scale };
+        const qasymm8::QAsymm8Params oqinfo{ static_cast<uint8_t>(output_qinfo.offset), output_qinfo.scale };
+
+        // Calculate rescale parameters
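+        // The three scales fold into one float factor which, per the helper's name, is
+        // expected to be less than one; it is then re-expressed as an integer multiplier
+        // plus a right shift so the kernel can rescale accumulators with integer-only
+        // arithmetic.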
+        const float fmultiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale;
+        int         qmultiplier = 0;
+        int         qshift      = 0;
+        quantization::calculate_quantized_multiplier_less_than_one(fmultiplier, &qmultiplier, &qshift);
+        qasymm8::QAsymm8RescaleParams rescale_params(qshift, qmultiplier, fmultiplier);
+
+        // Create convolver
+        switch(stride_x)
+        {
+            case 1:
+                return arm_compute::support::cpp14::make_unique<depthwise::QAsymm8DepthwiseConvolution<2, 2, 3, 3, 1, 1>>(
+                           n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
+            case 2:
+                return arm_compute::support::cpp14::make_unique<depthwise::QAsymm8DepthwiseConvolution<2, 2, 3, 3, 2, 2>>(
+                           n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
+            default:
+                return nullptr;
+        }
+    }
+    else
+    {
+        // Create float convolver
+        switch(data_type)
+        {
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+            case DataType::F16:
+            {
+                switch(stride_x)
+                {
+                    case 1:
+                        return arm_compute::support::cpp14::make_unique<depthwise::DepthwiseConvolution<3, 3, 3, 3, 1, 1, float16_t, float16_t, float16_t>>(
+                                   n_batches, in_rows, in_cols, n_channels, activation, padding_top, padding_left, padding_bottom, padding_right);
+                    case 2:
+                        return arm_compute::support::cpp14::make_unique<depthwise::DepthwiseConvolution<3, 3, 3, 3, 2, 2, float16_t, float16_t, float16_t>>(
+                                   n_batches, in_rows, in_cols, n_channels, activation, padding_top, padding_left, padding_bottom, padding_right);
+                    default:
+                        return nullptr;
+                }
+                break;
+            }
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+            case DataType::F32:
+            {
+                switch(stride_x)
+                {
+                    case 1:
+                        return arm_compute::support::cpp14::make_unique<depthwise::DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>>(
+                                   n_batches, in_rows, in_cols, n_channels, activation, padding_top, padding_left, padding_bottom, padding_right);
+                    case 2:
+                        return arm_compute::support::cpp14::make_unique<depthwise::DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>>(
+                                   n_batches, in_rows, in_cols, n_channels, activation, padding_top, padding_left, padding_bottom, padding_right);
+                    default:
+                        return nullptr;
+                }
+                break;
+            }
+            default:
+                return nullptr;
+        }
+    }
+}
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+NEDepthwiseConvolutionAssemblyDispatch::NEDepthwiseConvolutionAssemblyDispatch(std::shared_ptr<arm_compute::IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _input(nullptr), _weights(nullptr), _bias(nullptr), _output(nullptr), _packed_weights(), _workspace(), _is_prepared(false), _dwc_assembly_kernel(nullptr),
+      _dwc_acl_kernel()
+{
+}
+#endif /* DOXYGEN_SKIP_THIS */
+
+void NEDepthwiseConvolutionAssemblyDispatch::configure(const ITensor             *input,
+                                                       const ITensor             *weights,
+                                                       const ITensor             *bias,
+                                                       ITensor                   *output,
+                                                       const PadStrideInfo       &conv_info,
+                                                       unsigned int               depth_multiplier,
+                                                       const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_UNUSED(depth_multiplier);
+    ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionAssemblyDispatch::validate(input->info(),
+                                                                                weights->info(),
+                                                                                bias != nullptr ? bias->info() : nullptr,
+                                                                                output->info(),
+                                                                                conv_info,
+                                                                                depth_multiplier,
+                                                                                act_info));
+
+    // Output auto initialization if not yet initialized
+    const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
+
+    _input       = input;
+    _weights     = weights;
+    _bias        = bias;
+    _output      = output;
+    _is_prepared = false;
+
+    // Create convolver
+    _dwc_assembly_kernel = create_convolver(input, weights, output, conv_info, act_info);
+    ARM_COMPUTE_ERROR_ON(_dwc_assembly_kernel == nullptr);
+
+    // Create assembly kernel wrapper
+    _dwc_acl_kernel.configure(_dwc_assembly_kernel.get());
+
+    constexpr size_t alignment = 128;
+
+    // Create workspace
+    const unsigned int num_threads    = NEScheduler::get().num_threads();
+    const size_t       workspace_size = _dwc_assembly_kernel->get_working_space_size(num_threads);
+    ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "Workspace size cannot be 0!");
+    _workspace.allocator()->init(TensorInfo(TensorShape{ workspace_size }, 1, DataType::S8), alignment);
+    _memory_group.manage(&_workspace);
+    _workspace.allocator()->allocate();
+
+    // Create packing tensor
+    const size_t pack_tensor_size = _dwc_assembly_kernel->get_packed_params_size();
+    ARM_COMPUTE_ERROR_ON_MSG(pack_tensor_size == 0, "Pack tensor size cannot be 0!");
+    _packed_weights.allocator()->init(TensorInfo(TensorShape{ pack_tensor_size }, 1, DataType::S8), alignment);
+}
+
+Status NEDepthwiseConvolutionAssemblyDispatch::validate(const ITensorInfo         *input,
+                                                        const ITensorInfo         *weights,
+                                                        const ITensorInfo         *bias,
+                                                        const ITensorInfo         *output,
+                                                        const PadStrideInfo       &conv_info,
+                                                        unsigned int               depth_multiplier,
+                                                        const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+
+    const auto       strides     = conv_info.stride();
+    const DataLayout data_layout = input->data_layout();
+    unsigned int     width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    unsigned int     height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != 3 || weights->dimension(height_idx) != 3);
+    ARM_COMPUTE_RETURN_ERROR_ON(!((strides.first == strides.second) && ((strides.first == 1) || (strides.first == 2))));
+    ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier != 1);
+
+    const bool is_relu  = arm_compute::utils::info_helpers::is_relu(act_info);
+    const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info);
+    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !(is_relu || is_relu6));
+
+    // Check bias
+    if(bias != nullptr)
+    {
+        unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(channel_idx));
+    }
+
+    // Check output
+    if(output->total_size() != 0)
+    {
+        const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+bool NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(const ITensorInfo *input,
+                                                                    const ITensorInfo *weights,
+                                                                    PadStrideInfo      conv_info,
+                                                                    unsigned int       depth_multiplier,
+                                                                    const Size2D      &dilation)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
+
+    // Permute the input shape to NCHW dimension order if the tensor is in NHWC format
+    const DataLayout data_layout = input->data_layout();
+    TensorShape      in_shape{ input->tensor_shape() };
+    if(data_layout == DataLayout::NHWC)
+    {
+        in_shape.set(Window::DimX, input->tensor_shape().y());
+        in_shape.set(Window::DimY, input->tensor_shape().z());
+        in_shape.set(Window::DimZ, input->tensor_shape().x());
+    }
+
+    // Check data type
+    const DataType data_type          = weights->data_type();
+    bool           is_data_type_valid = is_data_type_float(data_type) || is_data_type_quantized_asymmetric(data_type);
+
+    // Check weights size
+    const unsigned int width_idx         = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const unsigned int height_idx        = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    bool               weights_supported = (weights->dimension(width_idx) == 3) && (weights->dimension(height_idx) == 3);
+
+    // Check for supported strides
+    const auto &strides           = conv_info.stride();
+    bool        supported_strides = (strides.first == strides.second) && ((strides.first == 1) || (strides.first == 2));
+
+    // Check for supported padding
+    const auto    pad_top           = conv_info.pad_top();
+    const auto    pad_right         = conv_info.pad_right();
+    const auto    pad_bottom        = conv_info.pad_bottom();
+    const auto    pad_left          = conv_info.pad_left();
+    PadStrideInfo same_pad          = calculate_same_pad(in_shape, TensorShape(3U, 3U), conv_info);
+    bool          is_same_padding   = (pad_top == same_pad.pad_top()) && (pad_right == same_pad.pad_right()) && (pad_bottom == same_pad.pad_bottom()) && (pad_left == same_pad.pad_left());
+    bool          is_valid_padding  = (pad_top == 0) && (pad_right == 0) && (pad_bottom == 0) && (pad_left == 0);
+    bool          supported_padding = is_same_padding || is_valid_padding;
+    bool          is_dilation_1     = dilation.x() == 1 && dilation.y() == 1;
+
+    return is_data_type_valid && weights_supported && supported_strides && supported_padding && (depth_multiplier == 1) && is_dilation_1;
+}
+
+void NEDepthwiseConvolutionAssemblyDispatch::run()
+{
+    // Prepare assembly kernel
+    prepare();
+
+    MemoryGroupResourceScope scope_mg(_memory_group);
+
+    // Setup inputs/outputs
+    ARM_COMPUTE_ERROR_ON(_workspace.buffer() == nullptr);
+    _dwc_assembly_kernel->set_working_space(static_cast<void *>(_workspace.buffer()));
+
+    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
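+    // The assembly kernel takes strides in elements, not bytes: for this NHWC tensor,
+    // strides_in_bytes() y/z/[3] are the column/row/batch byte strides, so each is
+    // divided by the element size before being handed over.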
+    const int   input_element_size = _input->info()->element_size();
+    const int   input_batch_stride = _input->info()->strides_in_bytes()[3] / input_element_size;
+    const int   input_row_stride   = _input->info()->strides_in_bytes().z() / input_element_size;
+    const int   input_col_stride   = _input->info()->strides_in_bytes().y() / input_element_size;
+    const void *input_ptr          = _input->buffer() + _input->info()->offset_first_element_in_bytes();
+    _dwc_assembly_kernel->set_input(input_ptr, input_batch_stride, input_row_stride, input_col_stride);
+
+    ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
+    const int output_element_size = _output->info()->element_size();
+    const int output_batch_stride = _output->info()->strides_in_bytes()[3] / output_element_size;
+    const int output_row_stride   = _output->info()->strides_in_bytes().z() / output_element_size;
+    const int output_col_stride   = _output->info()->strides_in_bytes().y() / output_element_size;
+    void     *output_ptr          = _output->buffer() + _output->info()->offset_first_element_in_bytes();
+    _dwc_assembly_kernel->set_output(output_ptr, output_batch_stride, output_row_stride, output_col_stride);
+
+    // Schedule assembly kernel
+    NEScheduler::get().schedule(&_dwc_acl_kernel, Window::DimX);
+}
+
+void NEDepthwiseConvolutionAssemblyDispatch::prepare()
+{
+    if(!_is_prepared)
+    {
+        _packed_weights.allocator()->allocate();
+        ARM_COMPUTE_ERROR_ON(_packed_weights.buffer() == nullptr);
+
+        // Pack weights and bias
+        const int weights_element_size = _weights->info()->element_size();
+        const int weights_row_stride   = _weights->info()->strides_in_bytes().z() / weights_element_size;
+        const int weights_col_stride   = _weights->info()->strides_in_bytes().y() / weights_element_size;
+        _dwc_assembly_kernel->pack_params(_packed_weights.buffer(),
+                                          _weights->buffer() + _weights->info()->offset_first_element_in_bytes(),
+                                          weights_row_stride,
+                                          weights_col_stride,
+                                          (_bias != nullptr) ? _bias->buffer() : nullptr);
+        _dwc_assembly_kernel->set_packed_params_buffer(_packed_weights.buffer());
+
+        _weights->mark_as_unused();
+        if(_bias != nullptr)
+        {
+            _bias->mark_as_unused();
+        }
+        _is_prepared = true;
+    }
+}
+} // namespace arm_compute
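
A hedged sketch of how a caller might gate on this fast path before using it, assuming the memory-manager constructor argument defaults to nullptr as for the other functions; the stride and padding values are illustrative:

#include "arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h"

void depthwise_dispatch_example(arm_compute::ITensor *input, arm_compute::ITensor *weights,
                                arm_compute::ITensor *bias, arm_compute::ITensor *output)
{
    using namespace arm_compute;

    const PadStrideInfo conv_info(1, 1, 1, 1); // stride 1 with 1-pixel pads: SAME for a 3x3 kernel
    const Size2D        dilation(1, 1);

    // Only take the assembly path when the static check accepts the configuration
    // (3x3 kernel, stride 1 or 2, SAME/VALID padding, depth multiplier 1, no dilation).
    if(NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input->info(), weights->info(),
                                                                      conv_info, 1, dilation))
    {
        NEDepthwiseConvolutionAssemblyDispatch dwc_dispatch;
        dwc_dispatch.configure(input, weights, bias, output, conv_info, 1, ActivationLayerInfo());
        dwc_dispatch.run();
    }
    // ... otherwise fall back to the generic NEON depthwise convolution ...
}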
diff --git a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
index 34aaea0..e207ab0 100644
--- a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
+++ b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
@@ -183,9 +183,8 @@
 {
     prepare();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
     NEScheduler::get().run_tagged_workloads(_workloads, _tag.c_str());
-    _memory_group.release();
 }
 
 void NEGEMMInterleavedWrapper::prepare()
diff --git a/src/runtime/Pyramid.cpp b/src/runtime/Pyramid.cpp
index ebd6570..bc7b550 100644
--- a/src/runtime/Pyramid.cpp
+++ b/src/runtime/Pyramid.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,8 +45,8 @@
 
 void Pyramid::internal_init(const PyramidInfo &info, bool auto_padding)
 {
-    _info    = info;
-    _pyramid = arm_compute::support::cpp14::make_unique<Tensor[]>(_info.num_levels());
+    _info = info;
+    _pyramid.resize(_info.num_levels());
 
     size_t      w            = _info.width();
     size_t      h            = _info.height();
@@ -56,11 +56,11 @@
     TensorShape tensor_shape = _info.tensor_shape();
 
     // Note: Look-up table used by the OpenVX sample implementation
-    const float c_orbscale[4] = { 0.5f,
-                                  SCALE_PYRAMID_ORB,
-                                  SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB,
-                                  SCALE_PYRAMID_ORB *SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB
-                                };
+    const std::array<float, 4> c_orbscale = { 0.5f,
+                                              SCALE_PYRAMID_ORB,
+                                              SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB,
+                                              SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB
+                                            };
 
     for(size_t i = 0; i < _info.num_levels(); ++i)
     {
@@ -71,7 +71,7 @@
             tensor_info.auto_padding();
         }
 
-        (_pyramid.get() + i)->allocator()->init(tensor_info);
+        _pyramid[i].allocator()->init(tensor_info);
 
         if(is_orb_scale)
         {
@@ -99,11 +99,9 @@
 
 void Pyramid::allocate()
 {
-    ARM_COMPUTE_ERROR_ON(_pyramid == nullptr);
-
     for(size_t i = 0; i < _info.num_levels(); ++i)
     {
-        (_pyramid.get() + i)->allocator()->allocate();
+        _pyramid[i].allocator()->allocate();
     }
 }
 
@@ -116,5 +114,5 @@
 {
     ARM_COMPUTE_ERROR_ON(index >= _info.num_levels());
 
-    return (_pyramid.get() + index);
-}
+    return &_pyramid[index];
+}
\ No newline at end of file
diff --git a/src/runtime/TensorAllocator.cpp b/src/runtime/TensorAllocator.cpp
index 38edb8b..0612d75 100644
--- a/src/runtime/TensorAllocator.cpp
+++ b/src/runtime/TensorAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -149,11 +149,11 @@
     info().set_is_resizable(true);
 }
 
-arm_compute::Status TensorAllocator::import_memory(void *memory, size_t size)
+Status TensorAllocator::import_memory(void *memory)
 {
     ARM_COMPUTE_RETURN_ERROR_ON(memory == nullptr);
-    ARM_COMPUTE_RETURN_ERROR_ON(size == 0);
     ARM_COMPUTE_RETURN_ERROR_ON(_associated_memory_group != nullptr);
+    ARM_COMPUTE_RETURN_ERROR_ON(alignment() != 0 && !arm_compute::utility::check_aligned(memory, alignment()));
 
     _memory.set_owned_region(support::cpp14::make_unique<MemoryRegion>(memory, info().total_size()));
     info().set_is_resizable(false);
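
The new single-argument form takes the buffer size from the tensor's own TensorInfo and additionally rejects pointers that break the allocator's alignment. A hedged usage sketch under the new signature, with error handling reduced to the Status check:

#include "arm_compute/runtime/Tensor.h"

#include <cstdlib>

void import_memory_example()
{
    using namespace arm_compute;

    Tensor tensor{};
    tensor.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));

    // Caller-owned storage; it must outlive every use of the tensor and must be at
    // least info().total_size() bytes, since no size argument is passed anymore.
    void *buffer = std::malloc(tensor.info()->total_size());
    if(bool(tensor.allocator()->import_memory(buffer)))
    {
        // ... run functions reading/writing `tensor` ...
        tensor.allocator()->free(); // detach the tensor before the buffer goes away
    }
    std::free(buffer);
}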