arm_compute v19.02

Change-Id: I853a3ecf38f206da13c1b03640c8adf73c20477c
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
index 0947d58..18ef185 100644
--- a/src/core/CL/CLHelpers.cpp
+++ b/src/core/CL/CLHelpers.cpp
@@ -148,7 +148,7 @@
     const GPUTarget gpu_target  = get_target_from_name(device_name);
 
     // SW_WORKAROUND: Workaround for DDK revision r14p0 to enable cl_arm_integer_dot_product_int8
-    std::set<GPUTarget> sw_workaround_issue = {GPUTarget::G76};
+    std::set<GPUTarget> sw_workaround_issue = { GPUTarget::G76 };
     return (device_supports_extension(device, "cl_arm_integer_dot_product_int8") || sw_workaround_issue.count(gpu_target) != 0);
 }
 
@@ -230,4 +230,29 @@
         return (std::find(winograd_configs_nhwc.begin(), winograd_configs_nhwc.end(), p) != winograd_configs_nhwc.end());
     }
 }
+
+size_t preferred_vector_width(const cl::Device &device, const DataType dt)
+{
+    switch(dt)
+    {
+        case DataType::U8:
+        case DataType::S8:
+        case DataType::QASYMM8:
+            return device.getInfo<CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR>();
+        case DataType::U16:
+        case DataType::S16:
+            return device.getInfo<CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT>();
+        case DataType::U32:
+        case DataType::S32:
+            return device.getInfo<CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT>();
+        case DataType::F16:
+        case DataType::F32:
+            return device.getInfo<CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT>();
+        case DataType::U64:
+        case DataType::S64:
+            return device.getInfo<CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG>();
+        default:
+            return 1;
+    }
+}
 } // namespace arm_compute
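For reference, preferred_vector_width() simply forwards the device's CL_DEVICE_PREFERRED_VECTOR_WIDTH_* query for the element type backing the given DataType. A minimal host-side sketch of how a caller might turn that into a kernel vectorization factor is shown below; the clamp_vec_size helper and the cap of 16 are illustrative assumptions, not part of this patch.

// Hypothetical usage sketch (not part of this patch): derive a -DVEC_SIZE value
// from the device's preferred width, capped at what the kernel source supports.
#include <algorithm>
#include <cstddef>

static std::size_t clamp_vec_size(std::size_t preferred, std::size_t max_supported)
{
    // Devices may report 0 for unsupported types; fall back to scalar processing.
    return std::max<std::size_t>(std::size_t(1), std::min(preferred, max_supported));
}

// Example (assumed call site):
//   const std::size_t vec_size = clamp_vec_size(preferred_vector_width(device, DataType::F32), 16);
//   build_opts.emplace("-DVEC_SIZE=" + std::to_string(vec_size));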
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index ff4803e..4ecb885 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -149,11 +149,7 @@
     { "accumulate_weighted", "accumulate.cl" },
     { "activation_layer", "activation_layer.cl" },
     { "activation_layer_qa8", "activation_layer_qa8.cl" },
-    { "arithmetic_add_quantized", "arithmetic_op_quantized.cl" },
-    { "arithmetic_add", "arithmetic_op.cl" },
-    { "arithmetic_sub", "arithmetic_op.cl" },
-    { "arithmetic_sub_quantized", "arithmetic_op_quantized.cl" },
-    { "arithmetic_div", "arithmetic_op.cl" },
+    { "activation_layer_logistic_qa8", "activation_layer_qa8.cl" },
     { "batch_to_space_nchw", "batch_to_space.cl" },
     { "batch_to_space_static_nchw", "batch_to_space.cl" },
     { "batch_to_space_nhwc", "batch_to_space.cl" },
@@ -180,6 +176,18 @@
     { "channel_extract_YUYV422", "channel_extract.cl" },
     { "combine_gradients_L1", "canny.cl" },
     { "combine_gradients_L2", "canny.cl" },
+    { "compare_equal", "comparisons.cl" },
+    { "compare_equal_quantized", "comparisons.cl" },
+    { "compare_notequal", "comparisons.cl" },
+    { "compare_notequal_quantized", "comparisons.cl" },
+    { "compare_greater", "comparisons.cl" },
+    { "compare_greater_quantized", "comparisons.cl" },
+    { "compare_greaterequal", "comparisons.cl" },
+    { "compare_greaterequal_quantized", "comparisons.cl" },
+    { "compare_less", "comparisons.cl" },
+    { "compare_less_quantized", "comparisons.cl" },
+    { "compare_lessequal", "comparisons.cl" },
+    { "compare_lessequal_quantized", "comparisons.cl" },
     { "concatenate_depth", "concatenate.cl" },
     { "concatenate_width", "concatenate.cl" },
     { "concatenate_width_x2", "concatenate.cl" },
@@ -218,9 +226,10 @@
     { "depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16", "depthwise_convolution.cl" },
     { "depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32", "depthwise_convolution.cl" },
     { "depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32", "depthwise_convolution.cl" },
+    { "depthwise_convolution_reshape_weights", "depthwise_convolution.cl" },
+    { "depthwise_convolution_reshape_weights_generic", "depthwise_convolution.cl" },
     { "depthwise_im2col", "depthwise_convolution.cl" },
     { "depthwise_vector_to_tensor", "depthwise_convolution.cl" },
-    { "depthwise_weights_reshape", "depthwise_convolution.cl" },
     { "dequantization_layer", "dequantization_layer.cl" },
     { "derivative", "derivative.cl" },
     { "dilate", "dilate.cl" },
@@ -234,6 +243,19 @@
     { "direct_convolution5x5_nhwc", "direct_convolution5x5.cl" },
     { "direct_convolution5x5_f32_bifrost", "direct_convolution5x5.cl" },
     { "direct_convolution_1x1_3x3_5x5_quantized", "direct_convolution_1x1_3x3_5x5_quantized.cl" },
+    { "elementwise_operation_ADD", "elementwise_operation.cl" },
+    { "elementwise_operation_SUB", "elementwise_operation.cl" },
+    { "elementwise_operation_MAX", "elementwise_operation.cl" },
+    { "elementwise_operation_MIN", "elementwise_operation.cl" },
+    { "elementwise_operation_DIV", "elementwise_operation.cl" },
+    { "elementwise_operation_SQUARED_DIFF", "elementwise_operation.cl" },
+    { "elementwise_operation_ADD_quantized", "elementwise_operation_quantized.cl" },
+    { "elementwise_operation_SUB_quantized", "elementwise_operation_quantized.cl" },
+    { "elementwise_operation_MAX_quantized", "elementwise_operation_quantized.cl" },
+    { "elementwise_operation_MIN_quantized", "elementwise_operation_quantized.cl" },
+    { "elementwise_operation_DIV_quantized", "elementwise_operation_quantized.cl" },
+    { "elementwise_operation_SQUARED_DIFF_quantized", "elementwise_operation_quantized.cl" },
+    { "elementwise_unary", "elementwise_unary.cl" },
     { "erode", "erode.cl" },
     { "fast_corners", "fast_corners.cl" },
     { "flatten", "flatten.cl" },
@@ -242,6 +264,7 @@
     { "finalize", "optical_flow_pyramid_lk.cl" },
     { "fuse_batchnormalization_layer", "batchnormalization_layer.cl" },
     { "floor_layer", "floor.cl" },
+    { "gather", "gather.cl" },
     { "gaussian1x5_sub_x", "gaussian_pyramid.cl" },
     { "gaussian5x1_sub_y", "gaussian_pyramid.cl" },
     { "gemm_accumulate_biases", "gemm.cl" },
@@ -260,8 +283,13 @@
     { "gemm_mm_floating_point_f16_bifrost_acc32", "gemm.cl" },
     { "gemm_mm_floating_point_f32_bifrost", "gemm.cl" },
     { "gemm_mm_floating_point_f32_bifrost_1000", "gemm.cl" },
+    { "gemm_mm_reshaped_lhs_nt_rhs_t", "gemm.cl" },
     { "gemm_lc_vm_f32", "gemm.cl" },
     { "gemm_transpose1xW", "gemm.cl" },
+    { "gemm_reshape_lhs_matrix_nt", "gemm.cl" },
+    { "gemm_reshape_lhs_matrix_t", "gemm.cl" },
+    { "gemm_reshape_rhs_matrix_nt", "gemm.cl" },
+    { "gemm_reshape_rhs_matrix_t", "gemm.cl" },
     { "gemmlowp_matrix_a_reduction", "gemmlowp.cl" },
     { "gemmlowp_matrix_a_reduction_dot8", "gemmlowp.cl" },
     { "gemmlowp_matrix_b_reduction", "gemmlowp.cl" },
@@ -271,6 +299,8 @@
     { "gemmlowp_mm_interleaved_transposed_bifrost", "gemmlowp.cl" },
     { "gemmlowp_mm_interleaved_transposed_bifrost_dot8", "gemmlowp.cl" },
     { "gemmlowp_mm_interleaved_transposed_midgard", "gemmlowp.cl" },
+    { "gemmlowp_mm_reshaped_lhs_nt_rhs_t", "gemmlowp.cl" },
+    { "gemmlowp_mm_reshaped_lhs_nt_rhs_t_dot8", "gemmlowp.cl" },
     { "gemmlowp_offset_contribution", "gemmlowp.cl" },
     { "gemmlowp_offset_contribution_quantize_down", "gemmlowp.cl" },
     { "gemmlowp_offset_contribution_quantize_down_fixedpoint", "gemmlowp.cl" },
@@ -296,6 +326,7 @@
     { "im2col_generic_nchw", "im2col.cl" },
     { "im2col_generic_padx0_pady0_nchw", "im2col.cl" },
     { "im2col3x3_nhwc", "im2col.cl" },
+    { "im2col9x9_nhwc", "im2col.cl" },
     { "im2col_generic_nhwc", "im2col.cl" },
     { "init_level", "optical_flow_pyramid_lk.cl" },
     { "init_level_max", "optical_flow_pyramid_lk.cl" },
@@ -326,7 +357,8 @@
     { "non_linear_filter_disk5x5", "non_linear_filter5x5.cl" },
     { "non_max_suppression", "nonmax.cl" },
     { "normalization_layer_cross_map", "normalization_layer.cl" },
-    { "normalization_layer_in_map", "normalization_layer.cl" },
+    { "normalization_layer_in_map_nchw", "normalization_layer.cl" },
+    { "normalization_layer_in_map_nhwc", "normalization_layer.cl" },
     { "normalize_planar_yuv_layer_nchw", "normalize_planar_yuv_layer.cl" },
     { "normalize_planar_yuv_layer_nhwc", "normalize_planar_yuv_layer.cl" },
     { "normalize_planar_yuv_layer_q8_nchw", "normalize_planar_yuv_layer_quantized.cl" },
@@ -340,9 +372,7 @@
     { "NV21_to_RGBA8888_bt709", "color_convert.cl" },
     { "NV21_to_YUV444_bt709", "color_convert.cl" },
     { "output_stage_quantized", "direct_convolution_1x1_3x3_5x5_quantized.cl" },
-    { "permute_201", "permute.cl" },
-    { "permute_120", "permute.cl" },
-    { "permute_3201", "permute.cl" },
+    { "permute", "permute.cl" },
     { "pixelwise_mul_float", "pixelwise_mul_float.cl" },
     { "pixelwise_mul_int", "pixelwise_mul_int.cl" },
     { "pixelwise_mul_quantized", "pixelwise_mul_int.cl" },
@@ -355,10 +385,11 @@
     { "pooling_layer_MxN_quantized_nhwc", "pooling_layer_quantized.cl" },
     { "pooling_layer_MxN_quantized_nchw", "pooling_layer_quantized.cl" },
     { "prior_box_layer_nchw", "prior_box_layer.cl" },
-    { "prior_box_layer_nhwc", "prior_box_layer.cl" },
     { "quantization_layer", "quantization_layer.cl" },
+    { "range", "range.cl" },
+    { "range_quantized", "range.cl" },
     { "reduction_operation_x", "reduction_operation.cl" },
-    { "reduction_operation_quantized_x", "reduction_operation.cl" },
+    { "reduction_operation_non_parallel_x", "reduction_operation.cl" },
     { "reduction_operation_y", "reduction_operation.cl" },
     { "reduction_operation_z", "reduction_operation.cl" },
     { "reduction_operation_w", "reduction_operation.cl" },
@@ -368,6 +399,7 @@
     { "reorg_layer_nhwc", "reorg_layer.cl" },
     { "reshape_layer", "reshape_layer.cl" },
     { "reshape_to_columns", "convolution_layer.cl" },
+    { "reverse", "reverse.cl" },
     { "RGB888_to_IYUV_bt709", "color_convert.cl" },
     { "RGB888_to_NV12_bt709", "color_convert.cl" },
     { "RGB888_to_RGBA8888_bt709", "color_convert.cl" },
@@ -386,6 +418,9 @@
     { "scale_bilinear_quantized_nchw", "scale_quantized.cl" },
     { "scale_bilinear_quantized_nhwc", "scale_quantized.cl" },
     { "scharr3x3", "scharr_filter.cl" },
+    { "select_same_rank", "select.cl" },
+    { "select_different_rank_2", "select.cl" },
+    { "select_different_rank_n", "select.cl" },
     { "sobel3x3", "sobel_filter.cl" },
     { "sobel_separable5x1", "sobel_filter.cl" },
     { "sobel_separable1x5", "sobel_filter.cl" },
@@ -401,12 +436,14 @@
     { "space_to_batch_nhwc", "space_to_batch.cl" },
     { "space_to_batch_static_nhwc", "space_to_batch.cl" },
     { "softmax_layer_max_shift_exp_sum_parallel", "softmax_layer.cl" },
+    { "stack_layer", "stack_layer.cl" },
     { "strided_slice", "slice_ops.cl" },
     { "suppress_non_maximum", "canny.cl" },
     { "tablelookup_U8", "tablelookup.cl" },
     { "tablelookup_S16", "tablelookup.cl" },
     { "threshold_binary", "threshold.cl" },
     { "threshold_range", "threshold.cl" },
+    { "tile", "tile.cl" },
     { "transpose", "transpose.cl" },
     { "UYVY422_to_IYUV_bt709", "color_convert.cl" },
     { "UYVY422_to_NV12_bt709", "color_convert.cl" },
@@ -494,14 +531,6 @@
 #include "./cl_kernels/activation_layer_qa8.clembed"
     },
     {
-        "arithmetic_op.cl",
-#include "./cl_kernels/arithmetic_op.clembed"
-    },
-    {
-        "arithmetic_op_quantized.cl",
-#include "./cl_kernels/arithmetic_op_quantized.clembed"
-    },
-    {
         "batch_to_space.cl",
 #include "./cl_kernels/batch_to_space.clembed"
     },
@@ -534,6 +563,10 @@
 #include "./cl_kernels/col2im.clembed"
     },
     {
+        "comparisons.cl",
+#include "./cl_kernels/comparisons.clembed"
+    },
+    {
         "concatenate.cl",
 #include "./cl_kernels/concatenate.clembed"
     },
@@ -622,6 +655,18 @@
 #include "./cl_kernels/direct_convolution_1x1_3x3_5x5_quantized.clembed"
     },
     {
+        "elementwise_operation.cl",
+#include "./cl_kernels/elementwise_operation.clembed"
+    },
+    {
+        "elementwise_operation_quantized.cl",
+#include "./cl_kernels/elementwise_operation_quantized.clembed"
+    },
+    {
+        "elementwise_unary.cl",
+#include "./cl_kernels/elementwise_unary.clembed"
+    },
+    {
         "erode.cl",
 #include "./cl_kernels/erode.clembed"
     },
@@ -642,6 +687,10 @@
 #include "./cl_kernels/floor.clembed"
     },
     {
+        "gather.cl",
+#include "./cl_kernels/gather.clembed"
+    },
+    {
         "gaussian_pyramid.cl",
 #include "./cl_kernels/gaussian_pyramid.clembed"
     },
@@ -778,6 +827,10 @@
 #include "./cl_kernels/quantization_layer.clembed"
     },
     {
+        "range.cl",
+#include "./cl_kernels/range.clembed"
+    },
+    {
         "reduction_operation.cl",
 #include "./cl_kernels/reduction_operation.clembed"
     },
@@ -794,6 +847,10 @@
 #include "./cl_kernels/reshape_layer.clembed"
     },
     {
+        "reverse.cl",
+#include "./cl_kernels/reverse.clembed"
+    },
+    {
         "roi_align_layer.cl",
 #include "./cl_kernels/roi_align_layer.clembed"
     },
@@ -814,6 +871,10 @@
 #include "./cl_kernels/scharr_filter.clembed"
     },
     {
+        "select.cl",
+#include "./cl_kernels/select.clembed"
+    },
+    {
         "sobel_filter.cl",
 #include "./cl_kernels/sobel_filter.clembed"
     },
@@ -834,6 +895,10 @@
 #include "./cl_kernels/space_to_batch.clembed"
     },
     {
+        "stack_layer.cl",
+#include "./cl_kernels/stack_layer.clembed"
+    },
+    {
         "tablelookup.cl",
 #include "./cl_kernels/tablelookup.clembed"
     },
@@ -842,6 +907,10 @@
 #include "./cl_kernels/threshold.clembed"
     },
     {
+        "tile.cl",
+#include "./cl_kernels/tile.clembed"
+    },
+    {
         "transpose.cl",
 #include "./cl_kernels/transpose.clembed"
     },
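The entries added above only extend the kernel-name to source-file map; programs are still compiled lazily when a kernel is first requested. A hedged sketch of fetching one of the new kernels through CLKernelLibrary follows; the build options shown are assumptions for illustration, and the library must already have been initialised with a CL context (e.g. via CLScheduler::default_init()).

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include <set>
#include <string>

void build_elementwise_add_kernel_example()
{
    // Assumed compile-time defines; real callers derive them from the tensors' info.
    const std::set<std::string> build_opts = { "-DOP=ADD", "-DDATA_TYPE=float", "-DVEC_SIZE=16" };
    // "elementwise_operation_ADD" resolves to "elementwise_operation.cl" through the
    // map extended above; the program is compiled on first use and cached afterwards.
    auto kernel = arm_compute::CLKernelLibrary::get().create_kernel("elementwise_operation_ADD", build_opts);
    (void)kernel;
}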
diff --git a/src/core/CL/cl_kernels/activation_helpers.h b/src/core/CL/cl_kernels/activation_helpers.h
index dfab082..9d4af84 100644
--- a/src/core/CL/cl_kernels/activation_helpers.h
+++ b/src/core/CL/cl_kernels/activation_helpers.h
@@ -70,7 +70,7 @@
 // Soft RELU Activation
 inline TYPE srelu_op(TYPE x)
 {
-    return LOG_OP(ADD_OP((TYPE)CONST_ONE, EXP_OP(x)));
+    return CONVERT(LOG_OP(ADD_OP((VEC_DATA_TYPE(float, VEC_SIZE))CONST_ONE, EXP_OP(CONVERT(x, VEC_DATA_TYPE(float, VEC_SIZE))))), TYPE);
 }
 // Absolute Activation
 inline TYPE abs_op(TYPE x)
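The srelu_op change above widens the input to float before evaluating log(1 + exp(x)) and converts back to TYPE afterwards, so the transcendental math no longer runs in the narrow element type. A standalone scalar sketch of the same widen-compute-narrow pattern (the names and the int16_t element type are assumptions for illustration):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Soft ReLU computed in float and narrowed back with saturation, mirroring the
// intent of the kernel change; int16_t stands in for the kernel's TYPE here.
static int16_t soft_relu_widened(int16_t x)
{
    const float xf  = static_cast<float>(x);
    const float res = std::log(1.0f + std::exp(xf));
    return static_cast<int16_t>(std::min(std::max(res, float(INT16_MIN)), float(INT16_MAX)));
}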
diff --git a/src/core/CL/cl_kernels/activation_layer_qa8.cl b/src/core/CL/cl_kernels/activation_layer_qa8.cl
index 8f6a807..cfb6137 100644
--- a/src/core/CL/cl_kernels/activation_layer_qa8.cl
+++ b/src/core/CL/cl_kernels/activation_layer_qa8.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,16 +26,6 @@
 #define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
 #define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
 
-// Logistic Activation
-inline TYPE logistic_op(TYPE x)
-{
-    VEC_FLOAT x_flt = CONVERT(x, VEC_FLOAT);
-    x_flt           = round(x_flt - (float)O1_VAL) * ((float)S1_VAL);
-    x_flt           = 1.f / (1.f + exp(-x_flt));
-
-    const TYPE x_u8 = CONVERT_SAT(round(x_flt / ((float)S1_VAL)) + (float)O1_VAL, TYPE);
-    return x_u8;
-}
 // RELU Activation
 inline TYPE relu_op(TYPE x)
 {
@@ -95,14 +85,14 @@
  * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[out] output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
  */
 __kernel void activation_layer_qa8(
     TENSOR3D_DECLARATION(input)
@@ -131,3 +121,69 @@
 }
 
 #endif /* defined(ACT) */
+
+#if defined(O2_VAL) && defined(S2_VAL)
+#define OFFSET_OUT O2_VAL
+#define SCALE_OUT S2_VAL
+#else // defined(O2_VAL) && defined(S2_VAL)
+#define OFFSET_OUT O1_VAL
+#define SCALE_OUT S1_VAL
+#endif // defined(O2_VAL) && defined(S2_VAL)
+
+/** This performs a Logistic activation function on QASYMM8 inputs.
+ *
+ * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
+ * @note Quantization scales of the input/output tensors are passed in with -DS1_VAL= and -DS2_VAL= respectively.
+ * @note Quantization offsets of the input/output tensors are passed in with -DO1_VAL= and -DO2_VAL= respectively.
+ * @note Quantized value of constant zero should be given as a preprocessor argument using -DCONST_0=value. e.g. -DCONST_0=128.
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: QASYMM8
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+__kernel void activation_layer_logistic_qa8(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get pixels pointer
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load data
+    TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
+
+    VEC_FLOAT data_flt = CONVERT(data, VEC_FLOAT);
+    data_flt           = round(data_flt - (float)O1_VAL) * ((float)S1_VAL);
+    data_flt           = 1.f / (1.f + exp(-data_flt));
+
+    data = CONVERT_SAT(round(data_flt / ((float)SCALE_OUT)) + (float)OFFSET_OUT, TYPE);
+
+    // Store result
+    VSTORE(VEC_SIZE)
+    (data, 0, (__global DATA_TYPE *)output.ptr);
+}
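The new activation_layer_logistic_qa8 kernel dequantizes with the input quantization parameters (O1_VAL/S1_VAL), evaluates the sigmoid in float, and requantizes with O2_VAL/S2_VAL when they are defined (otherwise it reuses the input parameters). A scalar host-side reference of the same arithmetic is sketched below; the default scale/offset values are example assumptions, not taken from the patch.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Scalar reference of the QASYMM8 logistic path above. Real scale/offset values
// come from the tensors' quantization info; the defaults here are only examples.
static uint8_t logistic_qasymm8_ref(uint8_t in,
                                    float in_scale = 0.0625f, int in_offset = 128,
                                    float out_scale = 1.0f / 256.0f, int out_offset = 0)
{
    const float x   = (static_cast<float>(in) - in_offset) * in_scale; // dequantize
    const float sig = 1.0f / (1.0f + std::exp(-x));                    // logistic
    const float q   = std::round(sig / out_scale) + out_offset;        // requantize
    return static_cast<uint8_t>(std::min(std::max(q, 0.0f), 255.0f));  // saturate to U8
}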
diff --git a/src/core/CL/cl_kernels/arithmetic_op.cl b/src/core/CL/cl_kernels/arithmetic_op.cl
deleted file mode 100644
index 557615e..0000000
--- a/src/core/CL/cl_kernels/arithmetic_op.cl
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#ifdef SATURATE
-#define ADD(x, y) add_sat((x), (y))
-#define SUB(x, y) sub_sat((x), (y))
-#else /* SATURATE */
-#define ADD(x, y) (x) + (y)
-#define SUB(x, y) (x) - (y)
-#endif /* SATURATE */
-
-#define DIV(x, y) (x) / (y)
-
-#if defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE)
-/** This function adds two tensors.
- *
- * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short
- * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- *
- * @param[in]  in1_ptr                           Pointer to the source tensor. Supported data types: U8/S16/F16/F32
- * @param[in]  in1_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in1_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in1_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  in1_step_z                        in1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in]  in2_ptr                           Pointer to the source tensor. Supported data types: U8/S16/F16/F32
- * @param[in]  in2_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in2_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in2_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  in2_step_z                        in2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: U8 (only if both inputs are U8), S16/F16/F32
- * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void arithmetic_add(
-    TENSOR3D_DECLARATION(in1),
-    TENSOR3D_DECLARATION(in2),
-    TENSOR3D_DECLARATION(out))
-{
-    // Get pixels pointer
-    Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
-    Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
-    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
-
-    // Load values
-    VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
-    in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
-    VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
-    in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
-
-    // Calculate and store result
-    VSTORE(VEC_SIZE)
-    (ADD(in_a, in_b), 0, (__global DATA_TYPE_OUT *)out.ptr);
-}
-#endif /* defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE) */
-
-/** This function subtracts one tensor from another.
- *
- * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short
- * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
- *
- * @param[in]  in1_ptr                           Pointer to the source tensor. Supported data types: U8, S16
- * @param[in]  in1_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in1_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in1_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  in1_step_z                        in1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in]  in2_ptr                           Pointer to the source tensor. Supported data types: U8, S16
- * @param[in]  in2_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in2_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in2_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  in2_step_z                        in2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: U8, S16
- * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void arithmetic_sub(
-    TENSOR3D_DECLARATION(in1),
-    TENSOR3D_DECLARATION(in2),
-    TENSOR3D_DECLARATION(out))
-{
-    // Get pixels pointer
-    Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
-    Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
-    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
-
-    // Load values
-    VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
-    in_a = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
-    VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
-    in_b = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
-
-    // Calculate and store result
-    vstore16(SUB(in_a, in_b), 0, (__global DATA_TYPE_OUT *)out.ptr);
-}
-
-/** This function divides one tensor from another.
- *
- * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN1=float -DDATA_TYPE_IN2=float -DDATA_TYPE_OUT=float
- *
- * @param[in]  in1_ptr                           Pointer to the source tensor. Supported data types: F16/F32
- * @param[in]  in1_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in1_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in1_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  in1_step_z                        in1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in]  in2_ptr                           Pointer to the source tensor. Supported data types: Same as @p in1_ptr
- * @param[in]  in2_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in2_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in2_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  in2_step_z                        in2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: Same as @p in1_ptr
- * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void arithmetic_div(
-    TENSOR3D_DECLARATION(in1),
-    TENSOR3D_DECLARATION(in2),
-    TENSOR3D_DECLARATION(out))
-{
-    // Get pixels pointer
-    Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
-    Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
-    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
-
-    // Load values
-    VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
-    in_a = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
-    VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
-    in_b = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
-
-    // Calculate and store result
-    vstore16(DIV(in_a, in_b), 0, (__global DATA_TYPE_OUT *)out.ptr);
-}
diff --git a/src/core/CL/cl_kernels/bounding_box_transform.cl b/src/core/CL/cl_kernels/bounding_box_transform.cl
index 0972355..e6f470a 100644
--- a/src/core/CL/cl_kernels/bounding_box_transform.cl
+++ b/src/core/CL/cl_kernels/bounding_box_transform.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/arithmetic_op_quantized.cl b/src/core/CL/cl_kernels/comparisons.cl
similarity index 68%
rename from src/core/CL/cl_kernels/arithmetic_op_quantized.cl
rename to src/core/CL/cl_kernels/comparisons.cl
index fc7fa77..8824b13 100644
--- a/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
+++ b/src/core/CL/cl_kernels/comparisons.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,37 +23,27 @@
  */
 #include "helpers.h"
 
-#ifdef SATURATE
-#define ADD(x, y) add_sat((x), (y))
-#define SUB(x, y) sub_sat((x), (y))
-#else /* SATURATE */
-#define ADD(x, y) (x) + (y)
-#define SUB(x, y) (x) - (y)
-#endif /* SATURATE */
+#define EQUAL(x, y) ((x) == (y))
+#define NOTEQUAL(x, y) ((x) != (y))
+#define GREATER(x, y) ((x) > (y))
+#define GREATEREQUAL(x, y) ((x) >= (y))
+#define LESS(x, y) ((x) < (y))
+#define LESSEQUAL(x, y) ((x) <= (y))
 
-#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
-#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+#define DEFINE_KERNEL_STR(name) compare_##name
+#define DEFINE_KERNEL(name) DEFINE_KERNEL_STR(name)
 
-#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT)
+#define DEFINE_KERNEL_QUANTIZED_STR(name) compare_##name##_quantized
+#define DEFINE_KERNEL_QUANTIZED(name) DEFINE_KERNEL_QUANTIZED_STR(name)
 
-#if defined(VEC_SIZE)
-
-#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
-#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
-#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE)
-
-/** This function adds two tensors.
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OP) && defined(OP_NAME)
+/** This function compares two tensors.
  *
- * @note The quantization offset of the first operand must be passed at compile time using -DOFFSET_IN1, i.e. -DOFFSET_IN1=10
- * @note The quantization offset of the second operand must be passed at compile time using -DOFFSET_IN2, i.e. -DOFFSET_IN2=10
- * @note The quantization offset of the output must be passed at compile time using -DOFFSET_OUT, i.e. -DOFFSET_OUT=10
- * @note The quantization scale of the first operand must be passed at compile time using -DSCALE_IN1, i.e. -DSCALE_IN1=10
- * @note The quantization scale of the second operand must be passed at compile time using -DSCALE_IN2, i.e. -DSCALE_IN2=10
- * @note The quantization scale of the output must be passed at compile time using -DSCALE_OUT, i.e. -DSCALE_OUT=10
- * @note To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention The inputs' data type need to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention The comparison operation should be given as a preprocessor argument using -DOP=operation. e.g. -DOP=LESS
  *
- * @param[in]  in1_ptr                           Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in]  in1_ptr                           Pointer to the source tensor. Supported data types: U8/S16/F16/F32
  * @param[in]  in1_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  in1_stride_y                      Stride of the source tensor in Y dimension (in bytes)
@@ -61,7 +51,7 @@
  * @param[in]  in1_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  in1_step_z                        in1_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in]  in2_ptr                           Pointer to the source tensor. Supported data types: same as @p in1_ptr
+ * @param[in]  in2_ptr                           Pointer to the source tensor. Supported data types: U8/S16/F16/F32
  * @param[in]  in2_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  in2_stride_y                      Stride of the source tensor in Y dimension (in bytes)
@@ -69,7 +59,7 @@
  * @param[in]  in2_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  in2_step_z                        in2_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: same as @p in1_ptr
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: U8
  * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
@@ -78,7 +68,7 @@
  * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
-__kernel void arithmetic_add_quantized(
+__kernel void DEFINE_KERNEL(OP_NAME)(
     TENSOR3D_DECLARATION(in1),
     TENSOR3D_DECLARATION(in2),
     TENSOR3D_DECLARATION(out))
@@ -88,33 +78,25 @@
     Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
     Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
 
-    VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)in1.ptr), VEC_INT);
-    VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)in2.ptr), VEC_INT);
+    // Load values
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    in_a = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    in_b = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2.ptr);
 
-    in_a = SUB(in_a, (VEC_INT)((int)OFFSET_IN1));
-    in_b = SUB(in_b, (VEC_INT)((int)OFFSET_IN2));
-
-    const VEC_FLOAT in1f32 = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1);
-    const VEC_FLOAT in2f32 = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2);
-
-    const VEC_FLOAT qresf32 = (in1f32 + in2f32) / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFFSET_OUT));
-    const VEC_UCHAR res     = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_UCHAR);
-
-    // Store result
+    // Calculate and store result
     VSTORE(VEC_SIZE)
-    (res, 0, (__global uchar *)out.ptr);
+    (CONVERT(OP(in_a, in_b), VEC_DATA_TYPE(uchar, VEC_SIZE)), 0, (__global uchar *)out.ptr);
 }
-#endif /* defined(VEC_SIZE) */
+#endif /* defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OP) && defined(OP_NAME) */
 
-/** This function subtracts two tensors.
+#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2)
+/** This function compares two quantized tensors.
  *
  * @note The quantization offset of the first operand must be passed at compile time using -DOFFSET_IN1, i.e. -DOFFSET_IN1=10
  * @note The quantization offset of the second operand must be passed at compile time using -DOFFSET_IN2, i.e. -DOFFSET_IN2=10
- * @note The quantization offset of the output must be passed at compile time using -DOFFSET_OUT, i.e. -DOFFSET_OUT=10
  * @note The quantization scale of the first operand must be passed at compile time using -DSCALE_IN1, i.e. -DSCALE_IN1=10
  * @note The quantization scale of the second operand must be passed at compile time using -DSCALE_IN2, i.e. -DSCALE_IN2=10
- * @note The quantization scale of the output must be passed at compile time using -DSCALE_OUT, i.e. -DSCALE_OUT=10
- * @note To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
  *
  * @param[in]  in1_ptr                           Pointer to the source tensor. Supported data types: QASYMM8
  * @param[in]  in1_stride_x                      Stride of the source tensor in X dimension (in bytes)
@@ -141,7 +123,7 @@
  * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
-__kernel void arithmetic_sub_quantized(
+__kernel void DEFINE_KERNEL_QUANTIZED(OP_NAME)(
     TENSOR3D_DECLARATION(in1),
     TENSOR3D_DECLARATION(in2),
     TENSOR3D_DECLARATION(out))
@@ -154,15 +136,14 @@
     int16 in_a = CONVERT(vload16(0, (__global uchar *)in1.ptr), int16);
     int16 in_b = CONVERT(vload16(0, (__global uchar *)in2.ptr), int16);
 
-    in_a = SUB(in_a, (int16)((int)OFFSET_IN1));
-    in_b = SUB(in_b, (int16)((int)OFFSET_IN2));
+    in_a = in_a - (int16)((int)OFFSET_IN1);
+    in_b = in_b - (int16)((int)OFFSET_IN2);
 
-    const float16 in1f32  = convert_float16(in_a) * (float16)((float)SCALE_IN1);
-    const float16 in2f32  = convert_float16(in_b) * (float16)((float)SCALE_IN2);
-    const float16 qresf32 = (in1f32 - in2f32) / ((float16)(float)SCALE_OUT) + ((float16)((float16)OFFSET_OUT));
-    const uchar16 res     = convert_uchar16_sat(convert_int16_rte(qresf32));
+    const float16 in1f32 = convert_float16(in_a) * (float16)((float)SCALE_IN1);
+    const float16 in2f32 = convert_float16(in_b) * (float16)((float)SCALE_IN2);
+    const int16   res    = OP(in1f32, in2f32);
 
     // Store result
-    vstore16(res, 0, (__global uchar *)out.ptr);
+    vstore16(convert_uchar16(res), 0, (__global uchar *)out.ptr);
 }
-#endif /* defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) */
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2) */
\ No newline at end of file
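comparisons.cl builds one kernel per operation: -DOP_NAME selects the kernel name via DEFINE_KERNEL/DEFINE_KERNEL_QUANTIZED, and -DOP selects one of the predicate macros defined at the top of the file. A hedged example of the defines a caller could pass to obtain the compare_less variant registered in CLKernelLibrary above (the exact option assembly inside the library may differ):

#include <set>
#include <string>

// With these defines, DEFINE_KERNEL(OP_NAME) expands to __kernel void compare_less(...)
// and OP(in_a, in_b) expands to LESS(in_a, in_b), i.e. ((in_a) < (in_b)).
std::set<std::string> make_compare_less_build_options()
{
    return { "-DOP=LESS", "-DOP_NAME=less", "-DDATA_TYPE=float", "-DVEC_SIZE=16" };
}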
diff --git a/src/core/CL/cl_kernels/concatenate.cl b/src/core/CL/cl_kernels/concatenate.cl
index 0e8805f..c374769 100644
--- a/src/core/CL/cl_kernels/concatenate.cl
+++ b/src/core/CL/cl_kernels/concatenate.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,8 +23,22 @@
  */
 #include "helpers.h"
 
-#if defined(DATA_TYPE) && defined(VEC_SIZE)
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE)
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+inline VEC_UCHAR requantize(VEC_UCHAR input, float in_offset, float out_offset, float in_scale, float out_scale)
+{
+    const VEC_FLOAT in_f32  = (CONVERT(input, VEC_FLOAT) - (VEC_FLOAT)((float)in_offset)) * (VEC_FLOAT)((float)in_scale);
+    const VEC_FLOAT out_f32 = in_f32 / ((VEC_FLOAT)(float)out_scale) + ((VEC_FLOAT)((float)out_offset));
+    const VEC_UCHAR res_u8  = CONVERT_SAT(CONVERT_DOWN(out_f32, VEC_INT), VEC_UCHAR);
+    return res_u8;
+}
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
 
+#if defined(DATA_TYPE) && defined(VEC_SIZE)
 #if defined(DEPTH) && defined(ELEMENT_SIZE)
 
 #if defined(INPUT1_WIDTH)
@@ -50,6 +64,7 @@
 #else // VEC_SIZE
 #error "Vector size not supported"
 #endif // VEC_SIZE
+
 /** This kernel concatenates two input tensors into the output tensor along the first dimension
  *
  * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
@@ -88,11 +103,15 @@
  * @param[in]  dst_stride_w                       Stride of the destination tensor in Z dimension (in bytes)
  * @param[in]  dst_step_w                         output_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  src1_pad_right                     Right paddings of the first input tensor in unit of elements
+ * @param[in]  src2_pad_left                      Left paddings of the second input tensor in unit of elements
  */
 __kernel void concatenate_width_x2(
     TENSOR4D_DECLARATION(src1),
     TENSOR4D_DECLARATION(src2),
-    TENSOR4D_DECLARATION(dst))
+    TENSOR4D_DECLARATION(dst),
+    uint src1_pad_right,
+    uint src2_pad_left)
 {
     Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT(dst, DEPTH);
 
@@ -101,16 +120,22 @@
     const int y  = get_global_id(1);
     const int z  = get_global_id(2) % (int)DEPTH;
     const int w  = get_global_id(2) / (int)DEPTH;
-    const int x1 = min(x, (int)INPUT1_WIDTH);
-    const int x2 = max(x - (int)INPUT1_WIDTH, -(int)VEC_SIZE);
+    const int x1 = min(x, (int)INPUT1_WIDTH + (int)src1_pad_right - (int)VEC_SIZE);
+    const int x2 = max(x - (int)INPUT1_WIDTH, -(int)src2_pad_left);
 
     // Calculate inputs and output addresses
     const __global uchar *in1_ptr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 * (int)src1_stride_x + y * (int)src1_stride_y + z * (int)src1_stride_z + w * (int)src1_stride_w;
     const __global uchar *in2_ptr = src2_ptr + (int)src2_offset_first_element_in_bytes + x2 * (int)src2_stride_x + y * (int)src2_stride_y + z * (int)src2_stride_z + w * (int)src2_stride_w;
 
-    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1_ptr);
-    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2_ptr);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1_ptr);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2_ptr);
 
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2)
+    src1_values = requantize(src1_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
+    src2_values = requantize(src2_values, OFFSET_IN2, OFFSET_OUT, SCALE_IN2, SCALE_OUT);
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2) */
     const VEC_DATA_TYPE(int, VEC_SIZE) x_coords        = SEQ + (VEC_DATA_TYPE(int, VEC_SIZE))(x);
     const VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE) cond = CONVERT(x_coords < (VEC_DATA_TYPE(int, VEC_SIZE))(INPUT1_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE));
     const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) values    = select(src2_values, src1_values, cond);
@@ -180,13 +205,25 @@
  * @param[in]  dst_stride_w                       Stride of the destination tensor in Z dimension (in bytes)
  * @param[in]  dst_step_w                         output_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  src1_pad_right                     Right paddings of the first input tensor in unit of elements
+ * @param[in]  src2_pad_left                      Left paddings of the second input tensor in unit of elements
+ * @param[in]  src2_pad_right                     Right paddings of the second input tensor in unit of elements
+ * @param[in]  src3_pad_left                      Left paddings of the third input tensor in unit of elements
+ * @param[in]  src3_pad_right                     Right paddings of the third input tensor in unit of elements
+ * @param[in]  src4_pad_left                      Left paddings of the fourth input tensor in unit of elements
  */
 __kernel void concatenate_width_x4(
     TENSOR4D_DECLARATION(src1),
     TENSOR4D_DECLARATION(src2),
     TENSOR4D_DECLARATION(src3),
     TENSOR4D_DECLARATION(src4),
-    TENSOR4D_DECLARATION(dst))
+    TENSOR4D_DECLARATION(dst),
+    uint src1_pad_right,
+    uint src2_pad_left,
+    uint src2_pad_right,
+    uint src3_pad_left,
+    uint src3_pad_right,
+    uint src4_pad_left)
 {
     Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT(dst, DEPTH);
 
@@ -196,10 +233,10 @@
     const int z = get_global_id(2) % (int)DEPTH;
     const int w = get_global_id(2) / (int)DEPTH;
 
-    const int x1 = min(x, (int)INPUT1_WIDTH);
-    const int x2 = min(max(x - (int)INPUT1_WIDTH, -(int)VEC_SIZE), (int)INPUT2_WIDTH);
-    const int x3 = min(max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH, -(int)VEC_SIZE), (int)INPUT3_WIDTH);
-    const int x4 = max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH - (int)INPUT3_WIDTH, -(int)VEC_SIZE);
+    const int x1 = min(x, (int)INPUT1_WIDTH + (int)src1_pad_right - (int)VEC_SIZE);
+    const int x2 = min(max(x - (int)INPUT1_WIDTH, -(int)src2_pad_left), (int)INPUT2_WIDTH + (int)src2_pad_right - (int)VEC_SIZE);
+    const int x3 = min(max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH, -(int)src3_pad_left), (int)INPUT3_WIDTH + (int)src3_pad_right - (int)VEC_SIZE);
+    const int x4 = max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH - (int)INPUT3_WIDTH, -(int)src4_pad_left);
 
     // Calculate inputs and output addresses
     const __global uchar *in1_ptr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 * (int)src1_stride_x + y * (int)src1_stride_y + z * (int)src1_stride_z + w * (int)src1_stride_w;
@@ -207,10 +244,21 @@
     const __global uchar *in3_ptr = src3_ptr + (int)src3_offset_first_element_in_bytes + x3 * (int)src3_stride_x + y * (int)src3_stride_y + z * (int)src3_stride_z + w * (int)src3_stride_w;
     const __global uchar *in4_ptr = src4_ptr + (int)src4_offset_first_element_in_bytes + x4 * (int)src4_stride_x + y * (int)src4_stride_y + z * (int)src4_stride_z + w * (int)src4_stride_w;
 
-    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1_ptr);
-    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2_ptr);
-    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src3_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in3_ptr);
-    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src4_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in4_ptr);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1_ptr);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2_ptr);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    src3_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in3_ptr);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    src4_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in4_ptr);
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2) && defined(OFFSET_IN3) && defined(SCALE_IN3) && defined(OFFSET_IN4) && defined(SCALE_IN4)
+    src1_values = requantize(src1_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
+    src2_values = requantize(src2_values, OFFSET_IN2, OFFSET_OUT, SCALE_IN2, SCALE_OUT);
+    src3_values = requantize(src3_values, OFFSET_IN3, OFFSET_OUT, SCALE_IN3, SCALE_OUT);
+    src4_values = requantize(src4_values, OFFSET_IN4, OFFSET_OUT, SCALE_IN4, SCALE_OUT);
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2) && defined(OFFSET_IN3) && defined(SCALE_IN3) && defined(OFFSET_IN4) && defined(SCALE_IN4) */
 
     const VEC_DATA_TYPE(int, VEC_SIZE) x_coords = SEQ + (VEC_DATA_TYPE(int, VEC_SIZE))(x);
 
@@ -259,6 +307,7 @@
  * @param[in]  dst_step_w                        output_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
+
 __kernel void concatenate_width(
     TENSOR4D_DECLARATION(src),
     TENSOR4D_DECLARATION(dst))
@@ -269,9 +318,16 @@
     VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
     source_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr);
 
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+    const VEC_UCHAR out = requantize(source_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
+    VSTORE(VEC_SIZE)
+    (out, 0, (__global DATA_TYPE *)(dst.ptr) + WIDTH_OFFSET);
+#else  /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
     VSTORE(VEC_SIZE)
     (source_values, 0, (__global DATA_TYPE *)(dst.ptr) + WIDTH_OFFSET);
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
 }
+
 #endif /* defined(WIDTH_OFFSET) && defined(DEPTH) */
 
 /** This kernel concatenates the input tensor into the output tensor along the third dimension
@@ -308,7 +364,12 @@
     VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
     source_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, -offsets.x, -offsets.y, 0));
 
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+    source_values = requantize(source_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+
     VSTORE(VEC_SIZE)
     (source_values, 0, (__global DATA_TYPE *)(dst.ptr + offsets.z));
+
 }
 #endif /* defined(DATA_TYPE) && defined(VEC_SIZE) */
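The requantize() helper used by the new quantized concatenation paths maps a value from the input quantization to the output quantization. A minimal scalar sketch of the assumed arithmetic (the in-tree helper operates on VEC_SIZE-wide vectors and its exact rounding may differ):

    // Sketch only, not part of the patch: hypothetical scalar equivalent of requantize().
    uchar requantize_scalar(uchar in, int offset_in, int offset_out, float scale_in, float scale_out)
    {
        const float real_value = (float)((int)in - offset_in) * scale_in;          // dequantize with the input parameters
        const int   quantized  = (int)round(real_value / scale_out) + offset_out;  // quantize with the output parameters
        return (uchar)clamp(quantized, 0, 255);                                    // saturate to the QASYMM8 range
    }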
diff --git a/src/core/CL/cl_kernels/convolution3x3.cl b/src/core/CL/cl_kernels/convolution3x3.cl
index 8c75ecd..625c6c4 100644
--- a/src/core/CL/cl_kernels/convolution3x3.cl
+++ b/src/core/CL/cl_kernels/convolution3x3.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -72,7 +72,6 @@
  * @param[in] mat4  Coefficient from the convolution matrix
  * @param[in] mat5  Coefficient from the convolution matrix
  * @param[in] mat6  Coefficient from the convolution matrix
- * @param[in] mat0  Coefficient from the convolution matrix
  * @param[in] mat7  Coefficient from the convolution matrix
  * @param[in] mat8  Coefficient from the convolution matrix
  * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0)
diff --git a/src/core/CL/cl_kernels/convolution5x5.cl b/src/core/CL/cl_kernels/convolution5x5.cl
index 605cd09..2c3cafa 100644
--- a/src/core/CL/cl_kernels/convolution5x5.cl
+++ b/src/core/CL/cl_kernels/convolution5x5.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -132,7 +132,6 @@
  * @param[in] mat4  Coefficient from the convolution matrix
  * @param[in] mat5  Coefficient from the convolution matrix
  * @param[in] mat6  Coefficient from the convolution matrix
- * @param[in] mat0  Coefficient from the convolution matrix
  * @param[in] mat7  Coefficient from the convolution matrix
  * @param[in] mat8  Coefficient from the convolution matrix
  * @param[in] mat9  Coefficient from the convolution matrix
@@ -143,7 +142,6 @@
  * @param[in] mat14 Coefficient from the convolution matrix
  * @param[in] mat15 Coefficient from the convolution matrix
  * @param[in] mat16 Coefficient from the convolution matrix
- * @param[in] mat10 Coefficient from the convolution matrix
  * @param[in] mat17 Coefficient from the convolution matrix
  * @param[in] mat18 Coefficient from the convolution matrix
  * @param[in] mat19 Coefficient from the convolution matrix
diff --git a/src/core/CL/cl_kernels/convolution7x7.cl b/src/core/CL/cl_kernels/convolution7x7.cl
index 1abfb15..9dd6a88 100644
--- a/src/core/CL/cl_kernels/convolution7x7.cl
+++ b/src/core/CL/cl_kernels/convolution7x7.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -152,7 +152,6 @@
  * @param[in] mat4  Coefficient from the convolution matrix
  * @param[in] mat5  Coefficient from the convolution matrix
  * @param[in] mat6  Coefficient from the convolution matrix
- * @param[in] mat0  Coefficient from the convolution matrix
  * @param[in] mat7  Coefficient from the convolution matrix
  * @param[in] mat8  Coefficient from the convolution matrix
  * @param[in] mat9  Coefficient from the convolution matrix
@@ -163,7 +162,6 @@
  * @param[in] mat14 Coefficient from the convolution matrix
  * @param[in] mat15 Coefficient from the convolution matrix
  * @param[in] mat16 Coefficient from the convolution matrix
- * @param[in] mat10 Coefficient from the convolution matrix
  * @param[in] mat17 Coefficient from the convolution matrix
  * @param[in] mat18 Coefficient from the convolution matrix
  * @param[in] mat19 Coefficient from the convolution matrix
diff --git a/src/core/CL/cl_kernels/convolution9x9.cl b/src/core/CL/cl_kernels/convolution9x9.cl
index f537326..2a5f4a1 100644
--- a/src/core/CL/cl_kernels/convolution9x9.cl
+++ b/src/core/CL/cl_kernels/convolution9x9.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -173,7 +173,6 @@
  * @param[in] mat4  Coefficient from the convolution matrix
  * @param[in] mat5  Coefficient from the convolution matrix
  * @param[in] mat6  Coefficient from the convolution matrix
- * @param[in] mat0  Coefficient from the convolution matrix
  * @param[in] mat7  Coefficient from the convolution matrix
  * @param[in] mat8  Coefficient from the convolution matrix
  * @param[in] mat9  Coefficient from the convolution matrix
@@ -184,7 +183,6 @@
  * @param[in] mat14 Coefficient from the convolution matrix
  * @param[in] mat15 Coefficient from the convolution matrix
  * @param[in] mat16 Coefficient from the convolution matrix
- * @param[in] mat10 Coefficient from the convolution matrix
  * @param[in] mat17 Coefficient from the convolution matrix
  * @param[in] mat18 Coefficient from the convolution matrix
  * @param[in] mat19 Coefficient from the convolution matrix
@@ -245,7 +243,6 @@
  * @param[in] mat74 Coefficient from the convolution matrix
  * @param[in] mat75 Coefficient from the convolution matrix
  * @param[in] mat76 Coefficient from the convolution matrix
- * @param[in] mat76 Coefficient from the convolution matrix
  * @param[in] mat77 Coefficient from the convolution matrix
  * @param[in] mat78 Coefficient from the convolution matrix
  * @param[in] mat79 Coefficient from the convolution matrix
diff --git a/src/core/CL/cl_kernels/depth_convert.cl b/src/core/CL/cl_kernels/depth_convert.cl
index 611449e..75192e6 100644
--- a/src/core/CL/cl_kernels/depth_convert.cl
+++ b/src/core/CL/cl_kernels/depth_convert.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -38,80 +38,92 @@
 
 /** This function performs a down-scaling depth conversion.
  *
- * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
+ * @note The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
  * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
  *
  * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8/U16/S16/U32/S32/F16/F32
  * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
  * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
  * @param[in]  in_step_y                         in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in_step_z                         in_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
  * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8/U16/S16/U32/S32/F16/F32
  * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
  * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
  * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
  * @param[in]  shift                             The integer shift amount value. Supported data types: S32
  */
 __kernel void convert_depth_down(
-    IMAGE_DECLARATION(in),
-    IMAGE_DECLARATION(out),
+    TENSOR3D_DECLARATION(in),
+    TENSOR3D_DECLARATION(out),
     const int shift)
 {
     // Get pixels pointer
-    Image in  = CONVERT_TO_IMAGE_STRUCT(in);
-    Image out = CONVERT_TO_IMAGE_STRUCT(out);
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(in);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
 
     // Load data
-    VEC_DATA_TYPE(DATA_TYPE_IN, 16)
-    in_data = vload16(0, (__global DATA_TYPE_IN *)in.ptr);
+    VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
+    in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)in.ptr);
 
 #if defined(IS_DATA_TYPE_FLOAT)
-    const DATA_TYPE_IN scale = (DATA_TYPE_IN)(1 << shift);
-    vstore16(CONVERT_DOWN(in_data / scale, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr);
+    VSTORE(VEC_SIZE)
+    (CONVERT_DOWN(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)out.ptr);
 #else  /* defined(IS_DATA_TYPE_FLOAT) */
-    vstore16(CONVERT_DOWN(in_data >> shift, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr);
+    VSTORE(VEC_SIZE)
+    (CONVERT_DOWN(in_data >> shift, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)out.ptr);
 #endif /* defined(IS_DATA_TYPE_FLOAT) */
 }
 
 /** This function performs an up-scaling depth conversion.
  *
- * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
+ * @note The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
  * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
  *
  * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8/U16/S16/U32/S32/F16/F32
  * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
  * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
  * @param[in]  in_step_y                         in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in_step_z                         in_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
  * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8/U16/S16/U32/S32/F16/F32
  * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
  * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
  * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
  * @param[in]  shift                             The integer shift amount value. Supported data types: S32
  */
 __kernel void convert_depth_up(
-    IMAGE_DECLARATION(in),
-    IMAGE_DECLARATION(out),
+    TENSOR3D_DECLARATION(in),
+    TENSOR3D_DECLARATION(out),
     const int shift)
 {
     // Get pixels pointer
-    Image in  = CONVERT_TO_IMAGE_STRUCT(in);
-    Image out = CONVERT_TO_IMAGE_STRUCT(out);
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(in);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
 
     // Load data
-    VEC_DATA_TYPE(DATA_TYPE_IN, 16)
-    in_data = vload16(0, (__global DATA_TYPE_IN *)in.ptr);
+    VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
+    in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)in.ptr);
 
 #if defined(IS_DATA_TYPE_FLOAT)
-    const DATA_TYPE_OUT scale = (DATA_TYPE_OUT)(1 << shift);
-    vstore16(CONVERT_UP(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)) * scale, 0, (__global DATA_TYPE_OUT *)out.ptr);
+    VSTORE(VEC_SIZE)
+    (CONVERT_UP(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)out.ptr);
 #else  /* defined(IS_DATA_TYPE_FLOAT) */
-    vstore16(CONVERT_UP(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)) << shift, 0, (__global DATA_TYPE_OUT *)out.ptr);
+    VSTORE(VEC_SIZE)
+    (CONVERT_UP(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)) << shift, 0, (__global DATA_TYPE_OUT *)out.ptr);
 #endif /* defined(IS_DATA_TYPE_FLOAT) */
 }
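A small worked example of the integer paths above, with illustrative values (DATA_TYPE_IN=short, DATA_TYPE_OUT=uchar, shift=8):

    // Down-conversion: shift right, then narrow the type
    short in_value  = 6699;                    // 0x1A2B
    uchar out_value = (uchar)(in_value >> 8);  // 26 (0x1A); CONVERT_DOWN may saturate, which makes no difference here

    // Up-conversion: widen the type, then shift left
    short widened   = ((short)out_value) << 8; // 6656 -- the low byte is not recovered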
diff --git a/src/core/CL/cl_kernels/depthwise_convolution.cl b/src/core/CL/cl_kernels/depthwise_convolution.cl
index bfaa92b..4f6fdfa 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -464,6 +464,104 @@
 
 #endif // defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
 
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DST_WIDTH)
+/** Reshape the weights for quantized depthwise convolution
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type, e.g. -DDATA_TYPE=uchar
+ * @note Output width should be given as a preprocessor argument using -DDST_WIDTH=width, e.g. -DDST_WIDTH=128
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=vec_size, e.g., -DVEC_SIZE=4
+ * @attention Input's height and width should be 3
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void depthwise_convolution_reshape_weights(
+    TENSOR3D_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Vector    src = CONVERT_TO_VECTOR_STRUCT(src);
+    const int x   = get_global_id(0);
+
+    // Load 3x3xVEC_SIZE weights
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    w0 = VLOAD(VEC_SIZE)(0, src.ptr + 0 * src_stride_y + 0 * src_stride_z);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    w1 = VLOAD(VEC_SIZE)(0, src.ptr + 1 * src_stride_y + 0 * src_stride_z);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    w2 = VLOAD(VEC_SIZE)(0, src.ptr + 2 * src_stride_y + 0 * src_stride_z);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    w3 = VLOAD(VEC_SIZE)(0, src.ptr + 0 * src_stride_y + 1 * src_stride_z);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    w4 = VLOAD(VEC_SIZE)(0, src.ptr + 1 * src_stride_y + 1 * src_stride_z);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    w5 = VLOAD(VEC_SIZE)(0, src.ptr + 2 * src_stride_y + 1 * src_stride_z);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    w6 = VLOAD(VEC_SIZE)(0, src.ptr + 0 * src_stride_y + 2 * src_stride_z);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    w7 = VLOAD(VEC_SIZE)(0, src.ptr + 1 * src_stride_y + 2 * src_stride_z);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    w8 = VLOAD(VEC_SIZE)(0, src.ptr + 2 * src_stride_y + 2 * src_stride_z);
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * DST_WIDTH * sizeof(DATA_TYPE);
+
+#if defined(TRANSPOSE)
+#if VEC_SIZE != 4
+#error "VEC_SIZE not supported"
+#else  // VEC_SIZE != 4
+    VSTORE(VEC_SIZE)
+    ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w0.s0, w1.s0, w2.s0, w3.s0), 0, dst_addr + 0);
+    VSTORE(VEC_SIZE)
+    ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w4.s0, w5.s0, w6.s0, w7.s0), 0, dst_addr + 1 * sizeof(DATA_TYPE) * VEC_SIZE);
+    VSTORE(VEC_SIZE)
+    ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w8.s0, w0.s1, w1.s1, w2.s1), 0, dst_addr + 2 * sizeof(DATA_TYPE) * VEC_SIZE);
+    VSTORE(VEC_SIZE)
+    ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w3.s1, w4.s1, w5.s1, w6.s1), 0, dst_addr + 3 * sizeof(DATA_TYPE) * VEC_SIZE);
+    VSTORE(VEC_SIZE)
+    ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w7.s1, w8.s1, w0.s2, w1.s2), 0, dst_addr + 4 * sizeof(DATA_TYPE) * VEC_SIZE);
+    VSTORE(VEC_SIZE)
+    ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w2.s2, w3.s2, w4.s2, w5.s2), 0, dst_addr + 5 * sizeof(DATA_TYPE) * VEC_SIZE);
+    VSTORE(VEC_SIZE)
+    ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w6.s2, w7.s2, w8.s2, w0.s3), 0, dst_addr + 6 * sizeof(DATA_TYPE) * VEC_SIZE);
+    VSTORE(VEC_SIZE)
+    ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w1.s3, w2.s3, w3.s3, w4.s3), 0, dst_addr + 7 * sizeof(DATA_TYPE) * VEC_SIZE);
+    VSTORE(VEC_SIZE)
+    ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w5.s3, w6.s3, w7.s3, w8.s3), 0, dst_addr + 8 * sizeof(DATA_TYPE) * VEC_SIZE);
+#endif // VEC_SIZE != 4
+#else  // !defined(TRANSPOSE)
+    VSTORE(VEC_SIZE)
+    (w0, 0, dst_addr + 0);
+    VSTORE(VEC_SIZE)
+    (w1, 0, dst_addr + 1 * sizeof(DATA_TYPE) * VEC_SIZE);
+    VSTORE(VEC_SIZE)
+    (w2, 0, dst_addr + 2 * sizeof(DATA_TYPE) * VEC_SIZE);
+    VSTORE(VEC_SIZE)
+    (w3, 0, dst_addr + 3 * sizeof(DATA_TYPE) * VEC_SIZE);
+    VSTORE(VEC_SIZE)
+    (w4, 0, dst_addr + 4 * sizeof(DATA_TYPE) * VEC_SIZE);
+    VSTORE(VEC_SIZE)
+    (w5, 0, dst_addr + 5 * sizeof(DATA_TYPE) * VEC_SIZE);
+    VSTORE(VEC_SIZE)
+    (w6, 0, dst_addr + 6 * sizeof(DATA_TYPE) * VEC_SIZE);
+    VSTORE(VEC_SIZE)
+    (w7, 0, dst_addr + 7 * sizeof(DATA_TYPE) * VEC_SIZE);
+    VSTORE(VEC_SIZE)
+    (w8, 0, dst_addr + 8 * sizeof(DATA_TYPE) * VEC_SIZE);
+#endif // defined(TRANSPOSE)
+}
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DST_WIDTH)
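The layout produced by the non-transposed path can be summarised with the following editorial sketch (a hypothetical host-side view, not code from the patch): each destination row holds the 9 * VEC_SIZE weights consumed by one work-item of the NHWC quantized kernels.

    // For one group of VEC_SIZE channels, writing the nine 3x3 taps w0..w8 to dst_row:
    for(int tap = 0; tap < 9; ++tap)                  // taps in the order w0..w8
        for(int c = 0; c < VEC_SIZE; ++c)             // VEC_SIZE channels per group
            dst_row[tap * VEC_SIZE + c] = w[tap][c];  // consecutive VEC_SIZE-wide blocks

The TRANSPOSE path instead stores the taps channel by channel (all nine taps of channel 0, then channel 1, and so on), which is the order read back by the dot-product kernel in depthwise_convolution_quantized.cl.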
+
 #if defined(NCHW)
 #define in_stride_x src_stride_x
 #define in_stride_y src_stride_y
@@ -504,7 +602,7 @@
  * @param[in]  biases_step_x                        (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
  */
-__kernel void depthwise_weights_reshape(
+__kernel void depthwise_convolution_reshape_weights_generic(
     TENSOR3D_DECLARATION(src),
     IMAGE_DECLARATION(dst)
 #ifdef HAS_BIAS
@@ -1091,9 +1189,9 @@
 #if defined(DST_DEPTH)
     int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
     int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else /* defined(DST_DEPTH) */
+#else // defined(DST_DEPTH)
     int z = get_global_id(2); // spatial coordinate y
-#endif /* defined(DST_DEPTH) */
+#endif // defined(DST_DEPTH)
 
     Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
 
@@ -1240,9 +1338,9 @@
 #if defined(DST_DEPTH)
     int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
     int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else /* defined(DST_DEPTH) */
+#else // defined(DST_DEPTH)
     int z = get_global_id(2); // spatial coordinate y
-#endif /* defined(DST_DEPTH) */
+#endif // defined(DST_DEPTH)
 
     Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
 
@@ -1394,4 +1492,4 @@
 }
 
 #endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
-#endif // defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)
\ No newline at end of file
+#endif // defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)
diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
index 5a732b4..606af2e 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -574,62 +574,25 @@
 #endif /* WEIGHTS_OFFSET != 0 */
 
 #if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
-#define DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) \
-    ({                                                                                                             \
-        ARM_DOT((uchar4)(val0.s0, val1.s0, val2.s0, val3.s0), (uchar4)(w0.s0, w1.s0, w2.s0, w3.s0), acc.s0);       \
-        ARM_DOT((uchar4)(val4.s0, val5.s0, val6.s0, val7.s0), (uchar4)(w4.s0, w5.s0, w6.s0, w7.s0), acc.s0);       \
-        acc.s0 += val8.s0 * w8.s0;                                                                                 \
-        \
-        ARM_DOT((uchar4)(val0.s1, val1.s1, val2.s1, val3.s1), (uchar4)(w0.s1, w1.s1, w2.s1, w3.s1), acc.s1);       \
-        ARM_DOT((uchar4)(val4.s1, val5.s1, val6.s1, val7.s1), (uchar4)(w4.s1, w5.s1, w6.s1, w7.s1), acc.s1);       \
-        acc.s1 += val8.s1 * w8.s1;                                                                                 \
-        \
-        ARM_DOT((uchar4)(val0.s2, val1.s2, val2.s2, val3.s2), (uchar4)(w0.s2, w1.s2, w2.s2, w3.s2), acc.s2);       \
-        ARM_DOT((uchar4)(val4.s2, val5.s2, val6.s2, val7.s2), (uchar4)(w4.s2, w5.s2, w6.s2, w7.s2), acc.s2);       \
-        acc.s2 += val8.s2 * w8.s2;                                                                                 \
-        \
-        ARM_DOT((uchar4)(val0.s3, val1.s3, val2.s3, val3.s3), (uchar4)(w0.s3, w1.s3, w2.s3, w3.s3), acc.s3);       \
-        ARM_DOT((uchar4)(val4.s3, val5.s3, val6.s3, val7.s3), (uchar4)(w4.s3, w5.s3, w6.s3, w7.s3), acc.s3);       \
-        acc.s3 += val8.s3 * w8.s3;                                                                                 \
+#define DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1) \
+    ({                                                                                 \
+        ARM_DOT((uchar4)(val0, val1, val2, val3), w0.s0123, acc);                      \
+        ARM_DOT((uchar4)(val4, val5, val6, val7), w0.s4567, acc);                      \
+        acc += val8 * w1;                                                              \
     })
 
-#if WEIGHTS_OFFSET != 0
-#define DOT_PRODUCT_ACCUMULATE(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) \
-    ({                                                                                                                        \
-        ARM_DOT((uchar4)(w0.s0, w1.s0, w2.s0, w3.s0), (uchar4)(val0.s0, val1.s0, val2.s0, val3.s0), acc.s0);                  \
-        ARM_DOT((uchar4)(w4.s0, w5.s0, w6.s0, w7.s0), (uchar4)(val4.s0, val5.s0, val6.s0, val7.s0), acc.s0);                  \
-        ARM_DOT((uchar4)(w8.s0, 0, 0, 0), (uchar4)val8.s0, acc.s0);                                                           \
-        \
-        ARM_DOT((uchar4)(w0.s1, w1.s1, w2.s1, w3.s1), (uchar4)(val0.s1, val1.s1, val2.s1, val3.s1), acc.s1);                  \
-        ARM_DOT((uchar4)(w4.s1, w5.s1, w6.s1, w7.s1), (uchar4)(val4.s1, val5.s1, val6.s1, val7.s1), acc.s1);                  \
-        ARM_DOT((uchar4)(w8.s1, 0, 0, 0), (uchar4)val8.s1, acc.s1);                                                           \
-        \
-        ARM_DOT((uchar4)(w0.s2, w1.s2, w2.s2, w3.s2), (uchar4)(val0.s2, val1.s2, val2.s2, val3.s2), acc.s2);                  \
-        ARM_DOT((uchar4)(w4.s2, w5.s2, w6.s2, w7.s2), (uchar4)(val4.s2, val5.s2, val6.s2, val7.s2), acc.s2);                  \
-        ARM_DOT((uchar4)(w8.s2, 0, 0, 0), (uchar4)val8.s2, acc.s2);                                                           \
-        \
-        ARM_DOT((uchar4)(w0.s3, w1.s3, w2.s3, w3.s3), (uchar4)(val0.s3, val1.s3, val2.s3, val3.s3), acc.s3);                  \
-        ARM_DOT((uchar4)(w4.s3, w5.s3, w6.s3, w7.s3), (uchar4)(val4.s3, val5.s3, val6.s3, val7.s3), acc.s3);                  \
-        ARM_DOT((uchar4)(w8.s3, 0, 0, 0), (uchar4)val8.s3, acc.s3);                                                           \
-    })
-#else /* WEIGHTS_OFFSET != 0 */
-#define DOT_PRODUCT_ACCUMULATE(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8)
-#endif /* WEIGHTS_OFFSET != 0 */
-
 #define DOT_PRODUCT_REDUCTION(sum, val0, val1, val2, val3, val4, val5, val6, val7, val8) \
     ({                                                                                   \
-        sum = CONVERT(val0, VEC_INT);                                                    \
-        ARM_DOT((uchar4)(val1.s0, val2.s0, val3.s0, val4.s0), (uchar4)1, sum.s0);        \
-        ARM_DOT((uchar4)(val5.s0, val6.s0, val7.s0, val8.s0), (uchar4)1, sum.s0);        \
-        \
-        ARM_DOT((uchar4)(val1.s1, val2.s1, val3.s1, val4.s1), (uchar4)1, sum.s1);        \
-        ARM_DOT((uchar4)(val5.s1, val6.s1, val7.s1, val8.s1), (uchar4)1, sum.s1);        \
-        \
-        ARM_DOT((uchar4)(val1.s2, val2.s2, val3.s2, val4.s2), (uchar4)1, sum.s2);        \
-        ARM_DOT((uchar4)(val5.s2, val6.s2, val7.s2, val8.s2), (uchar4)1, sum.s2);        \
-        \
-        ARM_DOT((uchar4)(val1.s3, val2.s3, val3.s3, val4.s3), (uchar4)1, sum.s3);        \
-        ARM_DOT((uchar4)(val5.s3, val6.s3, val7.s3, val8.s3), (uchar4)1, sum.s3);        \
+        sum = val0;                                                                      \
+        ARM_DOT((uchar4)(val1, val2, val3, val4), (uchar4)1, sum);                       \
+        ARM_DOT((uchar4)(val5, val6, val7, val8), (uchar4)1, sum);                       \
+    })
+
+#define DOT_PRODUCT_REDUCTION_WEIGHTS(sum, w0, w1) \
+    ({                                             \
+        sum = w1;                                  \
+        ARM_DOT(w0.s0123, (uchar4)1, sum);         \
+        ARM_DOT(w0.s4567, (uchar4)1, sum);         \
     })
 
 #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
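For context, ARM_DOT(a, b, acc) accumulates the four-way 8-bit dot product a.s0*b.s0 + a.s1*b.s1 + a.s2*b.s2 + a.s3*b.s3 into acc (cl_arm_integer_dot_product_int8). With that, the simplified DOT_PRODUCT above reduces one 3x3 tap per channel lane roughly as:

    // Informal expansion of DOT_PRODUCT(acc, val0..val8, w0, w1) for a single lane
    acc += val0 * w0.s0 + val1 * w0.s1 + val2 * w0.s2 + val3 * w0.s3; // first ARM_DOT
    acc += val4 * w0.s4 + val5 * w0.s5 + val6 * w0.s6 + val7 * w0.s7; // second ARM_DOT
    acc += val8 * w1;                                                 // ninth tap handled separately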
@@ -637,6 +600,7 @@
 #if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
 /** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width or height is not 1.
  *
+ * @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel.
  * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
  * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)
  * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
@@ -664,13 +628,11 @@
  * @param[in] dst_stride_w                          Stride of the destination tensor in W dimension (in bytes)
  * @param[in] dst_step_w                            dst_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
- * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_ptr                           Pointer to the weights tensor reshaped. Supported data types: same as @p src_ptr
  * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
  * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
  * @param[in] weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z                        weights_stride_z * number of elements along Y processed per workitem(in bytes)
  * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
  * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
  * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
@@ -681,7 +643,7 @@
 __kernel void depthwise_convolution_3x3_quantized_nhwc(
     TENSOR4D_DECLARATION(src),
     TENSOR4D_DECLARATION(dst),
-    TENSOR3D_DECLARATION(weights),
+    IMAGE_DECLARATION(weights),
 #if defined(HAS_BIAS)
     VECTOR_DECLARATION(biases),
 #endif /* defined(HAS_BIAS) */
@@ -692,11 +654,11 @@
 #if defined(DST_DEPTH)
     int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
     int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else  /* defined(DST_DEPTH) */
+#else // defined(DST_DEPTH)
     int z = get_global_id(2); // spatial coordinate y
-#endif /* defined(DST_DEPTH) */
+#endif // defined(DST_DEPTH)
 
-    Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
+    __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;
 
 #if defined(DST_DEPTH)
     __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;
@@ -716,19 +678,19 @@
 
     int4 y_offset = convert_int4(y_coord * (int)src_stride_y);
 
-    // We compute 4x1x1 [C,W,H] elements
+    // We compute VEC_SIZEx1x1 [C,W,H] elements
     VEC_INT acc = 0, sum = 0;
 
     // Load weights
-    VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z);
-    VEC_UCHAR w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z);
-    VEC_UCHAR w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z);
-    VEC_UCHAR w3 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z);
-    VEC_UCHAR w4 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z);
-    VEC_UCHAR w5 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z);
-    VEC_UCHAR w6 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z);
-    VEC_UCHAR w7 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z);
-    VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z);
+    VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights_addr + 0);
+    VEC_UCHAR w1 = VLOAD(VEC_SIZE)(0, weights_addr + VEC_SIZE);
+    VEC_UCHAR w2 = VLOAD(VEC_SIZE)(0, weights_addr + 2 * VEC_SIZE);
+    VEC_UCHAR w3 = VLOAD(VEC_SIZE)(0, weights_addr + 3 * VEC_SIZE);
+    VEC_UCHAR w4 = VLOAD(VEC_SIZE)(0, weights_addr + 4 * VEC_SIZE);
+    VEC_UCHAR w5 = VLOAD(VEC_SIZE)(0, weights_addr + 5 * VEC_SIZE);
+    VEC_UCHAR w6 = VLOAD(VEC_SIZE)(0, weights_addr + 6 * VEC_SIZE);
+    VEC_UCHAR w7 = VLOAD(VEC_SIZE)(0, weights_addr + 7 * VEC_SIZE);
+    VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights_addr + 8 * VEC_SIZE);
 
 #if INPUT_OFFSET != 0
     VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)
@@ -824,8 +786,9 @@
 #endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
 
 #if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
-/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1
+/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1.
  *
+ * @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel.
  * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
  * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)
  * @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (i.e. -DNUM_ROWS_PROCESSED=2)
@@ -858,8 +821,6 @@
  * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
  * @param[in] weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z                        weights_stride_z * number of elements along Y processed per workitem(in bytes)
  * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
  * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
  * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
@@ -871,7 +832,7 @@
 __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
     TENSOR4D_DECLARATION(src),
     TENSOR4D_DECLARATION(dst),
-    TENSOR3D_DECLARATION(weights),
+    IMAGE_DECLARATION(weights),
 #if defined(HAS_BIAS)
     VECTOR_DECLARATION(biases),
 #endif /* defined(HAS_BIAS) */
@@ -882,11 +843,11 @@
 #if defined(DST_DEPTH)
     int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
     int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else  /* defined(DST_DEPTH) */
+#else // defined(DST_DEPTH)
     int z = get_global_id(2); // spatial coordinate y
-#endif /* defined(DST_DEPTH) */
+#endif // defined(DST_DEPTH)
 
-    Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
+    __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;
 
 #if defined(DST_DEPTH)
     __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;
@@ -913,15 +874,15 @@
     VEC_INT acc3 = 0, sum3 = 0;
 
     // Load weights
-    VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z);
-    VEC_UCHAR w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z);
-    VEC_UCHAR w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z);
-    VEC_UCHAR w3 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z);
-    VEC_UCHAR w4 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z);
-    VEC_UCHAR w5 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z);
-    VEC_UCHAR w6 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z);
-    VEC_UCHAR w7 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z);
-    VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z);
+    VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights_addr + 0);
+    VEC_UCHAR w1 = VLOAD(VEC_SIZE)(0, weights_addr + VEC_SIZE);
+    VEC_UCHAR w2 = VLOAD(VEC_SIZE)(0, weights_addr + 2 * VEC_SIZE);
+    VEC_UCHAR w3 = VLOAD(VEC_SIZE)(0, weights_addr + 3 * VEC_SIZE);
+    VEC_UCHAR w4 = VLOAD(VEC_SIZE)(0, weights_addr + 4 * VEC_SIZE);
+    VEC_UCHAR w5 = VLOAD(VEC_SIZE)(0, weights_addr + 5 * VEC_SIZE);
+    VEC_UCHAR w6 = VLOAD(VEC_SIZE)(0, weights_addr + 6 * VEC_SIZE);
+    VEC_UCHAR w7 = VLOAD(VEC_SIZE)(0, weights_addr + 7 * VEC_SIZE);
+    VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights_addr + 8 * VEC_SIZE);
 
 #if INPUT_OFFSET != 0
     VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)
@@ -1103,9 +1064,11 @@
     }
 }
 
-#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
-/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1 using dot product
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) && VEC_SIZE == 4
+/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1 using dot product.
  *
+ * @note This kernel assumes VEC_SIZE is 4.
+ * @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel.
  * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
  * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)
  * @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (i.e. -DNUM_ROWS_PROCESSED=2)
@@ -1140,8 +1103,6 @@
  * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
  * @param[in] weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z                        weights_stride_z * number of elements along Y processed per workitem(in bytes)
  * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
  * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: QASYMM8
  * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
@@ -1149,11 +1110,10 @@
  * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
  * @param[in] max_offset                            The maximum allowed offset for the input tensor
  */
-
 __kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1(
     TENSOR4D_DECLARATION(src),
     TENSOR4D_DECLARATION(dst),
-    TENSOR3D_DECLARATION(weights),
+    IMAGE_DECLARATION(weights),
 #if defined(HAS_BIAS)
     VECTOR_DECLARATION(biases),
 #endif // defined(HAS_BIAS)
@@ -1164,11 +1124,11 @@
 #if defined(DST_DEPTH)
     int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
     int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else /* defined(DST_DEPTH) */
+#else // defined(DST_DEPTH)
     int z = get_global_id(2); // spatial coordinate y
-#endif /* defined(DST_DEPTH) */
+#endif // defined(DST_DEPTH)
 
-    Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
+    __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;
 
 #if defined(DST_DEPTH)
     __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;
@@ -1195,19 +1155,16 @@
     VEC_INT sum1 = 0;
 
     // Load weights
-    VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z);
-    VEC_UCHAR w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z);
-    VEC_UCHAR w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z);
-    VEC_UCHAR w3 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z);
-    VEC_UCHAR w4 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z);
-    VEC_UCHAR w5 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z);
-    VEC_UCHAR w6 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z);
-    VEC_UCHAR w7 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z);
-    VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z);
+    uchar16 w0 = VLOAD(16)(0, weights_addr);
+    uchar16 w1 = VLOAD(16)(0, weights_addr + 16);
+    uchar4  w2 = VLOAD(4)(0, weights_addr + 32);
 
 #if INPUT_OFFSET != 0
     // Initialize the final result with the weights reduction multiplied by INPUT_OFFSET
-    DOT_PRODUCT_REDUCTION(acc0, w0, w1, w2, w3, w4, w5, w6, w7, w8);
+    DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s0, w0.s01234567, w0.s8);
+    DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s1, (uchar8)((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1);
+    DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s2, w1.s23456789, w1.sA);
+    DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s3, (uchar8)((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3);
 
     // Multiply the weights reduction with INPUT_OFFSET
     acc0 = INPUT_OFFSET * acc0;
@@ -1250,11 +1207,25 @@
     VEC_UCHAR values10 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
     VEC_UCHAR values11 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);
 
-    DOT_PRODUCT_REDUCTION(sum0, values0, values1, values2, values4, values5, values6, values8, values9, values10);
-    DOT_PRODUCT_ACCUMULATE(acc0, values0, values1, values2, values4, values5, values6, values8, values9, values10, w0, w1, w2, w3, w4, w5, w6, w7, w8);
+    DOT_PRODUCT_REDUCTION(sum0.s0, values0.s0, values1.s0, values2.s0, values4.s0, values5.s0, values6.s0, values8.s0, values9.s0, values10.s0);
+    DOT_PRODUCT_REDUCTION(sum1.s0, values1.s0, values2.s0, values3.s0, values5.s0, values6.s0, values7.s0, values9.s0, values10.s0, values11.s0);
+    DOT_PRODUCT(acc0.s0, values0.s0, values1.s0, values2.s0, values4.s0, values5.s0, values6.s0, values8.s0, values9.s0, values10.s0, w0.s01234567, w0.s8);
+    DOT_PRODUCT(acc1.s0, values1.s0, values2.s0, values3.s0, values5.s0, values6.s0, values7.s0, values9.s0, values10.s0, values11.s0, w0.s01234567, w0.s8);
 
-    DOT_PRODUCT_REDUCTION(sum1, values1, values2, values3, values5, values6, values7, values9, values10, values11);
-    DOT_PRODUCT_ACCUMULATE(acc1, values1, values2, values3, values5, values6, values7, values9, values10, values11, w0, w1, w2, w3, w4, w5, w6, w7, w8);
+    DOT_PRODUCT_REDUCTION(sum0.s1, values0.s1, values1.s1, values2.s1, values4.s1, values5.s1, values6.s1, values8.s1, values9.s1, values10.s1);
+    DOT_PRODUCT_REDUCTION(sum1.s1, values1.s1, values2.s1, values3.s1, values5.s1, values6.s1, values7.s1, values9.s1, values10.s1, values11.s1);
+    DOT_PRODUCT(acc0.s1, values0.s1, values1.s1, values2.s1, values4.s1, values5.s1, values6.s1, values8.s1, values9.s1, values10.s1, (uchar8)((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1);
+    DOT_PRODUCT(acc1.s1, values1.s1, values2.s1, values3.s1, values5.s1, values6.s1, values7.s1, values9.s1, values10.s1, values11.s1, (uchar8)((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1);
+
+    DOT_PRODUCT_REDUCTION(sum0.s2, values0.s2, values1.s2, values2.s2, values4.s2, values5.s2, values6.s2, values8.s2, values9.s2, values10.s2);
+    DOT_PRODUCT_REDUCTION(sum1.s2, values1.s2, values2.s2, values3.s2, values5.s2, values6.s2, values7.s2, values9.s2, values10.s2, values11.s2);
+    DOT_PRODUCT(acc0.s2, values0.s2, values1.s2, values2.s2, values4.s2, values5.s2, values6.s2, values8.s2, values9.s2, values10.s2, w1.s23456789, w1.sA);
+    DOT_PRODUCT(acc1.s2, values1.s2, values2.s2, values3.s2, values5.s2, values6.s2, values7.s2, values9.s2, values10.s2, values11.s2, w1.s23456789, w1.sA);
+
+    DOT_PRODUCT_REDUCTION(sum0.s3, values0.s3, values1.s3, values2.s3, values4.s3, values5.s3, values6.s3, values8.s3, values9.s3, values10.s3);
+    DOT_PRODUCT_REDUCTION(sum1.s3, values1.s3, values2.s3, values3.s3, values5.s3, values6.s3, values7.s3, values9.s3, values10.s3, values11.s3);
+    DOT_PRODUCT(acc0.s3, values0.s3, values1.s3, values2.s3, values4.s3, values5.s3, values6.s3, values8.s3, values9.s3, values10.s3, (uchar8)((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3);
+    DOT_PRODUCT(acc1.s3, values1.s3, values2.s3, values3.s3, values5.s3, values6.s3, values7.s3, values9.s3, values10.s3, values11.s3, (uchar8)((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3);
 
 #if defined(HAS_BIAS)
     Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
@@ -1308,8 +1279,7 @@
     VSTORE(VEC_SIZE)
     (ACTIVATION_FUNC(res1), 0, dst_addr + 1 * dst_stride_y);
 }
-
-#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) && VEC_SIZE==4
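The three flat loads in the dot-product kernel above (w0 and w1 as uchar16, w2 as uchar4) rely on the TRANSPOSE layout written by depthwise_convolution_reshape_weights with VEC_SIZE=4: each 36-byte weights row is channel-major. An editorial breakdown of how the kernel slices it:

    // bytes  0..8   -> channel 0: w0.s01234567 plus w0.s8
    // bytes  9..17  -> channel 1: (w0.s9ABC, w0.sDEF, w1.s0) plus w1.s1
    // bytes 18..26  -> channel 2: w1.s23456789 plus w1.sA
    // bytes 27..35  -> channel 3: (w1.sBCD, w1.sEF, w2.s012) plus w2.s3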
 
 #endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
 
diff --git a/src/core/CL/cl_kernels/elementwise_operation.cl b/src/core/CL/cl_kernels/elementwise_operation.cl
new file mode 100644
index 0000000..00d7ed3
--- /dev/null
+++ b/src/core/CL/cl_kernels/elementwise_operation.cl
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** List of all the operations supported by this kernel.
+ * @note ADD and SUB operations, when executed on integers, support saturation */
+#ifdef SATURATE
+#define ADD(x, y) add_sat((x), (y))
+#define SUB(x, y) sub_sat((x), (y))
+#else /* SATURATE */
+#define ADD(x, y) (x) + (y)
+#define SUB(x, y) (x) - (y)
+#endif /* SATURATE */
+
+#define MAX(x, y) max(x, y)
+#define MIN(x, y) min(x, y)
+#define SQUARED_DIFF(x, y) (x - y) * (x - y)
+#define DIV(x, y) (x / y)
+
+#define OP_FUN_NAME_STR(op) elementwise_operation_##op
+#define OP_FUN_NAME(op) OP_FUN_NAME_STR(op)
+
+#if defined(OP) && defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE)
+/** This function executes an element-wise operation among two tensors.
+ *
+ * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short
+ * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention The element-wise operation to be executed has to be passed at compile time using -DOP (e.g., -DOP=ADD)
+ *
+ * @param[in]  in1_ptr                           Pointer to the source tensor. Supported data types: U8/S16/F16/F32
+ * @param[in]  in1_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in1_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in1_step_z                        in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  in2_ptr                           Pointer to the source tensor. Supported data types: U8/S16/F16/F32
+ * @param[in]  in2_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in2_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in2_step_z                        in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: U8 (only if both inputs are U8), S16/F16/F32
+ * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void OP_FUN_NAME(OP)(
+    TENSOR3D_DECLARATION(in1),
+    TENSOR3D_DECLARATION(in2),
+    TENSOR3D_DECLARATION(out))
+{
+    // Get pixels pointer
+    Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+    Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+    // Load values
+    VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
+    in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
+    VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
+    in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
+
+    // Calculate and store result
+    VSTORE(VEC_SIZE)
+    (OP(in_a, in_b), 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
+#endif /* defined(OP) && defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE) */
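To make the compile-time contract concrete, building this file with, for example, -DOP=ADD -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short -DVEC_SIZE=16 (illustrative options taken from the notes above) instantiates:

    // OP_FUN_NAME(OP) expands to elementwise_operation_ADD, so the compiled kernel is
    __kernel void elementwise_operation_ADD(
        TENSOR3D_DECLARATION(in1),
        TENSOR3D_DECLARATION(in2),
        TENSOR3D_DECLARATION(out));
    // Each work-item widens 16 uchar values from each input to short16, applies ADD
    // (add_sat if -DSATURATE is also defined) and stores 16 short results.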
diff --git a/src/core/CL/cl_kernels/elementwise_operation_quantized.cl b/src/core/CL/cl_kernels/elementwise_operation_quantized.cl
new file mode 100644
index 0000000..1f0533b
--- /dev/null
+++ b/src/core/CL/cl_kernels/elementwise_operation_quantized.cl
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#define SUB(x, y) (x - y)
+#define ADD(x, y) (x + y)
+#define MAX(x, y) max((x), (y))
+#define MIN(x, y) min((x), (y))
+#define SQUARED_DIFF(x, y) (x - y) * (x - y)
+#define DIV(x, y) (x / y)
+
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+
+#define OP_FUN_NAME_STR(op) elementwise_operation_##op##_quantized
+#define OP_FUN_NAME(op) OP_FUN_NAME_STR(op)
+
+#if defined(OP) && defined(VEC_SIZE) && defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT)
+
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE)
+
+/** This function executes an element-wise operation among two tensors.
+ *
+ * @attention The quantization offset of the first operand must be passed at compile time using -DOFFSET_IN1, e.g. -DOFFSET_IN1=10
+ * @attention The quantization offset of the second operand must be passed at compile time using -DOFFSET_IN2, e.g. -DOFFSET_IN2=10
+ * @attention The quantization offset of the output must be passed at compile time using -DOFFSET_OUT, e.g. -DOFFSET_OUT=10
+ * @attention The quantization scale of the first operand must be passed at compile time using -DSCALE_IN1, e.g. -DSCALE_IN1=10
+ * @attention The quantization scale of the second operand must be passed at compile time using -DSCALE_IN2, e.g. -DSCALE_IN2=10
+ * @attention The quantization scale of the output must be passed at compile time using -DSCALE_OUT, e.g. -DSCALE_OUT=10
+ * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention The element-wise operation to be executed has to be passed at compile time using -DOP (e.g., -DOP=ADD)
+ *
+ * @param[in]  in1_ptr                           Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in]  in1_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in1_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in1_step_z                        in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  in2_ptr                           Pointer to the source tensor. Supported data types: same as @p in1_ptr
+ * @param[in]  in2_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in2_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in2_step_z                        in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: same as @p in1_ptr
+ * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void OP_FUN_NAME(OP)(
+    TENSOR3D_DECLARATION(in1),
+    TENSOR3D_DECLARATION(in2),
+    TENSOR3D_DECLARATION(out))
+{
+    // Get pixels pointer
+    Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+    Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+    VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)in1.ptr), VEC_INT);
+    VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)in2.ptr), VEC_INT);
+
+    in_a = SUB(in_a, (VEC_INT)((int)OFFSET_IN1));
+    in_b = SUB(in_b, (VEC_INT)((int)OFFSET_IN2));
+
+    const VEC_FLOAT in1f32  = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1);
+    const VEC_FLOAT in2f32  = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2);
+    const VEC_FLOAT qresf32 = OP(in1f32, in2f32) / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFFSET_OUT));
+    const VEC_UCHAR res     = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_UCHAR);
+
+    // Store result
+    VSTORE(VEC_SIZE)
+    (res, 0, (__global uchar *)out.ptr);
+}
+#endif /* defined(OP) && defined(VEC_SIZE) && defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) */
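The kernel above implements a dequantize, operate, requantize flow: both QASYMM8 operands are shifted by their offsets and scaled to float, the selected OP is applied, and the result is rescaled, rounded to nearest even and saturated back to uchar. A scalar C sketch of that flow for the ADD case (function name, clamping and values are illustrative, not part of this patch):

    #include <math.h>
    #include <stdint.h>

    // Scalar reference of the dequantize -> add -> requantize flow for one element pair
    static uint8_t elementwise_add_qasymm8_ref(uint8_t a, uint8_t b,
                                               float scale_in1, int offset_in1,
                                               float scale_in2, int offset_in2,
                                               float scale_out, int offset_out)
    {
        const float af  = (float)((int)a - offset_in1) * scale_in1;  // dequantize operand 1
        const float bf  = (float)((int)b - offset_in2) * scale_in2;  // dequantize operand 2
        const float res = (af + bf) / scale_out + (float)offset_out; // requantize the sum
        const float r   = nearbyintf(res);                           // round to nearest even, like convert_int_rte
        return (uint8_t)(r < 0.f ? 0.f : (r > 255.f ? 255.f : r));   // saturate, like CONVERT_SAT to uchar
    }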
diff --git a/src/core/CL/cl_kernels/elementwise_unary.cl b/src/core/CL/cl_kernels/elementwise_unary.cl
new file mode 100644
index 0000000..92db569
--- /dev/null
+++ b/src/core/CL/cl_kernels/elementwise_unary.cl
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "warp_helpers.h"
+
+#if defined(DATA_TYPE) && defined(OPERATION)
+
+#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+/** Calculate the inverse square root
+ *
+ * @param[in] input Input vector of elements.
+ *
+ * @return the inverse square root
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) inverse_sqrt(const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) input)
+{
+    return rsqrt(input);
+}
+
+/** Calculate exponential
+ *
+ * @param[in] input Input vector of elements.
+ *
+ * @return exponential
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) exponential(const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) input)
+{
+    return exp(input);
+}
+#else  // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
+/** Calculate the inverse square root
+ *
+ * @param[in] input Single element.
+ *
+ * @return the inverse square root
+ */
+inline DATA_TYPE inverse_sqrt(const DATA_TYPE input)
+{
+    return rsqrt(input);
+}
+
+/** Calculate exponential
+ *
+ * @param[in] input Single element.
+ *
+ * @return exponential
+ */
+inline DATA_TYPE exponential(const DATA_TYPE input)
+{
+    return exp(input);
+}
+#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+
+/** Applies an element-wise unary operator to a tensor.
+ *
+ * @param[in]  in_ptr                            Pointer to the source image. Supported data types: F16/32.
+ * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  Offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: F16/32.
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  out_offset_first_element_in_bytes Offset of the first element in the destination image
+ */
+__kernel void elementwise_unary(
+    VECTOR_DECLARATION(in),
+    VECTOR_DECLARATION(out))
+{
+    Vector in  = CONVERT_TO_VECTOR_STRUCT(in);
+    Vector out = CONVERT_TO_VECTOR_STRUCT(out);
+
+#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+    // Check if access on width gets out of bounds
+    // If it does shift access vector to access elements within bounds
+    const int xi = (int)(get_global_id(0) * VEC_SIZE);
+    in.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * in_stride_x;
+    out.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * out_stride_x;
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
+
+    VSTORE(VEC_SIZE)
+    (OPERATION(data), 0, (__global DATA_TYPE *)out.ptr);
+#else  // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
+    *((__global DATA_TYPE *)(out.ptr)) = (DATA_TYPE)(OPERATION(*((__global DATA_TYPE *)in.ptr)));
+#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+}
+#endif // defined(DATA_TYPE) && defined(OPERATION)
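The VEC_SIZE / LAST_ACCESSED_X branch above keeps the last vector access in bounds by shifting the final work item back so it re-processes a few elements instead of reading or writing past the buffer. A small C sketch of that indexing, assuming LAST_ACCESSED_X = max(width - VEC_SIZE, 0) as set by the host (the width used here is illustrative):

    #include <stdio.h>

    int main(void)
    {
        const int vec_size        = 16;
        const int width           = 70; // illustrative tensor width
        const int last_accessed_x = (width - vec_size > 0) ? (width - vec_size) : 0;

        for(int gid = 0; gid * vec_size < width; ++gid)
        {
            const int xi    = gid * vec_size;
            const int shift = (xi - last_accessed_x > 0) ? (xi - last_accessed_x) : 0;
            // The last work item starts at 54 instead of 64 and stays within [0, 70)
            printf("work item %d processes elements [%d, %d)\n", gid, xi - shift, xi - shift + vec_size);
        }
        return 0;
    }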
diff --git a/src/core/CL/cl_kernels/gather.cl b/src/core/CL/cl_kernels/gather.cl
new file mode 100644
index 0000000..d6fe52d
--- /dev/null
+++ b/src/core/CL/cl_kernels/gather.cl
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(AXIS)
+
+/** Performs the Gather operation along the chosen axis
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Axis should be given as a preprocessor argument using -DAXIS=axis. e.g. -DAXIS=1
+ * @attention Output tensor depth should be given as a preprocessor argument using -DOUTPUT_DIM_Z=size. e.g. -DOUTPUT_DIM_Z=16
+ * @attention Input tensor depth should be given as a preprocessor argument using -DINPUT_DIM_Z=size. e.g. -DINPUT_DIM_Z=16
+ *
+ *
+ * @param[in]  input_ptr                             Pointer to the source tensor. Supported data types: U8/S8/U16/S16/U32/S32/F16/F32
+ * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                          input_stride_z * number of elements along Z processed per work item (in bytes)
+ * @param[in]  input_stride_w                        Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                          input_stride_w * number of elements along W processed per work item (in bytes)
+ * @param[in]  input_offset_first_element_in_bytes   Offset of the first element in the source tensor
+ * @param[in]  indices_ptr                           Pointer to the indices vector. Supported data types: S32/U32.
+ * @param[in]  indices_stride_x                      Stride of the indices vector in X dimension (in bytes)
+ * @param[in]  indices_step_x                        indices_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  indices_offset_first_element_in_bytes Offset of the first element in the indices vector
+ * @param[out] output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per work item (in bytes)
+ * @param[in]  output_stride_w                       Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                         output_stride_w * number of elements along W processed per work item (in bytes)
+ * @param[in]  output_offset_first_element_in_bytes  Offset of the first element in the destination tensor
+ */
+__kernel void gather(
+    TENSOR4D_DECLARATION(input),
+    VECTOR_DECLARATION(indices),
+    TENSOR4D_DECLARATION(output))
+{
+    const int px = get_global_id(0);
+    const int py = get_global_id(1);
+    const int pz = get_global_id(2) % OUTPUT_DIM_Z;
+    const int pw = get_global_id(2) / OUTPUT_DIM_Z;
+
+    const Tensor4D input   = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, INPUT_DIM_Z);
+    const Vector   indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(indices);
+    Tensor4D       output  = CONVERT_TO_TENSOR4D_STRUCT(output, OUTPUT_DIM_Z);
+
+#if AXIS == 0
+    const uint index                 = *(__global const uint *)vector_offset(&indices, px);
+    __global const uchar *input_addr = tensor4D_offset(&input, index, py, pz, pw);
+#elif AXIS == 1
+    const uint index                 = *(__global const uint *)vector_offset(&indices, py);
+    __global const uchar *input_addr = tensor4D_offset(&input, px, index, pz, pw);
+#elif AXIS == 2
+    const uint index                 = *(__global const uint *)vector_offset(&indices, pz);
+    __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, pw);
+#elif AXIS == 3
+    const uint index                 = *(__global const uint *)vector_offset(&indices, pw);
+    __global const uchar *input_addr = tensor4D_offset(&input, px, py, pz, index);
+#endif //AXIS
+
+    *(__global DATA_TYPE *)output.ptr = *((__global const DATA_TYPE *)input_addr);
+}
+
+#endif //defined(DATA_TYPE) && defined(AXIS)
\ No newline at end of file
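For reference, the gather semantics above reduce, for a 2D input and AXIS == 0, to the host-side sketch below: the output coordinate along the chosen axis selects an entry of the indices vector, and all other coordinates pass through unchanged (names and the 2D restriction are illustrative; the kernel generalises this to 4D tensors):

    // Gather along axis 0 (the X dimension) of a 2D tensor stored row-major
    void gather_axis0_ref(const float *in, int in_w, int in_h,
                          const unsigned int *indices, int num_indices,
                          float *out /* num_indices x in_h elements */)
    {
        for(int y = 0; y < in_h; ++y)
        {
            for(int x = 0; x < num_indices; ++x)
            {
                // Output column x reads input column indices[x] of the same row
                out[y * num_indices + x] = in[y * in_w + indices[x]];
            }
        }
    }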
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index 7de15d0..4736f80 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,6 +22,1576 @@
  * SOFTWARE.
  */
 #include "helpers.h"
+#include "repeat.h"
+
+#if defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH)
+#define INC2 (VEC_DATA_TYPE(uint, 2))(0, 1)
+#define INC3 (VEC_DATA_TYPE(uint, 3))(0, 1, 2)
+#define INC4 (VEC_DATA_TYPE(uint, 4))(0, 1, 2, 3)
+#define INC8 (VEC_DATA_TYPE(uint, 8))(0, 1, 2, 3, 4, 5, 6, 7)
+#define INC16 (VEC_DATA_TYPE(uint, 16))(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+#define CONCAT_INC(K0) INC##K0
+#define INC(K0) CONCAT_INC(K0)
+
+#if(SRC_WIDTH % K0)
+#define BOUNDARY_CONDITION_X(x, a)                                                                                                                   \
+    ({                                                                                                                                               \
+        a = select(0, a, CONVERT(((x * (VEC_DATA_TYPE(uint, K0))K0 + INC(K0)) < (VEC_DATA_TYPE(uint, K0))SRC_WIDTH), VEC_DATA_TYPE(DATA_TYPE, K0))); \
+    })
+#else // (SRC_WIDTH % K0)
+#define BOUNDARY_CONDITION_X(x, a) \
+    ({})
+#endif // (SRC_WIDTH % K0)
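When SRC_WIDTH is not a multiple of K0, BOUNDARY_CONDITION_X above uses select() to zero the lanes whose source column falls past the end of the row, so the reshaped output is zero-padded rather than keeping whatever was loaded out of range. A scalar C sketch of the same masking (hypothetical helper, parameter names chosen for illustration):

    // Zero the lanes of an already-loaded K0-wide row that map past src_width
    void zero_out_of_bounds_lanes(float *a /* k0 lanes */, unsigned int x, unsigned int k0, unsigned int src_width)
    {
        for(unsigned int i = 0; i < k0; ++i)
        {
            const unsigned int col = x * k0 + i; // source column this lane was loaded from
            a[i] = (col < src_width) ? a[i] : 0.0f;
        }
    }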
+
+/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix into blocks of size M0xK0 and stores each one (not transposed) in
+ *  the output matrix, unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=float)
+ * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (i.e. -DSRC_WIDTH=16)
+ * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (i.e. -DM0=2, -DK0=2).
+ * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (i.e. -DV0=2)
+ * @note Only the following values for M0, K0 and V0 are supported:
+ *                                      M0: 2,3,4,5,6,7,8
+ *                                      K0: 2,3,4,8,16
+ *                                      V0: greater than 0
+ * @note In case the input has to be reinterpreted as a 3D tensor (i.e. input of convolution layer 1x1), the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
+ *
+ * @param[in]  src_ptr                           Pointer to the source LHS tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_stride_x                      Stride of the source LHS tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source LHS tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source LHS tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor
+ * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in]  cross_plane_pad                   (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ */
+__kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_DECLARATION(src),
+                                         TENSOR3D_DECLARATION(dst)
+#if defined(REINTERPRET_INPUT_AS_3D)
+                                         ,
+                                         uint cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+                                        )
+{
+    // Block size
+#define BLOCK_SIZE ((M0) * (K0))
+
+    // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (K0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+    // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X (K0) * (V0)
+#else // Do not interleave
+#define OUTPUT_STEP_X (K0)
+#endif // defined(INTERLEAVE)
+
+    // Compute source and destination addresses
+    uint x = get_global_id(0);
+    uint y = get_global_id(1);
+    uint z = get_global_id(2);
+
+    // ------------------ Compute input/output addresses ---------------------------
+
+    // Compute the input address
+    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;
+
+    // Compute the output address
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % V0) *
+                                 (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));
+
+    // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply src_stride_z by DEPTH_GEMM3D
+
+    // Note for the REINTERPRET_INPUT_AS_3D case
+    // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;
+
+    // The plane (zin) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+    zin0 = (0 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
+    zin0 = min((uint)(DEPTH_GEMM3D - 1), zin0);
+    zin0 *= (cross_plane_pad * src_stride_y);
+#if M0 > 1
+    zin1 = (1 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
+    zin1 = min((uint)(DEPTH_GEMM3D - 1), zin1);
+    zin1 *= (cross_plane_pad * src_stride_y);
+#endif // M0 > 1
+#if M0 > 2
+    zin2 = (2 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
+    zin2 = min((uint)(DEPTH_GEMM3D - 1), zin2);
+    zin2 *= (cross_plane_pad * src_stride_y);
+#endif // M0 > 2
+#if M0 > 3
+    zin3 = (3 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
+    zin3 = min((uint)(DEPTH_GEMM3D - 1), zin3);
+    zin3 *= (cross_plane_pad * src_stride_y);
+#endif // M0 > 3
+#if M0 > 4
+    zin4 = (4 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
+    zin4 = min((uint)(DEPTH_GEMM3D - 1), zin4);
+    zin4 *= (cross_plane_pad * src_stride_y);
+#endif // M0 > 4
+#if M0 > 5
+    zin5 = (5 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
+    zin5 = min((uint)(DEPTH_GEMM3D - 1), zin5);
+    zin5 *= (cross_plane_pad * src_stride_y);
+#endif // M0 > 5
+#if M0 > 6
+    zin6 = (6 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
+    zin6 = min((uint)(DEPTH_GEMM3D - 1), zin6);
+    zin6 *= (cross_plane_pad * src_stride_y);
+#endif // M0 > 6
+#if M0 > 7
+    zin7 = (7 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
+    zin7 = min((uint)(DEPTH_GEMM3D - 1), zin7);
+    zin7 *= (cross_plane_pad * src_stride_y);
+#endif // M0 > 7
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+    input_ptr += z * (uint)src_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    output_ptr += z * (uint)dst_stride_z;
+
+    // ---------------------------Load input values --------------------------------
+
+    // Load values from the LHS matrix
+    VEC_DATA_TYPE(DATA_TYPE, K0)
+    a0 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y + zin0));
+    BOUNDARY_CONDITION_X(x, a0);
+#if M0 > 1
+    VEC_DATA_TYPE(DATA_TYPE, K0)
+    a1 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y + zin1));
+    BOUNDARY_CONDITION_X(x, a1);
+#endif // M0 > 1
+#if M0 > 2
+    VEC_DATA_TYPE(DATA_TYPE, K0)
+    a2 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y + zin2));
+    BOUNDARY_CONDITION_X(x, a2);
+#endif // M0 > 2
+#if M0 > 3
+    VEC_DATA_TYPE(DATA_TYPE, K0)
+    a3 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y + zin3));
+    BOUNDARY_CONDITION_X(x, a3);
+#endif // M0 > 3
+#if M0 > 4
+    VEC_DATA_TYPE(DATA_TYPE, K0)
+    a4 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y + zin4));
+    BOUNDARY_CONDITION_X(x, a4);
+#endif // M0 > 4
+#if M0 > 5
+    VEC_DATA_TYPE(DATA_TYPE, K0)
+    a5 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y + zin5));
+    BOUNDARY_CONDITION_X(x, a5);
+#endif // M0 > 5
+#if M0 > 6
+    VEC_DATA_TYPE(DATA_TYPE, K0)
+    a6 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y + zin6));
+    BOUNDARY_CONDITION_X(x, a6);
+#endif // M0 > 6
+#if M0 > 7
+    VEC_DATA_TYPE(DATA_TYPE, K0)
+    a7 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y + zin7));
+    BOUNDARY_CONDITION_X(x, a7);
+#endif // M0 > 7
+
+    // ---------------------------Store output values ------------------------------
+
+    VSTORE(K0)
+    (a0, 0, (__global DATA_TYPE *)(output_ptr + 0 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#if M0 > 1
+    VSTORE(K0)
+    (a1, 0, (__global DATA_TYPE *)(output_ptr + 1 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#endif // M0 > 1
+#if M0 > 2
+    VSTORE(K0)
+    (a2, 0, (__global DATA_TYPE *)(output_ptr + 2 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#endif // M0 > 2
+#if M0 > 3
+    VSTORE(K0)
+    (a3, 0, (__global DATA_TYPE *)(output_ptr + 3 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#endif // M0 > 3
+#if M0 > 4
+    VSTORE(K0)
+    (a4, 0, (__global DATA_TYPE *)(output_ptr + 4 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#endif // M0 > 4
+#if M0 > 5
+    VSTORE(K0)
+    (a5, 0, (__global DATA_TYPE *)(output_ptr + 5 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#endif // M0 > 5
+#if M0 > 6
+    VSTORE(K0)
+    (a6, 0, (__global DATA_TYPE *)(output_ptr + 6 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#endif // M0 > 6
+#if M0 > 7
+    VSTORE(K0)
+    (a7, 0, (__global DATA_TYPE *)(output_ptr + 7 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#endif // M0 > 7
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
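Putting the address computation above together: in the non-interleaved case (OUTPUT_OFFSET_X == BLOCK_SIZE, OUTPUT_STEP_X == K0), the source element at row m, column k lands at the offset below, expressed in elements of the reshaped output. A host-side C sketch with illustrative names (dst_row_elems is dst_stride_y expressed in elements):

    #include <stddef.h>

    size_t reshaped_lhs_offset_nt(size_t m, size_t k,
                                  size_t M0, size_t K0, size_t V0,
                                  size_t dst_row_elems)
    {
        const size_t x          = k / K0; // horizontal block index
        const size_t y          = m / M0; // vertical block index
        const size_t block_size = M0 * K0;

        return (y / V0) * dst_row_elems    // destination row
               + x * block_size * V0       // all blocks stored to the left
               + (y % V0) * block_size     // previous vertical blocks packed on this row
               + (m % M0) * K0 + (k % K0); // position inside the M0xK0 block
    }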
+
+#if M0 == 2
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                  \
+    ({                                                                                            \
+        VEC_DATA_TYPE(DATA_TYPE, M0)                                                              \
+        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i);                                   \
+        VSTORE(M0)                                                                                \
+        (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
+    })
+#elif M0 == 3 // M0 == 3
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                  \
+    ({                                                                                            \
+        VEC_DATA_TYPE(DATA_TYPE, M0)                                                              \
+        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i);                          \
+        VSTORE(M0)                                                                                \
+        (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
+    })
+#elif M0 == 4 // M0 == 4
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                  \
+    ({                                                                                            \
+        VEC_DATA_TYPE(DATA_TYPE, M0)                                                              \
+        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i);                 \
+        VSTORE(M0)                                                                                \
+        (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
+    })
+#elif M0 == 5 // M0 == 5
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                      \
+    ({                                                                                                \
+        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                   \
+        res0           = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i);           \
+        DATA_TYPE res1 = a4.s##i;                                                                     \
+        VSTORE(4)                                                                                     \
+        (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)));    \
+        *((__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4) = res1; \
+    })
+#elif M0 == 6 // M0 == 6
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                       \
+    ({                                                                                                 \
+        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                    \
+        res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i);                      \
+        VEC_DATA_TYPE(DATA_TYPE, 2)                                                                    \
+        res1 = (VEC_DATA_TYPE(DATA_TYPE, 2))(a4.s##i, a5.s##i);                                        \
+        VSTORE(4)                                                                                      \
+        (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)));     \
+        VSTORE(2)                                                                                      \
+        (res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \
+    })
+#elif M0 == 7 // M0 == 7
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                       \
+    ({                                                                                                 \
+        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                    \
+        res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i);                      \
+        VEC_DATA_TYPE(DATA_TYPE, 3)                                                                    \
+        res1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(a4.s##i, a5.s##i, a6.s##i);                               \
+        VSTORE(4)                                                                                      \
+        (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)));     \
+        VSTORE(3)                                                                                      \
+        (res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \
+    })
+#elif M0 == 8 // M0 == 8
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                                      \
+    ({                                                                                                                \
+        VEC_DATA_TYPE(DATA_TYPE, M0)                                                                                  \
+        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i, a4.s##i, a5.s##i, a6.s##i, a7.s##i); \
+        VSTORE(M0)                                                                                                    \
+        (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)));                     \
+    })
+#else // M0 not supported
+#error "M0 value not supported"
+#endif // M0 conditions
+
+/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix into blocks of size M0xK0 and stores each one (transposed) in
+ *  the output matrix, unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=float)
+ * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (i.e. -DSRC_WIDTH=16)
+ * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (i.e. -DM0=2, -DK0=2).
+ * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (i.e. -DV0=2)
+ * @note Only the following values for M0, K0 and V0 are supported:
+ *                                      M0: 2,3,4,5,6,7,8
+ *                                      K0: 2,3,4,8,16
+ *                                      V0: greater than 0
+ * @note In case the input has to be reinterpreted as a 3D tensor (i.e. input of convolution layer 1x1), the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
+ *
+ * @param[in]  src_ptr                           Pointer to the source LHS tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_stride_x                      Stride of the source LHS tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source LHS tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source LHS tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor
+ * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in]  cross_plane_pad                   (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ */
+__kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_DECLARATION(src),
+                                        TENSOR3D_DECLARATION(dst)
+#if defined(REINTERPRET_INPUT_AS_3D)
+                                        ,
+                                        uint cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+                                       )
+{
+    // Block size
+#define BLOCK_SIZE ((M0) * (K0))
+
+    // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (M0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+    // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X (M0) * (V0)
+#else // Do not interleave
+#define OUTPUT_STEP_X (M0)
+#endif // defined(INTERLEAVE)
+
+    // Compute source and destination addresses
+    uint x = get_global_id(0);
+    uint y = get_global_id(1);
+    uint z = get_global_id(2);
+
+    // ------------------ Compute input/output addresses ---------------------------
+
+    // Compute the input address
+    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;
+
+    // Compute the output address
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % V0) *
+                                 (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));
+
+    // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply src_stride_z by DEPTH_GEMM3D
+
+    // Note for the REINTERPRET_INPUT_AS_3D case
+    // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;
+
+    // The plane (zin) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+    zin0 = (0 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
+    zin0 = min((uint)(DEPTH_GEMM3D - 1), zin0);
+    zin0 *= (cross_plane_pad * src_stride_y);
+#if M0 > 1
+    zin1 = (1 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
+    zin1 = min((uint)(DEPTH_GEMM3D - 1), zin1);
+    zin1 *= (cross_plane_pad * src_stride_y);
+#endif // M0 > 1
+#if M0 > 2
+    zin2 = (2 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
+    zin2 = min((uint)(DEPTH_GEMM3D - 1), zin2);
+    zin2 *= (cross_plane_pad * src_stride_y);
+#endif // M0 > 2
+#if M0 > 3
+    zin3 = (3 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
+    zin3 = min((uint)(DEPTH_GEMM3D - 1), zin3);
+    zin3 *= (cross_plane_pad * src_stride_y);
+#endif // M0 > 3
+#if M0 > 4
+    zin4 = (4 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
+    zin4 = min((uint)(DEPTH_GEMM3D - 1), zin4);
+    zin4 *= (cross_plane_pad * src_stride_y);
+#endif // M0 > 4
+#if M0 > 5
+    zin5 = (5 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
+    zin5 = min((uint)(DEPTH_GEMM3D - 1), zin5);
+    zin5 *= (cross_plane_pad * src_stride_y);
+#endif // M0 > 5
+#if M0 > 6
+    zin6 = (6 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
+    zin6 = min((uint)(DEPTH_GEMM3D - 1), zin6);
+    zin6 *= (cross_plane_pad * src_stride_y);
+#endif // M0 > 6
+#if M0 > 7
+    zin7 = (7 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
+    zin7 = min((uint)(DEPTH_GEMM3D - 1), zin7);
+    zin7 *= (cross_plane_pad * src_stride_y);
+#endif // M0 > 7
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+    input_ptr += z * (uint)src_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    output_ptr += z * (uint)dst_stride_z;
+
+    // ---------------------------Load input values --------------------------------
+
+    // Load values from the LHS matrix
+    VEC_DATA_TYPE(DATA_TYPE, K0)
+    a0 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y + zin0));
+    BOUNDARY_CONDITION_X(x, a0);
+#if M0 > 1
+    VEC_DATA_TYPE(DATA_TYPE, K0)
+    a1 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y + zin1));
+    BOUNDARY_CONDITION_X(x, a1);
+#endif // M0 > 1
+#if M0 > 2
+    VEC_DATA_TYPE(DATA_TYPE, K0)
+    a2 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y + zin2));
+    BOUNDARY_CONDITION_X(x, a2);
+#endif // M0 > 2
+#if M0 > 3
+    VEC_DATA_TYPE(DATA_TYPE, K0)
+    a3 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y + zin3));
+    BOUNDARY_CONDITION_X(x, a3);
+#endif // M0 > 3
+#if M0 > 4
+    VEC_DATA_TYPE(DATA_TYPE, K0)
+    a4 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y + zin4));
+    BOUNDARY_CONDITION_X(x, a4);
+#endif // M0 > 4
+#if M0 > 5
+    VEC_DATA_TYPE(DATA_TYPE, K0)
+    a5 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y + zin5));
+    BOUNDARY_CONDITION_X(x, a5);
+#endif // M0 > 5
+#if M0 > 6
+    VEC_DATA_TYPE(DATA_TYPE, K0)
+    a6 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y + zin6));
+    BOUNDARY_CONDITION_X(x, a6);
+#endif // M0 > 6
+#if M0 > 7
+    VEC_DATA_TYPE(DATA_TYPE, K0)
+    a7 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y + zin7));
+    BOUNDARY_CONDITION_X(x, a7);
+#endif // M0 > 7
+
+    // ---------------------------Transpose and store block -----------------------
+
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 0);
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 1);
+#if K0 > 2
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 2);
+#endif // K0 > 2
+#if K0 > 3
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 3);
+#endif // K0 > 3
+#if K0 > 4
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 4);
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 5);
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 6);
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 7);
+#endif // K0 > 4
+#if K0 > 8
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 8);
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 9);
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, A);
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, B);
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, C);
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, D);
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, E);
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, F);
+#endif // K0 > 8
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
+#endif // defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH)
+
+#if defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)
+/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix into blocks of size K0xN0 and stores each one (not transposed) in
+ *  the output matrix, unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=float)
+ * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (i.e. -DSRC_HEIGHT=16)
+ * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (i.e. -DK0=2, -DN0=2).
+ * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (i.e. -DH0=2)
+ * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
+ * @note Only the following values for K0, N0 and H0 are supported:
+ *                                      N0: 2,3,4,8,16
+ *                                      K0: 1,2,3,4,8,16
+ *                                      H0: greater than 0
+ *
+ * @param[in]  src_ptr                           Pointer to the source RHS tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_stride_x                      Stride of the source RHS tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source RHS tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source RHS tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor
+ * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_DECLARATION(src),
+                                         TENSOR3D_DECLARATION(dst))
+{
+    // Block size
+#define BLOCK_SIZE ((K0) * (N0))
+
+    // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (N0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+    // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X (N0) * (H0)
+#else // Do not interleave
+#define OUTPUT_STEP_X (N0)
+#endif // defined(INTERLEAVE)
+
+    // Compute source and destination addresses
+    uint x = get_global_id(0);
+    uint y = get_global_id(1);
+    uint z = get_global_id(2);
+
+    // ------------------ Compute input/output addresses ---------------------------
+
+    // Compute the input address
+    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;
+
+    // Compute the output address
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % (uint)H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + ((
+                                     x / (uint)H0)
+                                 * (uint)dst_stride_y)
+                                 + z * (uint)dst_stride_z;
+
+    // ---------------------------Load input values --------------------------------
+
+    REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); // VEC_DATA_TYPE(DATA_TYPE, N0)    a0=0, a1=0, ... a(K0-1)=0;
+
+    // Load values from the RHS matrix
+    a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
+#if K0 > 1
+    if(y * (uint)K0 + 1 < SRC_HEIGHT)
+    {
+        a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
+    }
+#endif // K0 > 1
+#if K0 > 2
+    if(y * (uint)K0 + 2 < SRC_HEIGHT)
+    {
+        a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
+    }
+#endif // K0 > 2
+#if K0 > 3
+    if(y * (uint)K0 + 3 < SRC_HEIGHT)
+    {
+        a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));
+    }
+#endif // K0 > 3
+#if K0 > 4
+    if(y * (uint)K0 + 4 < SRC_HEIGHT)
+    {
+        a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));
+    }
+    if(y * (uint)K0 + 5 < SRC_HEIGHT)
+    {
+        a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));
+    }
+    if(y * (uint)K0 + 6 < SRC_HEIGHT)
+    {
+        a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));
+    }
+    if(y * (uint)K0 + 7 < SRC_HEIGHT)
+    {
+        a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));
+    }
+#endif // K0 > 4
+#if K0 > 8
+    if(y * (uint)K0 + 8 < SRC_HEIGHT)
+    {
+        a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));
+    }
+    if(y * (uint)K0 + 9 < SRC_HEIGHT)
+    {
+        a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));
+    }
+    if(y * (uint)K0 + 10 < SRC_HEIGHT)
+    {
+        aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));
+    }
+    if(y * (uint)K0 + 11 < SRC_HEIGHT)
+    {
+        aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));
+    }
+    if(y * (uint)K0 + 12 < SRC_HEIGHT)
+    {
+        aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));
+    }
+    if(y * (uint)K0 + 13 < SRC_HEIGHT)
+    {
+        aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));
+    }
+    if(y * (uint)K0 + 14 < SRC_HEIGHT)
+    {
+        aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));
+    }
+    if(y * (uint)K0 + 15 < SRC_HEIGHT)
+    {
+        aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));
+    }
+#endif // K0 > 8
+
+    // ---------------------------Store output values ------------------------------
+    VSTORE(N0)
+    (a0, 0, (__global DATA_TYPE *)(output_ptr + 0 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#if K0 > 1
+    VSTORE(N0)
+    (a1, 0, (__global DATA_TYPE *)(output_ptr + 1 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#endif // K0 > 1
+#if K0 > 2
+    VSTORE(N0)
+    (a2, 0, (__global DATA_TYPE *)(output_ptr + 2 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#endif // K0 > 2
+#if K0 > 3
+    VSTORE(N0)
+    (a3, 0, (__global DATA_TYPE *)(output_ptr + 3 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#endif // K0 > 3
+#if K0 > 4
+    VSTORE(N0)
+    (a4, 0, (__global DATA_TYPE *)(output_ptr + 4 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+    VSTORE(N0)
+    (a5, 0, (__global DATA_TYPE *)(output_ptr + 5 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+    VSTORE(N0)
+    (a6, 0, (__global DATA_TYPE *)(output_ptr + 6 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+    VSTORE(N0)
+    (a7, 0, (__global DATA_TYPE *)(output_ptr + 7 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#endif // K0 > 4
+#if K0 > 8
+    VSTORE(N0)
+    (a8, 0, (__global DATA_TYPE *)(output_ptr + 8 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+    VSTORE(N0)
+    (a9, 0, (__global DATA_TYPE *)(output_ptr + 9 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+    VSTORE(N0)
+    (aA, 0, (__global DATA_TYPE *)(output_ptr + 10 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+    VSTORE(N0)
+    (aB, 0, (__global DATA_TYPE *)(output_ptr + 11 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+    VSTORE(N0)
+    (aC, 0, (__global DATA_TYPE *)(output_ptr + 12 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+    VSTORE(N0)
+    (aD, 0, (__global DATA_TYPE *)(output_ptr + 13 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+    VSTORE(N0)
+    (aE, 0, (__global DATA_TYPE *)(output_ptr + 14 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+    VSTORE(N0)
+    (aF, 0, (__global DATA_TYPE *)(output_ptr + 15 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#endif // K0 > 8
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
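The SRC_HEIGHT guards above make the reshape robust against a partial last block: rows of the K0xN0 block that fall below the bottom of the RHS matrix keep their zero initialisation and are stored as padding. A host-side C sketch of that behaviour (names and the float element type are illustrative):

    // Gather the K0xN0 block at block coordinates (x, y), zero-padding rows past src_height
    void load_rhs_block_rows(const float *src, int src_height, int src_row_elems,
                             int x, int y, int k0, int n0, float *block /* k0 x n0 elements */)
    {
        for(int k = 0; k < k0; ++k)
        {
            const int row = y * k0 + k; // source row feeding this block row
            for(int n = 0; n < n0; ++n)
            {
                block[k * n0 + n] = (row < src_height) ? src[row * src_row_elems + x * n0 + n] : 0.0f;
            }
        }
    }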
+
+#if defined(TRANSPOSE)
+/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix into blocks of size K0xN0 and stores each one (transposed) in
+ *  the output matrix, unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=float)
+ * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (i.e. -DSRC_HEIGHT=16)
+ * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (i.e. -DK0=2, -DN0=2).
+ * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (i.e. -DH0=2)
+ * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
+ * @note The option -DTRANSPOSE must be passed at compile time.
+ * @note Only the following values for K0, N0 and H0 are supported:
+ *                                      N0: 2,3,4,8,16
+ *                                      K0: 2,3,4,8,16
+ *                                      H0: greater than 0
+ *
+ * @param[in]  src_ptr                           Pointer to the source RHS tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_stride_x                      Stride of the source RHS tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source RHS tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source RHS tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor
+ * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_DECLARATION(src),
+                                        TENSOR3D_DECLARATION(dst))
+{
+    // Block size
+#define BLOCK_SIZE ((K0) * (N0))
+
+    // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (K0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+    // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X (K0) * (H0)
+#else // Do not interleave
+#define OUTPUT_STEP_X (K0)
+#endif // defined(INTERLEAVE)
+
+    // Compute source and destination addresses
+    uint x = get_global_id(0);
+    uint y = get_global_id(1);
+    uint z = get_global_id(2);
+
+    // ------------------ Compute input/output addresses ---------------------------
+
+    // Compute the input address
+    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;
+
+    // Compute the output address
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + ((x /
+                                 (uint)H0) * (uint)dst_stride_y) + z * (uint)dst_stride_z;
+
+    // ---------------------------Load input values --------------------------------
+    REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); //VEC_DATA_TYPE(DATA_TYPE, N0)    a0=0, a1=0, ... a(K0-1)=0;
+
+    // Load values from the RHS matrix
+    a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
+    if(y * (uint)K0 + 1 < SRC_HEIGHT)
+    {
+        a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
+    }
+#if K0 > 2
+    if(y * (uint)K0 + 2 < SRC_HEIGHT)
+    {
+        a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
+    }
+#endif // K0 > 2
+#if K0 > 3
+    if(y * (uint)K0 + 3 < SRC_HEIGHT)
+    {
+        a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));
+    }
+#endif // K0 > 3
+#if K0 > 4
+    if(y * (uint)K0 + 4 < SRC_HEIGHT)
+    {
+        a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));
+    }
+    if(y * (uint)K0 + 5 < SRC_HEIGHT)
+    {
+        a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));
+    }
+    if(y * (uint)K0 + 6 < SRC_HEIGHT)
+    {
+        a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));
+    }
+    if(y * (uint)K0 + 7 < SRC_HEIGHT)
+    {
+        a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));
+    }
+#endif // K0 > 4
+#if K0 > 8
+    if(y * (uint)K0 + 8 < SRC_HEIGHT)
+    {
+        a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));
+    }
+    if(y * (uint)K0 + 9 < SRC_HEIGHT)
+    {
+        a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));
+    }
+    if(y * (uint)K0 + 10 < SRC_HEIGHT)
+    {
+        aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));
+    }
+    if(y * (uint)K0 + 11 < SRC_HEIGHT)
+    {
+        aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));
+    }
+    if(y * (uint)K0 + 12 < SRC_HEIGHT)
+    {
+        aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));
+    }
+    if(y * (uint)K0 + 13 < SRC_HEIGHT)
+    {
+        aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));
+    }
+    if(y * (uint)K0 + 14 < SRC_HEIGHT)
+    {
+        aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));
+    }
+    if(y * (uint)K0 + 15 < SRC_HEIGHT)
+    {
+        aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));
+    }
+#endif // K0 > 8
+
+    // ---------------------------Transpose the block ------------------------------
+    REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), res, 0); //VEC_DATA_TYPE(DATA_TYPE, K0)    res0=0, res1=0, res2=0,... res(N0-1)=0;
+
+#if K0 == 2
+    // This part computes the following transpositions:
+    // 2x2 -> 2x2
+    // 2x4 -> 4x2
+    // 2x8 -> 8x2
+    // 2x16 -> 16x2
+    res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0);
+    res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1);
+#if N0 > 2
+    res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2);
+#endif // N0 > 2
+#if N0 > 3
+    res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3);
+#endif // N0 > 3
+#if N0 > 4
+    res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4);
+    res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5);
+    res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6);
+    res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7);
+#endif // N0 > 4
+#if N0 > 8
+    res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8);
+    res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9);
+    resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA);
+    resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB);
+    resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC);
+    resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD);
+    resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE);
+    resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF);
+#endif // N0 > 8
+
+#elif K0 == 3 // K0 == 3
+    // This part computes the following transpositions:
+    // 3x2 -> 2x3
+    // 3x4 -> 4x3
+    // 3x8 -> 8x3
+    // 3x16 -> 16x3
+    res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0);
+    res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1);
+#if N0 > 2
+    res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2);
+#endif // N0 > 2
+#if N0 > 3
+    res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3);
+#endif // N0 > 3
+#if N0 > 4
+    res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4);
+    res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5);
+    res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6);
+    res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7);
+#endif // N0 > 4
+#if N0 > 8
+    res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8);
+    res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9);
+    resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA);
+    resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB);
+    resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC);
+    resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD);
+    resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE);
+    resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF);
+#endif // N0 > 8
+
+#elif K0 == 4 // K0 == 4
+    // This part computes the following transpositions:
+    // 4x2 -> 2x4
+    // 4x4 -> 4x4
+    // 4x8 -> 8x4
+    // 4x16 -> 16x4
+    res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0);
+    res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1);
+#if N0 > 2
+    res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2);
+#endif // N0 > 2
+#if N0 > 3
+    res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3);
+#endif // N0 > 3
+#if N0 > 4
+    res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4);
+    res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5);
+    res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6);
+    res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7);
+#endif // N0 > 4
+#if N0 > 8
+    res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8);
+    res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9);
+    resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA);
+    resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB);
+    resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC);
+    resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD);
+    resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE);
+    resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF);
+#endif // N0 > 8
+
+#elif K0 == 8 // K0 == 8
+    // This part computes the following transpositions:
+    // 8x2 -> 2x8
+    // 8x4 -> 4x8
+    // 8x8 -> 8x8
+    // 8x16 -> 16x8
+    res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0);
+    res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1);
+#if N0 > 2
+    res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2);
+#endif // N0 > 2
+#if N0 > 3
+    res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3);
+#endif // N0 > 3
+#if N0 > 4
+    res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4);
+    res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5);
+    res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6);
+    res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7);
+#endif // N0 > 4
+#if N0 > 8
+    res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8);
+    res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9);
+    resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA);
+    resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB);
+    resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC);
+    resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD);
+    resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE);
+    resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF);
+#endif // N0 > 8
+
+#elif K0 == 16 // K0 == 16
+
+    // This part computes the following transpositions:
+    // 16x2 -> 2x16
+    // 16x4 -> 4x16
+    // 16x8 -> 8x16
+    // 16x16 -> 16x16
+    res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0,
+                                          a8.s0, a9.s0, aA.s0, aB.s0, aC.s0, aD.s0, aE.s0, aF.s0);
+    res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1,
+                                          a8.s1, a9.s1, aA.s1, aB.s1, aC.s1, aD.s1, aE.s1, aF.s1);
+#if N0 > 2
+    res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2,
+                                          a8.s2, a9.s2, aA.s2, aB.s2, aC.s2, aD.s2, aE.s2, aF.s2);
+#endif // N0 > 2
+#if N0 > 3
+    res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3,
+                                          a8.s3, a9.s3, aA.s3, aB.s3, aC.s3, aD.s3, aE.s3, aF.s3);
+#endif // N0 > 3
+#if N0 > 4
+    res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4,
+                                          a8.s4, a9.s4, aA.s4, aB.s4, aC.s4, aD.s4, aE.s4, aF.s4);
+    res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5,
+                                          a8.s5, a9.s5, aA.s5, aB.s5, aC.s5, aD.s5, aE.s5, aF.s5);
+    res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6,
+                                          a8.s6, a9.s6, aA.s6, aB.s6, aC.s6, aD.s6, aE.s6, aF.s6);
+    res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7,
+                                          a8.s7, a9.s7, aA.s7, aB.s7, aC.s7, aD.s7, aE.s7, aF.s7);
+#endif // N0 > 4
+#if N0 > 8
+    res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8,
+                                          a8.s8, a9.s8, aA.s8, aB.s8, aC.s8, aD.s8, aE.s8, aF.s8);
+    res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9,
+                                          a8.s9, a9.s9, aA.s9, aB.s9, aC.s9, aD.s9, aE.s9, aF.s9);
+    resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA,
+                                          a8.sA, a9.sA, aA.sA, aB.sA, aC.sA, aD.sA, aE.sA, aF.sA);
+    resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB,
+                                          a8.sB, a9.sB, aA.sB, aB.sB, aC.sB, aD.sB, aE.sB, aF.sB);
+    resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC,
+                                          a8.sC, a9.sC, aA.sC, aB.sC, aC.sC, aD.sC, aE.sC, aF.sC);
+    resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD,
+                                          a8.sD, a9.sD, aA.sD, aB.sD, aC.sD, aD.sD, aE.sD, aF.sD);
+    resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE,
+                                          a8.sE, a9.sE, aA.sE, aB.sE, aC.sE, aD.sE, aE.sE, aF.sE);
+    resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF,
+                                          a8.sF, a9.sF, aA.sF, aB.sF, aC.sF, aD.sF, aE.sF, aF.sF);
+#endif // N0 > 8
+
+#else // K0 not supported
+#error "K0 value not supported"
+#endif // K0 conditions
+
+    // ---------------------------Store the output values ------------------------------
+
+    VSTORE(K0)
+    (res0, 0, (__global DATA_TYPE *)(output_ptr + 0 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+    VSTORE(K0)
+    (res1, 0, (__global DATA_TYPE *)(output_ptr + 1 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#if N0 > 2
+    VSTORE(K0)
+    (res2, 0, (__global DATA_TYPE *)(output_ptr + 2 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#endif // N0 > 2
+#if N0 > 3
+    VSTORE(K0)
+    (res3, 0, (__global DATA_TYPE *)(output_ptr + 3 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#endif // N0 > 3
+#if N0 > 4
+    VSTORE(K0)
+    (res4, 0, (__global DATA_TYPE *)(output_ptr + 4 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+    VSTORE(K0)
+    (res5, 0, (__global DATA_TYPE *)(output_ptr + 5 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+    VSTORE(K0)
+    (res6, 0, (__global DATA_TYPE *)(output_ptr + 6 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+    VSTORE(K0)
+    (res7, 0, (__global DATA_TYPE *)(output_ptr + 7 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#endif // N0 > 4
+#if N0 > 8
+    VSTORE(K0)
+    (res8, 0, (__global DATA_TYPE *)(output_ptr + 8 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+    VSTORE(K0)
+    (res9, 0, (__global DATA_TYPE *)(output_ptr + 9 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+    VSTORE(K0)
+    (resA, 0, (__global DATA_TYPE *)(output_ptr + 10 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+    VSTORE(K0)
+    (resB, 0, (__global DATA_TYPE *)(output_ptr + 11 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+    VSTORE(K0)
+    (resC, 0, (__global DATA_TYPE *)(output_ptr + 12 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+    VSTORE(K0)
+    (resD, 0, (__global DATA_TYPE *)(output_ptr + 13 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+    VSTORE(K0)
+    (resE, 0, (__global DATA_TYPE *)(output_ptr + 14 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+    VSTORE(K0)
+    (resF, 0, (__global DATA_TYPE *)(output_ptr + 15 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#endif // N0 > 8
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
+#endif // defined(TRANSPOSE)
+#endif // defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE)
+
+#if K0 == 2
+#define ARM_DOT_K0(a, b, c)     \
+    ({                          \
+        c = fma(a.s0, b.s0, c); \
+        c = fma(a.s1, b.s1, c); \
+    })
+#elif K0 == 3 // K0 == 3
+#define ARM_DOT_K0(a, b, c)     \
+    ({                          \
+        c = fma(a.s0, b.s0, c); \
+        c = fma(a.s1, b.s1, c); \
+        c = fma(a.s2, b.s2, c); \
+    })
+#elif K0 == 4 // K0 == 4
+#define ARM_DOT_K0(a, b, c)     \
+    ({                          \
+        c = fma(a.s0, b.s0, c); \
+        c = fma(a.s1, b.s1, c); \
+        c = fma(a.s2, b.s2, c); \
+        c = fma(a.s3, b.s3, c); \
+    })
+#elif K0 == 8 // K0 == 8
+#define ARM_DOT_K0(a, b, c)     \
+    ({                          \
+        c = fma(a.s0, b.s0, c); \
+        c = fma(a.s1, b.s1, c); \
+        c = fma(a.s2, b.s2, c); \
+        c = fma(a.s3, b.s3, c); \
+        c = fma(a.s4, b.s4, c); \
+        c = fma(a.s5, b.s5, c); \
+        c = fma(a.s6, b.s6, c); \
+        c = fma(a.s7, b.s7, c); \
+    })
+#elif K0 == 16 // K0 == 16
+#define ARM_DOT_K0(a, b, c)     \
+    ({                          \
+        c = fma(a.s0, b.s0, c); \
+        c = fma(a.s1, b.s1, c); \
+        c = fma(a.s2, b.s2, c); \
+        c = fma(a.s3, b.s3, c); \
+        c = fma(a.s4, b.s4, c); \
+        c = fma(a.s5, b.s5, c); \
+        c = fma(a.s6, b.s6, c); \
+        c = fma(a.s7, b.s7, c); \
+        c = fma(a.s8, b.s8, c); \
+        c = fma(a.s9, b.s9, c); \
+        c = fma(a.sA, b.sA, c); \
+        c = fma(a.sB, b.sB, c); \
+        c = fma(a.sC, b.sC, c); \
+        c = fma(a.sD, b.sD, c); \
+        c = fma(a.sE, b.sE, c); \
+        c = fma(a.sF, b.sF, c); \
+    })
+#else // K0 not supported
+#error "K0 value not supported"
+#endif // K0 conditions
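+
+// For reference: ARM_DOT_K0(a, b, c) accumulates into the scalar c the dot product of the two
+// K0-element vectors a and b. For example, with K0 = 4 it is equivalent to
+//   c += a.s0 * b.s0 + a.s1 * b.s1 + a.s2 * b.s2 + a.s3 * b.s3
+// computed as a chain of fma() calls (each product is added with a single rounding).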
+
+#if N0 == 2
+#define ARM_DOT_K0XN0(a, b, c)           \
+    ({                                   \
+        ARM_DOT_K0((a), (b##0), (c.s0)); \
+        ARM_DOT_K0((a), (b##1), (c.s1)); \
+    })
+#elif N0 == 3 // N0 == 3
+#define ARM_DOT_K0XN0(a, b, c)           \
+    ({                                   \
+        ARM_DOT_K0((a), (b##0), (c.s0)); \
+        ARM_DOT_K0((a), (b##1), (c.s1)); \
+        ARM_DOT_K0((a), (b##2), (c.s2)); \
+    })
+#elif N0 == 4 // N0 == 4
+#define ARM_DOT_K0XN0(a, b, c)           \
+    ({                                   \
+        ARM_DOT_K0((a), (b##0), (c.s0)); \
+        ARM_DOT_K0((a), (b##1), (c.s1)); \
+        ARM_DOT_K0((a), (b##2), (c.s2)); \
+        ARM_DOT_K0((a), (b##3), (c.s3)); \
+    })
+#elif N0 == 8 // N0 == 8
+#define ARM_DOT_K0XN0(a, b, c)           \
+    ({                                   \
+        ARM_DOT_K0((a), (b##0), (c.s0)); \
+        ARM_DOT_K0((a), (b##1), (c.s1)); \
+        ARM_DOT_K0((a), (b##2), (c.s2)); \
+        ARM_DOT_K0((a), (b##3), (c.s3)); \
+        ARM_DOT_K0((a), (b##4), (c.s4)); \
+        ARM_DOT_K0((a), (b##5), (c.s5)); \
+        ARM_DOT_K0((a), (b##6), (c.s6)); \
+        ARM_DOT_K0((a), (b##7), (c.s7)); \
+    })
+#elif N0 == 16 // N0 == 16
+#define ARM_DOT_K0XN0(a, b, c)           \
+    ({                                   \
+        ARM_DOT_K0((a), (b##0), (c.s0)); \
+        ARM_DOT_K0((a), (b##1), (c.s1)); \
+        ARM_DOT_K0((a), (b##2), (c.s2)); \
+        ARM_DOT_K0((a), (b##3), (c.s3)); \
+        ARM_DOT_K0((a), (b##4), (c.s4)); \
+        ARM_DOT_K0((a), (b##5), (c.s5)); \
+        ARM_DOT_K0((a), (b##6), (c.s6)); \
+        ARM_DOT_K0((a), (b##7), (c.s7)); \
+        ARM_DOT_K0((a), (b##8), (c.s8)); \
+        ARM_DOT_K0((a), (b##9), (c.s9)); \
+        ARM_DOT_K0((a), (b##A), (c.sA)); \
+        ARM_DOT_K0((a), (b##B), (c.sB)); \
+        ARM_DOT_K0((a), (b##C), (c.sC)); \
+        ARM_DOT_K0((a), (b##D), (c.sD)); \
+        ARM_DOT_K0((a), (b##E), (c.sE)); \
+        ARM_DOT_K0((a), (b##F), (c.sF)); \
+    })
+#else // N0 not supported
+#error "N0 value not supported"
+#endif // N0 conditions
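+
+// For reference: ARM_DOT_K0XN0(a, b, c) accumulates into the N0 lanes of c the dot products of the
+// K0-element vector a with the N0 vectors b0..b(N0-1). For example, with N0 = 2 it expands to
+//   ARM_DOT_K0(a, b0, c.s0); ARM_DOT_K0(a, b1, c.s1);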
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ *  The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 blocks must NOT be transposed
+ *  The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 blocks must be transposed
+ *
+ * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (i.e. -DM0=4, -DN0=8, -DK0=4).
+ * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (i.e. -DV0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
+ * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 2, 3, 4, 5, 6, 7, 8
+ *  - N0 = 2, 3, 4, 8, 16
+ *  - K0 = 2, 3, 4, 8, 16
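+ * @note For instance, a valid combination of build options is: -DM0=4 -DN0=4 -DK0=4 -DV0=2 -DH0=2 -DDATA_TYPE=float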
+ *
+ * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
+ *
+ * @param[in]  lhs_ptr                           Pointer to the LHS reshaped matrix. Supported data types: F16/F32
+ * @param[in]  lhs_stride_x                      Stride of the LHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                        lhs_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  lhs_stride_y                      Stride of the LHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                        lhs_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
+ * @param[in]  rhs_ptr                           Pointer to the RHS reshaped matrix. Supported data types: same as @p lhs_ptr
+ * @param[in]  rhs_stride_x                      Stride of the RHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  rhs_step_x                        rhs_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  rhs_stride_y                      Stride of the RHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  rhs_step_y                        rhs_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: same as @p lhs_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in]  k                                 Number of columns in LHS matrix and rows in RHS matrix not reshaped.
+ * @param[in]  lhs_stride_z                      Stride of the LHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  rhs_stride_z                      Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_cross_plane_pad               (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
+                                            IMAGE_DECLARATION(rhs),
+                                            IMAGE_DECLARATION(dst),
+                                            uint k,
+                                            uint lhs_stride_z,
+                                            uint rhs_stride_z,
+                                            uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                            ,
+                                            uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                           )
+{
+    // Block size
+#define LHS_BLOCK_SIZE ((K0) * (M0))
+
+#if defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (K0)
+#define LHS_STEP_X ((K0) * (V0))
+#define LHS_STEP_LOOP (1)
+#else // defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
+#define LHS_STEP_X (K0)
+#define LHS_STEP_LOOP (V0)
+#endif // defined(LHS_INTERLEAVE)
+
+    // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+    // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+    // Compute LHS matrix address
+    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
+                               (get_global_id(2) * lhs_stride_z);
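+
+    // For example, with V0 = 4 and LHS_INTERLEAVE defined, the work-item with get_global_id(1) = 5
+    // reads its M0xK0 block starting K0 elements into reshaped LHS row 5 / 4 = 1 (it is block
+    // 5 % 4 = 1 of the V0 blocks interleaved on that row).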
+
+    // Compute RHS matrix address
+    __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(0) / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
+    rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    rhs_addr += get_global_id(2) * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    // Initialize the accumulators
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0)    c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+    for(int i = 0; i < k; i += K0)
+    {
+        // Supported (M0, K0) cases: M0 = 2, 3, 4, 5, 6, 7, 8 combined with K0 = 2, 3, 4, 8, 16
+        // (see the kernel description above)
+        // Load values from LHS matrix
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        a0 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_addr + 0 * LHS_STEP_X * sizeof(DATA_TYPE)));
+#if M0 > 1
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        a1 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_addr + 1 * LHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // M0 > 1
+#if M0 > 2
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        a2 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_addr + 2 * LHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // M0 > 2
+#if M0 > 3
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        a3 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_addr + 3 * LHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // M0 > 3
+#if M0 > 4
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        a4 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_addr + 4 * LHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // M0 > 4
+#if M0 > 5
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        a5 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_addr + 5 * LHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // M0 > 5
+#if M0 > 6
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        a6 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_addr + 6 * LHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // M0 > 6
+#if M0 > 7
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        a7 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_addr + 7 * LHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // M0 > 7
+
+        // Load values from RHS matrix
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        b0 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        b1 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 1 * RHS_STEP_X * sizeof(DATA_TYPE)));
+#if N0 > 2
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        b2 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 2 * RHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // N0 > 2
+#if N0 > 3
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        b3 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 3 * RHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // N0 > 3
+#if N0 > 4
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        b4 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 4 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        b5 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 5 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        b6 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 6 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        b7 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 7 * RHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // N0 > 4
+#if N0 > 8
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        b8 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 8 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        b9 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 9 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        bA = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 10 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        bB = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 11 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        bC = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 12 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        bD = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 13 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        bE = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 14 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        bF = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 15 * RHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // N0 > 8
+
+        // Accumulate
+        ARM_DOT_K0XN0(a0, b, c0);
+#if M0 > 1
+        ARM_DOT_K0XN0(a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+        ARM_DOT_K0XN0(a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+        ARM_DOT_K0XN0(a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+        ARM_DOT_K0XN0(a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+        ARM_DOT_K0XN0(a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+        ARM_DOT_K0XN0(a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+        ARM_DOT_K0XN0(a7, b, c7);
+#endif // M0 > 7
+
+        lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);
+        rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
+    }
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);
+
+    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zout) is calculated by dividing the output row index (y * M0) by HEIGHT_GEMM3D
+    zout0 = (0 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout0 = min((uint)(DEPTH_GEMM3D - 1), zout0);
+    zout0 *= (dst_cross_plane_pad * dst_stride_y);
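+    // For example, with M0 = 4, HEIGHT_GEMM3D = 8, DEPTH_GEMM3D > 1 and get_global_id(1) = 3, row 0
+    // of the output tile is global row 12, so zout0 = 12 / 8 = 1 and the store of that row is moved
+    // down by one cross-plane padding region (dst_cross_plane_pad * dst_stride_y).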
+#if M0 > 1
+    zout1 = (1 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout1 = min((uint)(DEPTH_GEMM3D - 1), zout1);
+    zout1 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 1
+#if M0 > 2
+    zout2 = (2 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout2 = min((uint)(DEPTH_GEMM3D - 1), zout2);
+    zout2 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 2
+#if M0 > 3
+    zout3 = (3 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout3 = min((uint)(DEPTH_GEMM3D - 1), zout3);
+    zout3 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 3
+#if M0 > 4
+    zout4 = (4 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout4 = min((uint)(DEPTH_GEMM3D - 1), zout4);
+    zout4 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 4
+#if M0 > 5
+    zout5 = (5 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout5 = min((uint)(DEPTH_GEMM3D - 1), zout5);
+    zout5 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 5
+#if M0 > 6
+    zout6 = (6 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout6 = min((uint)(DEPTH_GEMM3D - 1), zout6);
+    zout6 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 6
+#if M0 > 7
+    zout7 = (7 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout7 = min((uint)(DEPTH_GEMM3D - 1), zout7);
+    zout7 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 7
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    dst_addr += get_global_id(2) * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    c0 = c0 * (DATA_TYPE)ALPHA;
+#if M0 > 1
+    c1 = c1 * (DATA_TYPE)ALPHA;
+#endif // M0 > 1
+#if M0 > 2
+    c2 = c2 * (DATA_TYPE)ALPHA;
+#endif // M0 > 2
+#if M0 > 3
+    c3 = c3 * (DATA_TYPE)ALPHA;
+#endif // M0 > 3
+#if M0 > 4
+    c4 = c4 * (DATA_TYPE)ALPHA;
+#endif // M0 > 4
+#if M0 > 5
+    c5 = c5 * (DATA_TYPE)ALPHA;
+#endif // M0 > 5
+#if M0 > 6
+    c6 = c6 * (DATA_TYPE)ALPHA;
+#endif // M0 > 6
+#if M0 > 7
+    c7 = c7 * (DATA_TYPE)ALPHA;
+#endif // M0 > 7
+#endif // defined(ALPHA)
+
+    // Store output block
+    VSTORE(N0)
+    (c0, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y + zout0));
+#if M0 > 1
+    VSTORE(N0)
+    (c1, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y + zout1));
+#endif // M0 > 1
+#if M0 > 2
+    VSTORE(N0)
+    (c2, 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y + zout2));
+#endif // M0 > 2
+#if M0 > 3
+    VSTORE(N0)
+    (c3, 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y + zout3));
+#endif // M0 > 3
+#if M0 > 4
+    VSTORE(N0)
+    (c4, 0, (__global DATA_TYPE *)(dst_addr + 4 * dst_stride_y + zout4));
+#endif // M0 > 4
+#if M0 > 5
+    VSTORE(N0)
+    (c5, 0, (__global DATA_TYPE *)(dst_addr + 5 * dst_stride_y + zout5));
+#endif // M0 > 5
+#if M0 > 6
+    VSTORE(N0)
+    (c6, 0, (__global DATA_TYPE *)(dst_addr + 6 * dst_stride_y + zout6));
+#endif // M0 > 6
+#if M0 > 7
+    VSTORE(N0)
+    (c7, 0, (__global DATA_TYPE *)(dst_addr + 7 * dst_stride_y + zout7));
+#endif // M0 > 7
+
+#undef LHS_BLOCK_SIZE
+#undef LHS_OFFSET_X
+#undef LHS_STEP_X
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE)
 
 #if defined(TRANSPOSE_W) && defined(MULT_TRANSPOSE1XW_WIDTH)
 
@@ -193,7 +1763,7 @@
     vstore4(a1, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 4 * MULT_INTERLEAVE4X4_HEIGHT));
     vstore4(a2, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 8 * MULT_INTERLEAVE4X4_HEIGHT));
     vstore4(a3, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 12 * MULT_INTERLEAVE4X4_HEIGHT));
-#else // defined(UNROLL_BLOCK)
+#else  // defined(UNROLL_BLOCK)
     VEC_DATA_TYPE(DATA_TYPE, 4)
     val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s0, a1.s0, a2.s0, a3.s0);
     vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 0 * MULT_INTERLEAVE4X4_HEIGHT));
@@ -214,6 +1784,8 @@
 /** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
  *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
  *
+ * Moreover, it can add a vector (src2) if the ADD_VEC_C parameter is passed at compile time.
+ *
  * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
  * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
  * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
@@ -226,6 +1798,8 @@
  *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
  *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
  *
+ * @note In case a 3rd input (src2) needs to be added, the ADD_VEC_C parameter has to be passed at compile time as -DADD_VEC_C
+ *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
  * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -238,6 +1812,10 @@
  * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
  * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src2_ptr                           (Optional) Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  src2_stride_x                      (Optional) Stride of the source vector in X dimension (in bytes)
+ * @param[in]  src2_step_x                        (Optional) src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes (Optional) The offset of the first element in the source matrix
  * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
  * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
  * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -251,6 +1829,9 @@
  */
 __kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),
                                                  IMAGE_DECLARATION(src1),
+#if defined(ADD_VEC_C)
+                                                 VECTOR_DECLARATION(src2),
+#endif /* defined(ADD_VEC_C) */
                                                  IMAGE_DECLARATION(dst),
                                                  uint src0_stride_z,
                                                  uint src1_stride_z,
@@ -340,6 +1921,16 @@
     c30 = c30 * (float4)ALPHA;
 #endif // defined(ALPHA)
 
+#if defined(ADD_VEC_C)
+    __global float *src2_addr = (__global float *)(src2_ptr + src2_offset_first_element_in_bytes + get_global_id(0) * src2_step_x);
+    float4          c0        = vload4(0, src2_addr);
+
+    c00 += c0;
+    c10 += c0;
+    c20 += c0;
+    c30 += c0;
+#endif /* defined(ADD_VEC_C) */
+
     // Compute dst address
     __global uchar *dst_addr = offset(&dst, 0, 0);
 
@@ -389,7 +1980,9 @@
 }
 
 /** This OpenCL kernel is optimized for Bifrost. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
- *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
+ *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication.
+ *
+ * Moreover, it can add a vector (src2) if the ADD_VEC_C parameter is passed at compile time.
  *
  * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
  * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
@@ -404,6 +1997,8 @@
  *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
  *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
  *
+ * @note In case a 3rd input (src2) needs to be added, the ADD_VEC_C parameter has to be passed at compile time as -DADD_VEC_C
+ *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
  * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -416,6 +2011,10 @@
  * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
  * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src2_ptr                           (Optional) Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  src2_stride_x                      (Optional) Stride of the source vector in X dimension (in bytes)
+ * @param[in]  src2_step_x                        (Optional) src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes (Optional) The offset of the first element in the source matrix
  * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
  * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
  * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -429,6 +2028,9 @@
  */
 __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0),
                                                          IMAGE_DECLARATION(src1),
+#if defined(ADD_VEC_C)
+                                                         VECTOR_DECLARATION(src2),
+#endif /* defined(ADD_VEC_C) */
                                                          IMAGE_DECLARATION(dst),
                                                          uint src0_stride_z,
                                                          uint src1_stride_z,
@@ -653,6 +2255,28 @@
     // Compute dst address
     __global uchar *dst_addr = offset(&dst, 0, 0);
 
+#if defined(ADD_VEC_C)
+    __global float *src2_addr = (__global float *)(src2_ptr + src2_offset_first_element_in_bytes + get_global_id(0) * src2_step_x);
+    float4          c0        = vload4(0, src2_addr);
+
+    c00 += c0.s0;
+    c01 += c0.s1;
+    c02 += c0.s2;
+    c03 += c0.s3;
+    c10 += c0.s0;
+    c11 += c0.s1;
+    c12 += c0.s2;
+    c13 += c0.s3;
+    c20 += c0.s0;
+    c21 += c0.s1;
+    c22 += c0.s2;
+    c23 += c0.s3;
+    c30 += c0.s0;
+    c31 += c0.s1;
+    c32 += c0.s2;
+    c33 += c0.s3;
+#endif /* defined(ADD_VEC_C) */
+
 #if defined(REINTERPRET_OUTPUT_AS_3D)
     // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
     // in order to take into account the presence of possible cross plane paddings
@@ -705,6 +2329,8 @@
 /** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
  *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication
  *
+ * Moreover, it can add a vector (src2) if the ADD_VEC_C parameter is passed at compile time.
+ *
  * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
  * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
  * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
@@ -717,6 +2343,8 @@
  *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
  *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
  *
+ * @note In case a 3rd input (src2) needs to be added, the ADD_VEC_C parameter has to be passed at compile time as -DADD_VEC_C
+ *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
  * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -729,6 +2357,10 @@
  * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
  * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src2_ptr                           (Optional) Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  src2_stride_x                      (Optional) Stride of the source vector in X dimension (in bytes)
+ * @param[in]  src2_step_x                        (Optional) src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes (Optional) The offset of the first element in the source matrix
  * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
  * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
  * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -742,6 +2374,9 @@
  */
 __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
                                                  IMAGE_DECLARATION(src1),
+#if defined(ADD_VEC_C)
+                                                 VECTOR_DECLARATION(src2),
+#endif /* defined(ADD_VEC_C) */
                                                  IMAGE_DECLARATION(dst),
                                                  uint src0_stride_z,
                                                  uint src1_stride_z,
@@ -831,6 +2466,20 @@
     c30 = c30 * (half8)ALPHA;
 #endif // defined(ALPHA)
 
+#if defined(ADD_VEC_C)
+    // *INDENT-OFF*
+    // clang-format off
+    __global half *src2_addr = (__global half *)(src2_ptr + src2_offset_first_element_in_bytes + get_global_id(0) * src2_step_x);
+    half8          c0        = vload8(0, src2_addr);
+    // clang-format on
+    // *INDENT-ON*
+
+    c00 += c0;
+    c10 += c0;
+    c20 += c0;
+    c30 += c0;
+#endif /* defined(ADD_VEC_C) */
+
     // Compute dst address
     __global uchar *dst_addr = offset(&dst, 0, 0);
 
@@ -882,6 +2531,8 @@
 /** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) while accumulating the result in a 32 floating point variable.
  *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication
  *
+ * Moreover, it can add a vector (src2) if the ADD_VEC_C parameter is passed at compile time.
+ *
  * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
  * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
  * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
@@ -894,6 +2545,8 @@
  *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
  *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
  *
+ * @note In case a 3rd input (src2) needs to be added, the ADD_VEC_C parameter has to be passed at compile time as -DADD_VEC_C
+ *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
  * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -906,6 +2559,10 @@
  * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
  * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src2_ptr                           (Optional) Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  src2_stride_x                      (Optional) Stride of the source vector in X dimension (in bytes)
+ * @param[in]  src2_step_x                        (Optional) src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes (Optional) The offset of the first element in the source matrix
  * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
  * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
  * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -919,6 +2576,9 @@
  */
 __kernel void gemm_mm_interleaved_transposed_f16_acc32(IMAGE_DECLARATION(src0),
                                                        IMAGE_DECLARATION(src1),
+#if defined(ADD_VEC_C)
+                                                       VECTOR_DECLARATION(src2),
+#endif /* defined(ADD_VEC_C) */
                                                        IMAGE_DECLARATION(dst),
                                                        uint src0_stride_z,
                                                        uint src1_stride_z,
@@ -1008,6 +2668,20 @@
     c30 = c30 * (float8)ALPHA;
 #endif // defined(ALPHA)
 
+#if defined(ADD_VEC_C)
+    // *INDENT-OFF*
+    // clang-format off
+    __global half *src2_addr = (__global half *)(src2_ptr + src2_offset_first_element_in_bytes + get_global_id(0) * src2_step_x);
+    float8         c0        = convert_float8(vload8(0, src2_addr));
+    // clang-format on
+    // *INDENT-ON*
+
+    c00 += c0;
+    c10 += c0;
+    c20 += c0;
+    c30 += c0;
+#endif /* defined(ADD_VEC_C) */
+
     // Compute dst address
     __global uchar *dst_addr = offset(&dst, 0, 0);
 
@@ -1059,6 +2733,8 @@
 /** This OpenCL kernel optimized for Bifrost architectures computes the matrix multiplication between matrix A (src0) and matrix B (src1)
  *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication
  *
+ * Moreover, it can add a vector (src2) if the ADD_VEC_C parameter is passed at compile time.
+ *
  * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
  * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
  * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
@@ -1071,6 +2747,8 @@
  *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
  *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
  *
+ * @note In case a 3rd input (src2) needs to be added, the ADD_VEC_C parameter has to be passed at compile time as -DADD_VEC_C
+ *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
  * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -1083,6 +2761,10 @@
  * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
  * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src2_ptr                           (Optional) Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  src2_stride_x                      (Optional) Stride of the source vector in X dimension (in bytes)
+ * @param[in]  src2_step_x                        (Optional) src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes (Optional) The offset of the first element in the source matrix
  * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
  * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
  * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -1093,6 +2775,9 @@
  */
 __kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0),
                                                          IMAGE_DECLARATION(src1),
+#if defined(ADD_VEC_C)
+                                                         VECTOR_DECLARATION(src2),
+#endif /* defined(ADD_VEC_C) */
                                                          IMAGE_DECLARATION(dst),
                                                          uint src0_stride_z,
                                                          uint src1_stride_z,
@@ -1264,6 +2949,20 @@
     c30 = c30 * (half8)ALPHA;
 #endif // defined(ALPHA)
 
+#if defined(ADD_VEC_C)
+    // *INDENT-OFF*
+    // clang-format off
+    __global half *src2_addr = (__global half *)(src2_ptr + src2_offset_first_element_in_bytes + get_global_id(0) * src2_step_x);
+    half8          c0        = vload8(0, src2_addr);
+    // clang-format on
+    // *INDENT-ON*
+
+    c00 += c0;
+    c10 += c0;
+    c20 += c0;
+    c30 += c0;
+#endif /* defined(ADD_VEC_C) */
+
     // Compute dst address
     __global uchar *dst_addr = offset(&dst, 0, 0);
 
@@ -1322,7 +3021,9 @@
 #if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)
 #if defined(DATA_TYPE)
 #define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, NUM_ELEMS_PROCESSED_PER_THREAD_X)
-/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped.
+ *
+ * Moreover, it can add a vector (src2) if the ADD_VEC_C parameter is passed at compile time.
  *
  * @note This OpenCL kernel works with floating point data types (F16/F32)
  * @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
@@ -1338,6 +3039,8 @@
  *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
  *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
  *
+ * @note In case a 3rd input (src2) needs to be added, the ADD_VEC_C parameter has to be passed at compile time as -DADD_VEC_C
+ *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16/F32
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
  * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -1350,6 +3053,10 @@
  * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
  * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src2_ptr                           (Optional) Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  src2_stride_x                      (Optional) Stride of the source vector in X dimension (in bytes)
+ * @param[in]  src2_step_x                        (Optional) src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes (Optional) The offset of the first element in the source matrix
  * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
  * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -1364,6 +3071,9 @@
  */
 __kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
                                      IMAGE_DECLARATION(src1),
+#if defined(ADD_VEC_C)
+                                     VECTOR_DECLARATION(src2),
+#endif /* defined(ADD_VEC_C) */
                                      IMAGE_DECLARATION(dst),
                                      uint src0_stride_z,
                                      uint src1_stride_z,
@@ -1564,6 +3274,26 @@
     acc3 = acc3 * (VECTOR_TYPE)ALPHA;
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 && defined(ALPHA)
 
+#if defined(ADD_VEC_C)
+    // *INDENT-OFF*
+    // clang-format off
+    __global DATA_TYPE *src2_addr = (__global DATA_TYPE *)(src2_ptr + src2_offset_first_element_in_bytes + get_global_id(0) * src2_step_x);
+    VECTOR_TYPE         c0        = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src2_addr);
+    // clang-format on
+    // *INDENT-ON*
+
+    acc0 += c0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    acc1 += c0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    acc2 += c0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    acc3 += c0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif /* defined(ADD_VEC_C) */
+
     int z = get_global_id(2);
 
 #if defined(REINTERPRET_OUTPUT_AS_3D)
@@ -1634,6 +3364,8 @@
 
 /** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
  *
+ * Moreover, it can add a vector (src2) if the ADD_VEC_C parameter is passed at compile time.
+ *
  * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.
  * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
  * This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
@@ -1649,6 +3381,8 @@
  *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
  *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
  *
+ * @note In case a 3rd input (src2) needs to be added, the ADD_VEC_C parameter has to be passed at compile time as -DADD_VEC_C
+ *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16/F32
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
  * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -1661,6 +3395,10 @@
  * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
  * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src2_ptr                           (Optional) Pointer to the source vector. Supported data types: same as @p src0_ptr
+ * @param[in]  src2_stride_x                      (Optional) Stride of the source vector in X dimension (in bytes)
+ * @param[in]  src2_step_x                        (Optional) src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes (Optional) The offset of the first element in the source vector
  * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
  * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -1675,6 +3413,9 @@
  */
 __kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
                                                  IMAGE_DECLARATION(src1),
+#if defined(ADD_VEC_C)
+                                                 VECTOR_DECLARATION(src2),
+#endif /* defined(ADD_VEC_C) */
                                                  IMAGE_DECLARATION(dst),
                                                  uint src0_stride_z,
                                                  uint src1_stride_z,
@@ -2029,6 +3770,34 @@
     // Compute dst address
     __global uchar *dst_addr = offset(&dst, 0, 0);
 
+#if defined(ADD_VEC_C)
+    __global float *src2_addr = (__global float *)(src2_ptr + src2_offset_first_element_in_bytes + get_global_id(0) * src2_step_x);
+    float4          c0        = vload4(0, src2_addr);
+
+    acc00 += c0.s0;
+    acc01 += c0.s1;
+    acc02 += c0.s2;
+    acc03 += c0.s3;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    acc10 += c0.s0;
+    acc11 += c0.s1;
+    acc12 += c0.s2;
+    acc13 += c0.s3;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    acc20 += c0.s0;
+    acc21 += c0.s1;
+    acc22 += c0.s2;
+    acc23 += c0.s3;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    acc30 += c0.s0;
+    acc31 += c0.s1;
+    acc32 += c0.s2;
+    acc33 += c0.s3;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif /* defined(ADD_VEC_C) */
+
 #if defined(REINTERPRET_OUTPUT_AS_3D)
     // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
     // in order to take into account the presence of possible cross plane paddings
@@ -2088,6 +3857,8 @@
 
 /** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
  *
+ * Moreover, it can add a vector (src2) if the ADD_VEC_C parameter is passed at compile time.
+ *
  * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.
 * This OpenCL kernel is optimized for Bifrost when the number of matrix B columns is less than or equal to 1000.
  * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
@@ -2104,6 +3875,8 @@
  *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
  *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
  *
+ * @note In case a 3rd input (src2) needs to be added, the ADD_VEC_C parameter has to be passed at compile time as -DADD_VEC_C
+ *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16/F32
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
  * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -2116,6 +3889,10 @@
  * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
  * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src2_ptr                           (Optional) Pointer to the source vector. Supported data types: same as @p src0_ptr
+ * @param[in]  src2_stride_x                      (Optional) Stride of the source vector in X dimension (in bytes)
+ * @param[in]  src2_step_x                        (Optional) src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes (Optional) The offset of the first element in the source vector
  * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
  * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -2130,6 +3907,9 @@
  */
 __kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
                                                       IMAGE_DECLARATION(src1),
+#if defined(ADD_VEC_C)
+                                                      VECTOR_DECLARATION(src2),
+#endif /* defined(ADD_VEC_C) */
                                                       IMAGE_DECLARATION(dst),
                                                       uint src0_stride_z,
                                                       uint src1_stride_z,
@@ -2416,6 +4196,26 @@
     // Compute dst address
     __global uchar *dst_addr = offset(&dst, 0, 0);
 
+#if defined(ADD_VEC_C)
+    __global float *src2_addr = (__global float *)(src2_ptr + src2_offset_first_element_in_bytes + get_global_id(0) * src2_step_x);
+    float2          c0        = vload2(0, src2_addr);
+
+    acc00 += c0.s0;
+    acc01 += c0.s1;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    acc10 += c0.s0;
+    acc11 += c0.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    acc20 += c0.s0;
+    acc21 += c0.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    acc30 += c0.s0;
+    acc31 += c0.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif /* defined(ADD_VEC_C) */
+
 #if defined(REINTERPRET_OUTPUT_AS_3D)
     // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
     // in order to take into account the presence of possible cross plane paddings
@@ -2476,6 +4276,8 @@
 #if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
 /** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
  *
+ * Moreover, it can add a vector (src2) if the ADD_VEC_C parameter is passed at compile time.
+ *
  * @note This OpenCL kernel works with the 16-bit floating point data type (half) and accumulates the result in a 32-bit floating point variable.
  * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
  * This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
@@ -2491,6 +4293,8 @@
  *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
  *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
  *
+ * @note In case a 3rd input (src2) needs to be added, the ADD_VEC_C parameter has to be passed at compile time as -DADD_VEC_C
+ *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
  * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -2503,6 +4307,10 @@
  * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
  * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src2_ptr                           (Optional) Pointer to the source vector. Supported data types: same as @p src0_ptr
+ * @param[in]  src2_stride_x                      (Optional) Stride of the source vector in X dimension (in bytes)
+ * @param[in]  src2_step_x                        (Optional) src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes (Optional) The offset of the first element in the source vector
  * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
  * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -2517,6 +4325,9 @@
  */
 __kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0),
                                                        IMAGE_DECLARATION(src1),
+#if defined(ADD_VEC_C)
+                                                       VECTOR_DECLARATION(src2),
+#endif /* defined(ADD_VEC_C) */
                                                        IMAGE_DECLARATION(dst),
                                                        uint src0_stride_z,
                                                        uint src1_stride_z,
@@ -2757,6 +4568,26 @@
 #endif // defined(ALPHA)
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
 
+#if defined(ADD_VEC_C)
+    // *INDENT-OFF*
+    // clang-format off
+    __global half *src2_addr = (__global half *)(src2_ptr + src2_offset_first_element_in_bytes + get_global_id(0) * src2_step_x);
+    half8          c0        = vload8(0, src2_addr);
+    // clang-format on
+    // *INDENT-ON*
+
+    hacc0 += c0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    hacc1 += c0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    hacc2 += c0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    hacc3 += c0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif /* defined(ADD_VEC_C) */
+
     int z = get_global_id(2);
 
     // Compute destination address
@@ -2824,6 +4655,8 @@
 
 /** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
  *
+ * Moreover, it can add a vector (src2) if the ADD_VEC_C parameter is passed at compile time.
+ *
  * @note This OpenCL kernel works with the 16-bit floating point data type (half) and uses the fma units.
  * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
  * This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
@@ -2839,6 +4672,8 @@
  *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
  *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
  *
+ * @note In case a 3rd input (src2) needs to be added, the ADD_VEC_C parameter has to be passed at compile time as -DADD_VEC_C
+ *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
  * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -2851,6 +4686,10 @@
  * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
  * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src2_ptr                           (Optional) Pointer to the source vector. Supported data types: same as @p src0_ptr
+ * @param[in]  src2_stride_x                      (Optional) Stride of the source vector in X dimension (in bytes)
+ * @param[in]  src2_step_x                        (Optional) src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes (Optional) The offset of the first element in the source vector
  * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
  * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -2865,6 +4704,9 @@
  */
 __kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),
                                                  IMAGE_DECLARATION(src1),
+#if defined(ADD_VEC_C)
+                                                 VECTOR_DECLARATION(src2),
+#endif /* defined(ADD_VEC_C) */
                                                  IMAGE_DECLARATION(dst),
                                                  uint src0_stride_z,
                                                  uint src1_stride_z,
@@ -3089,6 +4931,26 @@
     acc3 = acc3 * (half8)ALPHA;
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 && defined(ALPHA)
 
+#if defined(ADD_VEC_C)
+    // *INDENT-OFF*
+    // clang-format off
+    __global half *src2_addr = (__global half *)(src2_ptr + src2_offset_first_element_in_bytes + get_global_id(0) * src2_step_x);
+    half8          c0        = vload8(0, src2_addr);
+    // clang-format on
+    // *INDENT-ON*
+
+    acc0 += c0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    acc1 += c0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    acc2 += c0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    acc3 += c0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif /* defined(ADD_VEC_C) */
+
     int z = get_global_id(2);
 
     // Compute destination address
diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl
index 8c1fa54..277338b 100644
--- a/src/core/CL/cl_kernels/gemmlowp.cl
+++ b/src/core/CL/cl_kernels/gemmlowp.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,6 +23,7 @@
  */
 #include "helpers.h"
 #include "helpers_asymm.h"
+#include "repeat.h"
 
 #if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
 #if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
@@ -1943,6 +1944,574 @@
 #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
 #endif // defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)
 
+#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if K0 == 2
+#define ARM_DOT_K0(a, b, c)                                         \
+    ({                                                              \
+        ARM_DOT((uchar4)(a, (uchar2)0), (uchar4)(b, (uchar2)0), c); \
+    })
+#elif K0 == 3 // K0 == 3
+#define ARM_DOT_K0(a, b, c)                                       \
+    ({                                                            \
+        ARM_DOT((uchar4)(a, (uchar)0), (uchar4)(b, (uchar)0), c); \
+    })
+#elif K0 == 4 // K0 == 4
+#define ARM_DOT_K0(a, b, c) \
+    ({                      \
+        ARM_DOT(a, b, c);   \
+    })
+#elif K0 == 8 // K0 == 8
+#define ARM_DOT_K0(a, b, c)           \
+    ({                                \
+        ARM_DOT(a.s0123, b.s0123, c); \
+        ARM_DOT(a.s4567, b.s4567, c); \
+    })
+#elif K0 == 16 // K0 == 16
+#define ARM_DOT_K0(a, b, c)           \
+    ({                                \
+        ARM_DOT(a.s0123, b.s0123, c); \
+        ARM_DOT(a.s4567, b.s4567, c); \
+        ARM_DOT(a.s89AB, b.s89AB, c); \
+        ARM_DOT(a.sCDEF, b.sCDEF, c); \
+    })
+#else // K0 not supported
+#error "K0 value not supported"
+#endif // K0
+
+#else // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if K0 == 2
+#define ARM_DOT_K0(a, b, c)     \
+    ({                          \
+        c += (uint)a.s0 * b.s0; \
+        c += (uint)a.s1 * b.s1; \
+    })
+#elif K0 == 3 // K0 == 3
+#define ARM_DOT_K0(a, b, c)     \
+    ({                          \
+        c += (uint)a.s0 * b.s0; \
+        c += (uint)a.s1 * b.s1; \
+        c += (uint)a.s2 * b.s2; \
+    })
+#elif K0 == 4 // K0 == 4
+#define ARM_DOT_K0(a, b, c)     \
+    ({                          \
+        c += (uint)a.s0 * b.s0; \
+        c += (uint)a.s1 * b.s1; \
+        c += (uint)a.s2 * b.s2; \
+        c += (uint)a.s3 * b.s3; \
+    })
+#elif K0 == 8 // K0 == 8
+#define ARM_DOT_K0(a, b, c)     \
+    ({                          \
+        c += (uint)a.s0 * b.s0; \
+        c += (uint)a.s1 * b.s1; \
+        c += (uint)a.s2 * b.s2; \
+        c += (uint)a.s3 * b.s3; \
+        c += (uint)a.s4 * b.s4; \
+        c += (uint)a.s5 * b.s5; \
+        c += (uint)a.s6 * b.s6; \
+        c += (uint)a.s7 * b.s7; \
+    })
+#elif K0 == 16 // K0 == 16
+#define ARM_DOT_K0(a, b, c)     \
+    ({                          \
+        c += (uint)a.s0 * b.s0; \
+        c += (uint)a.s1 * b.s1; \
+        c += (uint)a.s2 * b.s2; \
+        c += (uint)a.s3 * b.s3; \
+        c += (uint)a.s4 * b.s4; \
+        c += (uint)a.s5 * b.s5; \
+        c += (uint)a.s6 * b.s6; \
+        c += (uint)a.s7 * b.s7; \
+        c += (uint)a.s8 * b.s8; \
+        c += (uint)a.s9 * b.s9; \
+        c += (uint)a.sA * b.sA; \
+        c += (uint)a.sB * b.sB; \
+        c += (uint)a.sC * b.sC; \
+        c += (uint)a.sD * b.sD; \
+        c += (uint)a.sE * b.sE; \
+        c += (uint)a.sF * b.sF; \
+    })
+#else // K0 not supported
+#error "K0 value not supported"
+#endif // K0
+
+#endif //defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if N0 == 2
+#define ARM_DOT_K0XN0(a, b, c)           \
+    ({                                   \
+        ARM_DOT_K0((a), (b##0), (c.s0)); \
+        ARM_DOT_K0((a), (b##1), (c.s1)); \
+    })
+#elif N0 == 3 // N0 == 3
+#define ARM_DOT_K0XN0(a, b, c)           \
+    ({                                   \
+        ARM_DOT_K0((a), (b##0), (c.s0)); \
+        ARM_DOT_K0((a), (b##1), (c.s1)); \
+        ARM_DOT_K0((a), (b##2), (c.s2)); \
+    })
+#elif N0 == 4 // N0 == 4
+#define ARM_DOT_K0XN0(a, b, c)           \
+    ({                                   \
+        ARM_DOT_K0((a), (b##0), (c.s0)); \
+        ARM_DOT_K0((a), (b##1), (c.s1)); \
+        ARM_DOT_K0((a), (b##2), (c.s2)); \
+        ARM_DOT_K0((a), (b##3), (c.s3)); \
+    })
+#elif N0 == 8 // N0 == 8
+#define ARM_DOT_K0XN0(a, b, c)           \
+    ({                                   \
+        ARM_DOT_K0((a), (b##0), (c.s0)); \
+        ARM_DOT_K0((a), (b##1), (c.s1)); \
+        ARM_DOT_K0((a), (b##2), (c.s2)); \
+        ARM_DOT_K0((a), (b##3), (c.s3)); \
+        ARM_DOT_K0((a), (b##4), (c.s4)); \
+        ARM_DOT_K0((a), (b##5), (c.s5)); \
+        ARM_DOT_K0((a), (b##6), (c.s6)); \
+        ARM_DOT_K0((a), (b##7), (c.s7)); \
+    })
+#elif N0 == 16 // N0 == 16
+#define ARM_DOT_K0XN0(a, b, c)           \
+    ({                                   \
+        ARM_DOT_K0((a), (b##0), (c.s0)); \
+        ARM_DOT_K0((a), (b##1), (c.s1)); \
+        ARM_DOT_K0((a), (b##2), (c.s2)); \
+        ARM_DOT_K0((a), (b##3), (c.s3)); \
+        ARM_DOT_K0((a), (b##4), (c.s4)); \
+        ARM_DOT_K0((a), (b##5), (c.s5)); \
+        ARM_DOT_K0((a), (b##6), (c.s6)); \
+        ARM_DOT_K0((a), (b##7), (c.s7)); \
+        ARM_DOT_K0((a), (b##8), (c.s8)); \
+        ARM_DOT_K0((a), (b##9), (c.s9)); \
+        ARM_DOT_K0((a), (b##A), (c.sA)); \
+        ARM_DOT_K0((a), (b##B), (c.sB)); \
+        ARM_DOT_K0((a), (b##C), (c.sC)); \
+        ARM_DOT_K0((a), (b##D), (c.sD)); \
+        ARM_DOT_K0((a), (b##E), (c.sE)); \
+        ARM_DOT_K0((a), (b##F), (c.sF)); \
+    })
+#else // N0 not supported
+#error "N0 value not supported"
+#endif // N0 conditions
+
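The two macro families above implement the same contraction: ARM_DOT_K0 accumulates a length-K0 uchar dot product into a single uint (via the dot8 instruction when available, otherwise with plain multiply-adds), and ARM_DOT_K0XN0 applies it against N0 RHS blocks to fill one row of the accumulator tile. A plain-C sketch of the equivalent scalar computation, with array-based stand-ins for the OpenCL vector types, is shown below purely for illustration.

#include <stddef.h>
#include <stdint.h>

/* Hedged scalar equivalent of ARM_DOT_K0XN0: for one LHS block 'a' of k0
 * uchars and n0 RHS blocks 'b' (each of k0 uchars, stored back to back),
 * accumulate n0 dot products into 'c'. k0/n0 are runtime sizes here only
 * for illustration; in the kernels they are compile-time constants. */
static void arm_dot_k0xn0_ref(const uint8_t *a, const uint8_t *b,
                              uint32_t *c, size_t k0, size_t n0)
{
    for(size_t n = 0; n < n0; ++n)
    {
        for(size_t k = 0; k < k0; ++k)
        {
            /* Matches "c += (uint)a.sK * b.sK" in the non-dot8 macros. */
            c[n] += (uint32_t)a[k] * (uint32_t)b[n * k0 + k];
        }
    }
}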
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ *  The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 blocks must NOT be transposed
+ *  The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 blocks must be transposed
+ *
+ * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (i.e. -DM0=4, -DN0=8, -DK0=4).
+ * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (i.e. -DV0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
+ * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 2, 3, 4, 5, 6, 7, 8
+ *  - N0 = 2, 3, 4, 8, 16
+ *  - K0 = 2, 3, 4, 8, 16
+ *
+ * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
+ *
+ * @param[in]  lhs_ptr                           Pointer to the LHS reshaped matrix. Supported data type: QASYMM8
+ * @param[in]  lhs_stride_x                      Stride of the LHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  lhs_stride_y                      Stride of the LHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
+ * @param[in]  rhs_ptr                           Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  rhs_stride_x                      Stride of the RHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  rhs_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  rhs_stride_y                      Stride of the RHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  rhs_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix Supported data type: same as @p lhs_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in]  k                                 Number of columns in LHS matrix and rows in RHS matrix not reshaped.
+ * @param[in]  lhs_stride_z                      Stride of the LHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  rhs_stride_z                      Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_cross_plane_pad               (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemmlowp_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
+                                                IMAGE_DECLARATION(rhs),
+                                                IMAGE_DECLARATION(dst),
+                                                uint k,
+                                                uint lhs_stride_z,
+                                                uint rhs_stride_z,
+                                                uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                ,
+                                                uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                               )
+{
+    // Block size
+#define LHS_BLOCK_SIZE ((K0) * (M0))
+
+#if defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (K0)
+#define LHS_STEP_X ((K0) * (V0))
+#define LHS_STEP_LOOP (1)
+#else // defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
+#define LHS_STEP_X (K0)
+#define LHS_STEP_LOOP (V0)
+#endif // defined(LHS_INTERLEAVE)
+
+    // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+    // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+    // Compute LHS matrix address
+    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X + (get_global_id(1) / V0) * (uint)lhs_stride_y +
+                               (get_global_id(2) * lhs_stride_z);
+
+    // Compute RHS matrix address
+    __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X + (get_global_id(0) / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    rhs_addr += get_global_id(2) * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    // Initialize the accumulators
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(uint, N0), c, 0); //VEC_DATA_TYPE(uint, N0)    c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+    for(int i = 0; i < k; i += K0)
+    {
+        // Supported cases (M0, K0):
+        // 2,4 - 2,8 - 2,16
+        // 3,4 - 3,8 - 3,16
+        // 4,4 - 4,8 - 4,16
+        // 5,4 - 5,8 - 5,16
+        // 6,4 - 6,8 - 6,16
+        // Load values from LHS matrix
+        VEC_DATA_TYPE(uchar, K0)
+        a0 = VLOAD(K0)(0, lhs_addr + 0 * LHS_STEP_X);
+#if M0 > 1
+        VEC_DATA_TYPE(uchar, K0)
+        a1 = VLOAD(K0)(0, lhs_addr + 1 * LHS_STEP_X);
+#endif // M0 > 1
+#if M0 > 2
+        VEC_DATA_TYPE(uchar, K0)
+        a2 = VLOAD(K0)(0, lhs_addr + 2 * LHS_STEP_X);
+#endif // M0 > 2
+#if M0 > 3
+        VEC_DATA_TYPE(uchar, K0)
+        a3 = VLOAD(K0)(0, lhs_addr + 3 * LHS_STEP_X);
+#endif // M0 > 3
+#if M0 > 4
+        VEC_DATA_TYPE(uchar, K0)
+        a4 = VLOAD(K0)(0, lhs_addr + 4 * LHS_STEP_X);
+#endif // M0 > 4
+#if M0 > 5
+        VEC_DATA_TYPE(uchar, K0)
+        a5 = VLOAD(K0)(0, lhs_addr + 5 * LHS_STEP_X);
+#endif // M0 > 5
+#if M0 > 6
+        VEC_DATA_TYPE(uchar, K0)
+        a6 = VLOAD(K0)(0, lhs_addr + 6 * LHS_STEP_X);
+#endif // M0 > 6
+#if M0 > 7
+        VEC_DATA_TYPE(uchar, K0)
+        a7 = VLOAD(K0)(0, lhs_addr + 7 * LHS_STEP_X);
+#endif // M0 > 7
+
+        // Load values from RHS matrix
+        VEC_DATA_TYPE(uchar, K0)
+        b0 = VLOAD(K0)(0, rhs_addr + 0 * RHS_STEP_X);
+        VEC_DATA_TYPE(uchar, K0)
+        b1 = VLOAD(K0)(0, rhs_addr + 1 * RHS_STEP_X);
+#if N0 > 2
+        VEC_DATA_TYPE(uchar, K0)
+        b2 = VLOAD(K0)(0, rhs_addr + 2 * RHS_STEP_X);
+#endif // N0 > 2
+#if N0 > 3
+        VEC_DATA_TYPE(uchar, K0)
+        b3 = VLOAD(K0)(0, rhs_addr + 3 * RHS_STEP_X);
+#endif // N0 > 3
+#if N0 > 4
+        VEC_DATA_TYPE(uchar, K0)
+        b4 = VLOAD(K0)(0, rhs_addr + 4 * RHS_STEP_X);
+        VEC_DATA_TYPE(uchar, K0)
+        b5 = VLOAD(K0)(0, rhs_addr + 5 * RHS_STEP_X);
+        VEC_DATA_TYPE(uchar, K0)
+        b6 = VLOAD(K0)(0, rhs_addr + 6 * RHS_STEP_X);
+        VEC_DATA_TYPE(uchar, K0)
+        b7 = VLOAD(K0)(0, rhs_addr + 7 * RHS_STEP_X);
+#endif // N0 > 4
+#if N0 > 8
+        VEC_DATA_TYPE(uchar, K0)
+        b8 = VLOAD(K0)(0, rhs_addr + 8 * RHS_STEP_X);
+        VEC_DATA_TYPE(uchar, K0)
+        b9 = VLOAD(K0)(0, rhs_addr + 9 * RHS_STEP_X);
+        VEC_DATA_TYPE(uchar, K0)
+        bA = VLOAD(K0)(0, rhs_addr + 10 * RHS_STEP_X);
+        VEC_DATA_TYPE(uchar, K0)
+        bB = VLOAD(K0)(0, rhs_addr + 11 * RHS_STEP_X);
+        VEC_DATA_TYPE(uchar, K0)
+        bC = VLOAD(K0)(0, rhs_addr + 12 * RHS_STEP_X);
+        VEC_DATA_TYPE(uchar, K0)
+        bD = VLOAD(K0)(0, rhs_addr + 13 * RHS_STEP_X);
+        VEC_DATA_TYPE(uchar, K0)
+        bE = VLOAD(K0)(0, rhs_addr + 14 * RHS_STEP_X);
+        VEC_DATA_TYPE(uchar, K0)
+        bF = VLOAD(K0)(0, rhs_addr + 15 * RHS_STEP_X);
+#endif // N0 > 8
+
+        // Accumulate
+        ARM_DOT_K0XN0(a0, b, c0);
+#if M0 > 1
+        ARM_DOT_K0XN0(a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+        ARM_DOT_K0XN0(a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+        ARM_DOT_K0XN0(a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+        ARM_DOT_K0XN0(a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+        ARM_DOT_K0XN0(a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+        ARM_DOT_K0XN0(a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+        ARM_DOT_K0XN0(a7, b, c7);
+#endif // M0 > 7
+
+        lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP);
+        rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP);
+    }
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(int)) + (get_global_id(1) * (uint)M0 * dst_stride_y);
+
+    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    zout0 = (0 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout0 = min((uint)(DEPTH_GEMM3D - 1), zout0);
+    zout0 *= (dst_cross_plane_pad * dst_stride_y);
+#if M0 > 1
+    zout1 = (1 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout1 = min((uint)(DEPTH_GEMM3D - 1), zout1);
+    zout1 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 1
+#if M0 > 2
+    zout2 = (2 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout2 = min((uint)(DEPTH_GEMM3D - 1), zout2);
+    zout2 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 2
+#if M0 > 3
+    zout3 = (3 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout3 = min((uint)(DEPTH_GEMM3D - 1), zout3);
+    zout3 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 3
+#if M0 > 4
+    zout4 = (4 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout4 = min((uint)(DEPTH_GEMM3D - 1), zout4);
+    zout4 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 4
+#if M0 > 5
+    zout5 = (5 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout5 = min((uint)(DEPTH_GEMM3D - 1), zout5);
+    zout5 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 5
+#if M0 > 6
+    zout6 = (6 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout6 = min((uint)(DEPTH_GEMM3D - 1), zout6);
+    zout6 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 6
+#if M0 > 7
+    zout7 = (7 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zout7 = min((uint)(DEPTH_GEMM3D - 1), zout7);
+    zout7 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 7
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    dst_addr += get_global_id(2) * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Store output block
+    VSTORE(N0)
+    (CONVERT_SAT(c0, VEC_DATA_TYPE(int, N0)), 0, (__global int *)(dst_addr + 0 * dst_stride_y + zout0));
+#if M0 > 1
+    VSTORE(N0)
+    (CONVERT_SAT(c1, VEC_DATA_TYPE(int, N0)), 0, (__global int *)(dst_addr + 1 * dst_stride_y + zout1));
+#endif // M0 > 1
+#if M0 > 2
+    VSTORE(N0)
+    (CONVERT_SAT(c2, VEC_DATA_TYPE(int, N0)), 0, (__global int *)(dst_addr + 2 * dst_stride_y + zout2));
+#endif // M0 > 2
+#if M0 > 3
+    VSTORE(N0)
+    (CONVERT_SAT(c3, VEC_DATA_TYPE(int, N0)), 0, (__global int *)(dst_addr + 3 * dst_stride_y + zout3));
+#endif // M0 > 3
+#if M0 > 4
+    VSTORE(N0)
+    (CONVERT_SAT(c4, VEC_DATA_TYPE(int, N0)), 0, (__global int *)(dst_addr + 4 * dst_stride_y + zout4));
+#endif // M0 > 4
+#if M0 > 5
+    VSTORE(N0)
+    (CONVERT_SAT(c5, VEC_DATA_TYPE(int, N0)), 0, (__global int *)(dst_addr + 5 * dst_stride_y + zout5));
+#endif // M0 > 5
+#if M0 > 6
+    VSTORE(N0)
+    (CONVERT_SAT(c6, VEC_DATA_TYPE(int, N0)), 0, (__global int *)(dst_addr + 6 * dst_stride_y + zout6));
+#endif // M0 > 6
+#if M0 > 7
+    VSTORE(N0)
+    (CONVERT_SAT(c7, VEC_DATA_TYPE(int, N0)), 0, (__global int *)(dst_addr + 7 * dst_stride_y + zout7));
+#endif // M0 > 7
+
+#undef LHS_BLOCK_SIZE
+#undef LHS_OFFSET_X
+#undef LHS_STEP_X
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+
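The REINTERPRET_OUTPUT_AS_3D bookkeeping in the kernel above computes, for each of the M0 rows of the tile, which output plane the row lands on and how many bytes of cross-plane padding must be skipped before the store. A small sketch of that arithmetic follows; the function and its example values are illustrative only.

#include <stdint.h>

/* Hedged sketch of the zout computation in gemmlowp_mm_reshaped_lhs_nt_rhs_t:
 * each output row (gid_y * M0 + i) is mapped to a plane index, clamped to the
 * last plane, and converted into a byte offset of cross-plane padding. */
static uint32_t zout_offset(uint32_t gid_y, uint32_t i, uint32_t m0,
                            uint32_t height_gemm3d, uint32_t depth_gemm3d,
                            uint32_t cross_plane_pad, uint32_t dst_stride_y)
{
    uint32_t plane = (i + gid_y * m0) / height_gemm3d;                        /* which 2D plane */
    plane          = (plane < depth_gemm3d - 1) ? plane : depth_gemm3d - 1;   /* clamp to last  */
    return plane * cross_plane_pad * dst_stride_y;                            /* bytes to skip  */
}
/* Example: gid_y = 3, i = 1, M0 = 4, HEIGHT_GEMM3D = 8 -> row 13 lies on
 * plane 1, so one cross_plane_pad block is skipped before the store. */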
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices using the dot8 instruction.
+ *  The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 blocks must NOT be transposed
+ *  The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 blocks must be transposed
+ *
+ * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (i.e. -DM0=4, -DN0=8, -DK0=4).
+ * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (i.e. -DV0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
+ * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 2, 3, 4, 5, 6, 7, 8
+ *  - N0 = 2, 3, 4, 8, 16
+ *  - K0 = 2, 3, 4, 8, 16
+ *
+ * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
+ *
+ * @param[in]  lhs_ptr                           Pointer to the LHS reshaped matrix. Supported data type: QASYMM8
+ * @param[in]  lhs_stride_x                      Stride of the LHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  lhs_stride_y                      Stride of the LHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
+ * @param[in]  rhs_ptr                           Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  rhs_stride_x                      Stride of the RHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  rhs_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  rhs_stride_y                      Stride of the RHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  rhs_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix Supported data type: same as @p lhs_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in]  k                                 Number of columns in LHS matrix and rows in RHS matrix not reshaped.
+ * @param[in]  lhs_stride_z                      Stride of the LHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  rhs_stride_z                      Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_cross_plane_pad               (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemmlowp_mm_reshaped_lhs_nt_rhs_t_dot8(IMAGE_DECLARATION(lhs),
+                                                     IMAGE_DECLARATION(rhs),
+                                                     IMAGE_DECLARATION(dst),
+                                                     uint k,
+                                                     uint lhs_stride_z,
+                                                     uint rhs_stride_z,
+                                                     uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                     ,
+                                                     uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                    )
+{
+    // Note: ARM_DOT_K0XN0 is generated with the dot8 instruction
+    gemmlowp_mm_reshaped_lhs_nt_rhs_t(lhs_ptr,
+                                      lhs_stride_x,
+                                      lhs_step_x,
+                                      lhs_stride_y,
+                                      lhs_step_y,
+                                      lhs_offset_first_element_in_bytes,
+                                      rhs_ptr,
+                                      rhs_stride_x,
+                                      rhs_step_x,
+                                      rhs_stride_y,
+                                      rhs_step_y,
+                                      rhs_offset_first_element_in_bytes,
+                                      dst_ptr,
+                                      dst_stride_x,
+                                      dst_step_x,
+                                      dst_stride_y,
+                                      dst_step_y,
+                                      dst_offset_first_element_in_bytes,
+                                      k,
+                                      lhs_stride_z,
+                                      rhs_stride_z,
+                                      dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                      ,
+                                      dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                     );
+}
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0)
+
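All the kernels in the block above are specialised entirely at build time: M0/N0/K0 describe the tile handled per work-item, V0/H0 describe how many LHS/RHS blocks share an output row of the reshaped matrices, and LHS_INTERLEAVE/RHS_INTERLEAVE indicate whether those blocks were interleaved during reshaping. A hedged example of the kind of option string a host could pass when compiling this file is shown below; the concrete values are illustrative, picked from the supported sets listed in the kernel documentation, and the real values are chosen elsewhere by the library at runtime.

/* Illustrative build options for gemmlowp_mm_reshaped_lhs_nt_rhs_t. */
static const char *gemmlowp_reshaped_build_opts =
    "-DM0=4 -DN0=4 -DK0=16 "    /* 4x4 output tile, 16-wide K blocks          */
    "-DV0=4 -DH0=1 "            /* blocks per row of reshaped LHS / RHS       */
    "-DLHS_INTERLEAVE "         /* LHS blocks were interleaved when reshaped  */
    "-DREINTERPRET_OUTPUT_AS_3D -DHEIGHT_GEMM3D=8 -DDEPTH_GEMM3D=2";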
 #if defined(COLS_A)
 /** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
  *
diff --git a/src/core/CL/cl_kernels/generate_proposals.cl b/src/core/CL/cl_kernels/generate_proposals.cl
index bc6f4b5..a947dad 100644
--- a/src/core/CL/cl_kernels/generate_proposals.cl
+++ b/src/core/CL/cl_kernels/generate_proposals.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
index 7ee97d9..180bd50 100644
--- a/src/core/CL/cl_kernels/helpers.h
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -50,6 +50,9 @@
 #define VSTORE_STR(size) vstore##size
 #define VSTORE(size) VSTORE_STR(size)
 
+#define float1 float
+#define half1 half
+
 #define VEC_DATA_TYPE_STR(type, size) type##size
 #define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
 
diff --git a/src/core/CL/cl_kernels/im2col.cl b/src/core/CL/cl_kernels/im2col.cl
index 186d5a8..2bf59e4 100644
--- a/src/core/CL/cl_kernels/im2col.cl
+++ b/src/core/CL/cl_kernels/im2col.cl
@@ -1029,6 +1029,177 @@
 #endif // HAS_BIAS
 }
 
+#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+#define IM2COL1x9(i)                                                                                                                                                       \
+    ({                                                                                                                                                                     \
+        yi_coord = yi - (int)PAD_TOP + i * DILATION_Y;                                                                                                                     \
+        yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));                                                                                                            \
+        \
+        offset0 = xi_offset0 + (yi_coord * (int)src_stride_z);                                                                                                             \
+        offset1 = xi_offset1 + (yi_coord * (int)src_stride_z);                                                                                                             \
+        \
+        VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0));                                                                          \
+        VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1));                                                                          \
+        VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2));                                                                          \
+        VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3));                                                                          \
+        VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4));                                                                          \
+        VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5));                                                                          \
+        VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6));                                                                          \
+        VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7));                                                                          \
+        VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1));                                                                             \
+        \
+        int y_cond = (int)((uint)(yi - (int)PAD_TOP + i * DILATION_Y) >= (uint)(SRC_HEIGHT));                                                                              \
+        values0    = select(values0, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond0.s0)); \
+        values1    = select(values1, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond0.s1)); \
+        values2    = select(values2, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond0.s2)); \
+        values3    = select(values3, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond0.s3)); \
+        values4    = select(values4, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond0.s4)); \
+        values5    = select(values5, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond0.s5)); \
+        values6    = select(values6, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond0.s6)); \
+        values7    = select(values7, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond0.s7)); \
+        values8    = select(values8, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond1));    \
+        \
+        VSTORE(VECTOR_SIZE)                                                                                                                                                \
+        (values0, 0, (__global DATA_TYPE *)(output_ptr) + (0 + i * 9) * SRC_DEPTH);                                                                                        \
+        VSTORE(VECTOR_SIZE)                                                                                                                                                \
+        (values1, 0, (__global DATA_TYPE *)(output_ptr) + (1 + i * 9) * SRC_DEPTH);                                                                                        \
+        VSTORE(VECTOR_SIZE)                                                                                                                                                \
+        (values2, 0, (__global DATA_TYPE *)(output_ptr) + (2 + i * 9) * SRC_DEPTH);                                                                                        \
+        VSTORE(VECTOR_SIZE)                                                                                                                                                \
+        (values3, 0, (__global DATA_TYPE *)(output_ptr) + (3 + i * 9) * SRC_DEPTH);                                                                                        \
+        VSTORE(VECTOR_SIZE)                                                                                                                                                \
+        (values4, 0, (__global DATA_TYPE *)(output_ptr) + (4 + i * 9) * SRC_DEPTH);                                                                                        \
+        VSTORE(VECTOR_SIZE)                                                                                                                                                \
+        (values5, 0, (__global DATA_TYPE *)(output_ptr) + (5 + i * 9) * SRC_DEPTH);                                                                                        \
+        VSTORE(VECTOR_SIZE)                                                                                                                                                \
+        (values6, 0, (__global DATA_TYPE *)(output_ptr) + (6 + i * 9) * SRC_DEPTH);                                                                                        \
+        VSTORE(VECTOR_SIZE)                                                                                                                                                \
+        (values7, 0, (__global DATA_TYPE *)(output_ptr) + (7 + i * 9) * SRC_DEPTH);                                                                                        \
+        VSTORE(VECTOR_SIZE)                                                                                                                                                \
+        (values8, 0, (__global DATA_TYPE *)(output_ptr) + (8 + i * 9) * SRC_DEPTH);                                                                                        \
+    })
+#else // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+#define IM2COL1x9(i)                                                                              \
+    ({                                                                                            \
+        yi_coord = yi - (int)PAD_TOP + i * DILATION_Y;                                            \
+        yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));                                   \
+        \
+        offset0 = xi_offset0 + (yi_coord * (int)src_stride_z);                                    \
+        offset1 = xi_offset1 + (yi_coord * (int)src_stride_z);                                    \
+        \
+        VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0)); \
+        VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1)); \
+        VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2)); \
+        VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3)); \
+        VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4)); \
+        VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5)); \
+        VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6)); \
+        VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7)); \
+        VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1));    \
+        \
+        VSTORE(VECTOR_SIZE)                                                                       \
+        (values0, 0, (__global DATA_TYPE *)(output_ptr) + (0 + i * 9) * SRC_DEPTH);               \
+        VSTORE(VECTOR_SIZE)                                                                       \
+        (values1, 0, (__global DATA_TYPE *)(output_ptr) + (1 + i * 9) * SRC_DEPTH);               \
+        VSTORE(VECTOR_SIZE)                                                                       \
+        (values2, 0, (__global DATA_TYPE *)(output_ptr) + (2 + i * 9) * SRC_DEPTH);               \
+        VSTORE(VECTOR_SIZE)                                                                       \
+        (values3, 0, (__global DATA_TYPE *)(output_ptr) + (3 + i * 9) * SRC_DEPTH);               \
+        VSTORE(VECTOR_SIZE)                                                                       \
+        (values4, 0, (__global DATA_TYPE *)(output_ptr) + (4 + i * 9) * SRC_DEPTH);               \
+        VSTORE(VECTOR_SIZE)                                                                       \
+        (values5, 0, (__global DATA_TYPE *)(output_ptr) + (5 + i * 9) * SRC_DEPTH);               \
+        VSTORE(VECTOR_SIZE)                                                                       \
+        (values6, 0, (__global DATA_TYPE *)(output_ptr) + (6 + i * 9) * SRC_DEPTH);               \
+        VSTORE(VECTOR_SIZE)                                                                       \
+        (values7, 0, (__global DATA_TYPE *)(output_ptr) + (7 + i * 9) * SRC_DEPTH);               \
+        VSTORE(VECTOR_SIZE)                                                                       \
+        (values8, 0, (__global DATA_TYPE *)(output_ptr) + (8 + i * 9) * SRC_DEPTH);               \
+    })
+#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+
+/** This kernel performs im2col when the kernel size is 9x9 and the data layout is NHWC
+ *
+ * @note This kernel computes VECTOR_SIZE elements
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
+ * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1
+ * @note In case biases are to be added to the convolution, -DHAS_BIAS must be passed at compile time so that a 1 is appended to each row of the output matrix.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col9x9_nhwc(
+    TENSOR3D_DECLARATION(src),
+    IMAGE_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    const int ch    = min((int)(get_global_id(0) * VECTOR_SIZE), LAST_ACCESSED); // input feature map
+    const int yo    = get_global_id(1);
+    const int batch = get_global_id(2); // batch size
+
+    // Calculate input indices
+    const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
+    const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;
+
+    // Get input and output address
+    __global uchar *input_ptr  = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
+
+    int  yi_coord = 0;
+    int8 offset0  = 0;
+    int  offset1  = 0;
+
+    // Clamp xi
+    int8 xi_offset0 = ((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT);
+    int  xi_offset1 = ((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT);
+
+#if PAD_TOP != 0 || PAD_BOTTOM != 0
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+    xi_offset0 = CLAMP(xi_offset0, (int8)0, (int8)(SRC_WIDTH - 1));
+    xi_offset1 = CLAMP(xi_offset1, (int)0, (int)(SRC_WIDTH - 1));
+#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
+    xi_offset0 *= (int8)src_stride_y;
+    xi_offset1 *= (int)src_stride_y;
+
+    // Out-of-bound condition for X
+    int8 x_cond0 = (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) < (int8)0) || (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) >= (int8)SRC_WIDTH);
+    int  x_cond1 = (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) < (int)0) || (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH);
+
+    IM2COL1x9(0);
+    IM2COL1x9(1);
+    IM2COL1x9(2);
+    IM2COL1x9(3);
+    IM2COL1x9(4);
+    IM2COL1x9(5);
+    IM2COL1x9(6);
+    IM2COL1x9(7);
+    IM2COL1x9(8);
+
+#ifdef HAS_BIAS
+    if((ch + VECTOR_SIZE) >= SRC_DEPTH)
+    {
+        *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 81) = 1.0f;
+    }
+#endif // HAS_BIAS
+}
+
 /** This opencl kernel performs a generic im2col implementation when the data layout is NHWC
  *
  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
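As a reading aid for the im2col9x9_nhwc kernel above (and not part of the patch), the standalone C sketch below mirrors its output addressing: output_ptr already includes the channel offset, and each of the 81 kernel positions stores its chunk at (kx + ky * 9) * SRC_DEPTH elements from the start of the row. The helper name im2col_9x9_row_offset is hypothetical.

#include <stdio.h>

/* Hypothetical helper: offset (in elements) of channel `ch` for kernel
 * position (kx, ky) within one im2col output row, matching the
 * (j + i * 9) * SRC_DEPTH addressing used by the kernel above. */
static int im2col_9x9_row_offset(int kx, int ky, int ch, int src_depth)
{
    return (kx + ky * 9) * src_depth + ch;
}

int main(void)
{
    const int src_depth = 3; /* illustrative value */
    /* Channel 1 of kernel position (4, 2) with SRC_DEPTH = 3. */
    printf("offset = %d\n", im2col_9x9_row_offset(4, 2, 1, src_depth)); /* (4 + 18) * 3 + 1 = 67 */
    return 0;
}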
diff --git a/src/core/CL/cl_kernels/normalization_layer.cl b/src/core/CL/cl_kernels/normalization_layer.cl
index 0b6df39..390f8fc 100644
--- a/src/core/CL/cl_kernels/normalization_layer.cl
+++ b/src/core/CL/cl_kernels/normalization_layer.cl
@@ -32,6 +32,7 @@
 #define LOAD_OP(offset, ptr) vload4(offset, ptr)
 #define STORE_OP(data, offset, ptr) vstore4(data, offset, ptr)
 
+#if defined(NUM_SLICES)
 /** Apply cross-map normalization.
  *
  * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
@@ -91,9 +92,10 @@
 
     STORE_OP(normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
 }
+#endif /* defined(NUM_SLICES) */
 
 #if defined(WIDTH_SIZE)
-/** Apply in-map normalization.
+/** Apply in-map normalization when tensors are in the NCHW data layout format.
  *
  * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
  * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
@@ -117,8 +119,8 @@
  * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
-__kernel void normalization_layer_in_map(TENSOR3D_DECLARATION(input),
-                                         TENSOR3D_DECLARATION(output))
+__kernel void normalization_layer_in_map_nchw(TENSOR3D_DECLARATION(input),
+                                              TENSOR3D_DECLARATION(output))
 {
     Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(input);
     Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
@@ -170,3 +172,83 @@
     STORE_OP(normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
 }
 #endif // defined(WIDTH_SIZE)
+
+#if defined(NUM_SLICES)
+/** Apply in-map normalization when tensors are in the NHWC data layout format.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
+ * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
+ * @note The number of slices should be given as a preprocessor argument using -DNUM_SLICES=size. e.g. -DNUM_SLICES=192
+ * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DBETA and -DKAPPA
+ *
+ * @param[in]  input_ptr                            Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void normalization_layer_in_map_nhwc(TENSOR3D_DECLARATION(input),
+                                              TENSOR3D_DECLARATION(output))
+{
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    acc = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0;
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    coeff_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(COEFF);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    beta_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(BETA);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    kappa_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(KAPPA);
+
+    const int current_cols = get_global_id(1);
+    const int first_col    = max(-(int)RADIUS, -current_cols);
+    const int last_col     = min((int)RADIUS, (int)get_global_size(1) - 1 - current_cols);
+
+#if defined(IN_MAP_2D)
+    const int current_rows = get_global_id(2);
+    const int first_row    = max(-(int)RADIUS, -current_rows);
+    const int last_row     = min((int)RADIUS, (int)NUM_SLICES - 1 - current_rows);
+#endif /* defined(IN_MAP_2D) */
+
+#if defined(IN_MAP_2D)
+    for(int j = first_row; j <= last_row; ++j)
+    {
+#endif /* defined(IN_MAP_2D) */
+        for(int i = first_col; i <= last_col; ++i)
+        {
+#if defined(IN_MAP_2D)
+            VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+            values = LOAD_OP(0, (__global DATA_TYPE *)tensor3D_offset(&in, 0, i, j));
+#else  /* defined(IN_MAP_2D) */
+            VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+            values = LOAD_OP(0, (__global DATA_TYPE *)tensor3D_offset(&in, 0, i, 0));
+#endif /* defined(IN_MAP_2D) */
+            acc = ADD_OP(acc, MUL_OP(values, values));
+        }
+#if defined(IN_MAP_2D)
+    }
+#endif /* defined(IN_MAP_2D) */
+
+    acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    normalized = POW_OP(acc, beta_v);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    normalized_pixel = DIV_OP(LOAD_OP(0, (__global DATA_TYPE *)in.ptr), normalized);
+
+    STORE_OP(normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
+}
+#endif /* defined(NUM_SLICES) */
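The new normalization_layer_in_map_nhwc kernel accumulates squared values over a window clamped to the tensor borders, then divides the centre value by (kappa + coeff * acc)^beta. The scalar 1D C sketch below follows the same steps on a plain array; the coefficient values are assumptions chosen only for illustration, and the kernel itself does this for VEC_SIZE channels at once.

#include <math.h>
#include <stdio.h>

/* Scalar sketch of 1D in-map normalization for element x:
 * out[x] = in[x] / (kappa + coeff * sum(in[i]^2, i in [x-radius, x+radius]))^beta,
 * with the window clipped to the valid range [0, len), as in the kernel above. */
static float in_map_normalize_1d(const float *in, int len, int x,
                                 int radius, float coeff, float beta, float kappa)
{
    const int first = (x - radius < 0) ? 0 : x - radius;
    const int last  = (x + radius > len - 1) ? len - 1 : x + radius;

    float acc = 0.0f;
    for(int i = first; i <= last; ++i)
    {
        acc += in[i] * in[i];
    }
    return in[x] / powf(kappa + coeff * acc, beta);
}

int main(void)
{
    const float in[5] = { 1.f, 2.f, 3.f, 4.f, 5.f };
    /* Illustrative (assumed) values: radius 2, coeff 0.0001, beta 0.75, kappa 1. */
    printf("%f\n", in_map_normalize_1d(in, 5, 0, 2, 0.0001f, 0.75f, 1.0f));
    return 0;
}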
diff --git a/src/core/CL/cl_kernels/permute.cl b/src/core/CL/cl_kernels/permute.cl
index 03fc15e..77f03f7 100644
--- a/src/core/CL/cl_kernels/permute.cl
+++ b/src/core/CL/cl_kernels/permute.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,11 +23,12 @@
  */
 #include "helpers.h"
 
-#if defined(DATA_TYPE) && defined(DEPTH_IN)
-/** Perform a DCHW -> DHWC permute operation on an input tensor.
+#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4)
+/** Perform a permute operation on an input tensor of shape DCHW.
  *
  * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
  * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
+ * @attention The permutation vector must be passed as preprocessor arguments using -DP1, -DP2, -DP3 and -DP4, e.g. -DP1=2, -DP2=1, -DP3=0 and -DP4=3.
  *
  * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
  * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
@@ -48,81 +49,26 @@
  * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
  */
-__kernel void permute_201(
-    TENSOR4D_DECLARATION(input),
-    TENSOR4D_DECLARATION(output))
+__kernel void permute(TENSOR4D_DECLARATION(input),
+                      TENSOR4D_DECLARATION(output))
+
 {
     Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
     Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
 
-    *((__global DATA_TYPE *)tensor4D_offset(&out, (get_global_id(2) % DEPTH_IN), get_global_id(0), get_global_id(1), (get_global_id(2) / DEPTH_IN))) = *((__global DATA_TYPE *)in.ptr);
+    int out_index[4] = { 0 };
+    int in_index[4]  = { 0 };
+
+    in_index[0] = get_global_id(0);            // W
+    in_index[1] = get_global_id(1);            // H
+    in_index[2] = get_global_id(2) % DEPTH_IN; // C
+    in_index[3] = get_global_id(2) / DEPTH_IN; // B
+
+    out_index[0] = in_index[P1];
+    out_index[1] = in_index[P2];
+    out_index[2] = in_index[P3];
+    out_index[3] = in_index[P4];
+
+    *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2], out_index[3])) = *((__global DATA_TYPE *)in.ptr);
 }
-
-/** Perform a DCHW -> DWCH permute operation on an input tensor.
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void permute_120(
-    TENSOR4D_DECLARATION(input),
-    TENSOR4D_DECLARATION(output))
-{
-    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
-
-    *((__global DATA_TYPE *)tensor4D_offset(&out, get_global_id(1), (get_global_id(2) % DEPTH_IN), get_global_id(0), (get_global_id(2) / DEPTH_IN))) = *((__global DATA_TYPE *)in.ptr);
-}
-
-/** Perform a DCHW -> HWCD permute operation on an input tensor.
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void permute_3201(
-    TENSOR4D_DECLARATION(input),
-    TENSOR4D_DECLARATION(output))
-{
-    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
-
-    *((__global DATA_TYPE *)tensor4D_offset(&out, (get_global_id(2) / DEPTH_IN), (get_global_id(2) % DEPTH_IN), get_global_id(0), get_global_id(1))) = *((__global DATA_TYPE *)in.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(DEPTH_IN)
+#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4)
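The three fixed permute kernels are replaced above by a single generic kernel that remaps a 4D index through the compile-time vector P1..P4. The small C sketch below reproduces that remap on the host side; the perm values {2, 0, 1, 3} are an assumption used only for illustration and, following the index definitions in the kernel, correspond to the old permute_201 mapping (W, H, C, B) -> (C, W, H, B).

#include <stdio.h>

/* Sketch of the index remapping done by the generic permute kernel:
 * out_index[i] = in_index[perm[i]] for a 4D coordinate (W, H, C, B). */
static void permute_index(const int in_index[4], const int perm[4], int out_index[4])
{
    for(int i = 0; i < 4; ++i)
    {
        out_index[i] = in_index[perm[i]];
    }
}

int main(void)
{
    const int in_index[4] = { 5, 7, 2, 0 };  /* W, H, C, B */
    const int perm[4]     = { 2, 0, 1, 3 };  /* illustrative -DP1=2 -DP2=0 -DP3=1 -DP4=3 */
    int       out_index[4];

    permute_index(in_index, perm, out_index);
    printf("%d %d %d %d\n", out_index[0], out_index[1], out_index[2], out_index[3]); /* 2 5 7 0 */
    return 0;
}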
diff --git a/src/core/CL/cl_kernels/prior_box_layer.cl b/src/core/CL/cl_kernels/prior_box_layer.cl
index be072ec..046151b 100644
--- a/src/core/CL/cl_kernels/prior_box_layer.cl
+++ b/src/core/CL/cl_kernels/prior_box_layer.cl
@@ -104,88 +104,6 @@
 
     return idx;
 }
-
-/** Compute prior boxes and clip (NHWC)
- *
- * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: F32
- * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in]  idx                                  Index to write to
- * @param[in]  center_x                             Center value of the x axis
- * @param[in]  center_y                             Center value of the y axis
- * @param[in]  box_width                            Prior box width
- * @param[in]  box_height                           Prior box height
- *
- */
-inline void calculate_xy_min_max_nhwc(Tensor3D *out, int idx, float center_x, float center_y, float box_width, float box_height)
-{
-    float xmin = (center_x - box_width / 2.f) / WIDTH;
-    float ymin = (center_y - box_height / 2.f) / HEIGHT;
-    float xmax = (center_x + box_width / 2.f) / WIDTH;
-    float ymax = (center_y + box_height / 2.f) / HEIGHT;
-
-#if defined(CLIP)
-    xmin = clamp(xmin, 0.f, 1.f);
-    ymin = clamp(ymin, 0.f, 1.f);
-    xmax = clamp(xmax, 0.f, 1.f);
-    ymax = clamp(ymax, 0.f, 1.f);
-#endif // defined(CLIP)
-
-    *((__global DATA_TYPE *)tensor3D_offset(out, 0, idx + 0, 0)) = xmin;
-    *((__global DATA_TYPE *)tensor3D_offset(out, 0, idx + 1, 0)) = ymin;
-    *((__global DATA_TYPE *)tensor3D_offset(out, 0, idx + 2, 0)) = xmax;
-    *((__global DATA_TYPE *)tensor3D_offset(out, 0, idx + 3, 0)) = ymax;
-}
-
-/** Compute prior boxes (NHWC)
- *
- * @param[in,out] out                Tensor output
- * @param[in]     max                The maximum values
- * @param[in]     aspect_ratios      The aspect ratio values
- * @param[in]     max_size           The maximum values values size
- * @param[in]     aspect_ratios_size The aspect ratio values size
- * @param[in]     min_size           The minimum values size
- * @param[in]     min_idx            Index of the min vector
- * @param[in]     idx                Index to write to
- *
- * @return The updated index
- */
-inline int calculate_min_nhwc(Tensor3D *out, __global float *max, __global float *aspect_ratios, int max_size, int aspect_ratios_size, float min_size, int min_idx, int idx)
-{
-    const float center_x = ((float)(get_global_id(1) % LAYER_WIDTH) + OFFSET) * STEP_X;
-    const float center_y = ((float)(get_global_id(1) / LAYER_WIDTH) + OFFSET) * STEP_Y;
-
-    float box_width  = min_size;
-    float box_height = min_size;
-
-    calculate_xy_min_max_nhwc(out, idx, center_x, center_y, box_width, box_height);
-    idx += 4;
-    if(max_size > 0)
-    {
-        box_width  = sqrt(min_size * max[min_idx]);
-        box_height = box_width;
-        calculate_xy_min_max_nhwc(out, idx, center_x, center_y, box_width, box_height);
-        idx += 4;
-    }
-    for(unsigned int i = 0; i < aspect_ratios_size; ++i)
-    {
-        if(fabs(aspect_ratios[i] - 1.f) < 1e-6f)
-        {
-            continue;
-        }
-        box_width  = min_size * sqrt(aspect_ratios[i]);
-        box_height = min_size * rsqrt(aspect_ratios[i]);
-
-        calculate_xy_min_max_nhwc(out, idx, center_x, center_y, box_width, box_height);
-        idx += 4;
-    }
-
-    return idx;
-}
-
 /** Calculate prior boxes with NCHW format.
  *
  * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: F32
@@ -218,39 +136,4 @@
         vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(VARIANCE_0, VARIANCE_1, VARIANCE_2, VARIANCE_3), 0, ((__global DATA_TYPE *)offset(&out, i, 1)));
     }
 }
-
-/** Calculate prior boxes with NHWC format.
- *
- * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: F32
- * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in]  min                                  The minimum values
- * @param[in]  max                                  The maximum_values
- * @param[in]  aspect_ratios                        The aspect ratio values
- * @param[in]  min_size                             The minimum values size
- * @param[in]  max_size                             The maximum_values values size
- * @param[in]  aspect_ratios_size                   The aspect ratio values size
- */
-__kernel void prior_box_layer_nhwc(TENSOR3D_DECLARATION(output), __global float *min, __global float *max, __global float *aspect_ratios, unsigned int min_size, unsigned int max_size,
-                                   unsigned int aspect_ratios_size)
-{
-    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-    int idx = 0;
-    for(unsigned int i = 0; i < min_size; ++i)
-    {
-        idx = calculate_min_nhwc(&out, max, aspect_ratios, max_size, aspect_ratios_size, min[i], i, idx);
-    }
-
-    for(int i = 0; i < (NUM_PRIORS * 4); i += 4)
-    {
-        *((__global DATA_TYPE *)tensor3D_offset(&out, 0, i + 0, 1)) = VARIANCE_0;
-        *((__global DATA_TYPE *)tensor3D_offset(&out, 0, i + 1, 1)) = VARIANCE_1;
-        *((__global DATA_TYPE *)tensor3D_offset(&out, 0, i + 2, 1)) = VARIANCE_2;
-        *((__global DATA_TYPE *)tensor3D_offset(&out, 0, i + 3, 1)) = VARIANCE_3;
-    }
-}
 #endif /* defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(LAYER_WIDTH) && defined(LAYER_HEIGHT) && defined(OFFSET) && defined(STEP_X) && defined(STEP_Y) && defined(NUM_PRIORS) && defined(VARIANCE_0) && defined(VARIANCE_1) && defined(VARIANCE_2) && defined(VARIANCE_3) */
diff --git a/src/core/CL/cl_kernels/range.cl b/src/core/CL/cl_kernels/range.cl
new file mode 100644
index 0000000..d122c9a
--- /dev/null
+++ b/src/core/CL/cl_kernels/range.cl
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VECTOR_SIZE) && defined(START) && defined(STEP) && defined(DATA_TYPE)
+/** Generates a sequence of numbers starting from START and extending by increments of STEP, up to but not including END.
+ *
+ * @note The starting value of the sequence must be given as a preprocessor argument using -DSTART=value. e.g. -DSTART=0
+ * @note The difference between consecutive elements of the sequence must be given as a preprocessor argument using -DSTEP=value. e.g. -DSTEP=1
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note The vector size supported by the device must be given as a preprocessor argument using -DVECTOR_SIZE=value. e.g. -DVECTOR_SIZE=4
+ *
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: U8/S8/U16/S16/U32/S32/F16/F32.
+ * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void range(
+    VECTOR_DECLARATION(out))
+{
+    uint           id      = get_global_id(0) * VECTOR_SIZE;
+    __global void *dst_ptr = out_ptr + out_offset_first_element_in_bytes + id * sizeof(DATA_TYPE);
+#if VECTOR_SIZE == 1
+    DATA_TYPE seq;
+    seq = (DATA_TYPE)START + (DATA_TYPE)id * (DATA_TYPE)STEP;
+
+    *((__global DATA_TYPE *)dst_ptr) = seq;
+#else // VECTOR_SIZE == 1
+    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+    seq;
+
+    seq.s0 = ((DATA_TYPE)START + (DATA_TYPE)id * (DATA_TYPE)STEP);
+#if VECTOR_SIZE > 1
+    seq.s1 = seq.s0 + (DATA_TYPE)STEP;
+#if VECTOR_SIZE > 2
+    seq.s2 = seq.s1 + (DATA_TYPE)STEP;
+#if VECTOR_SIZE > 3
+    seq.s3 = seq.s2 + (DATA_TYPE)STEP;
+#if VECTOR_SIZE > 4
+    seq.s4 = seq.s3 + (DATA_TYPE)STEP;
+#if VECTOR_SIZE > 5
+    seq.s5 = seq.s4 + (DATA_TYPE)STEP;
+#if VECTOR_SIZE > 6
+    seq.s6 = seq.s5 + (DATA_TYPE)STEP;
+#if VECTOR_SIZE > 7
+    seq.s7 = seq.s6 + (DATA_TYPE)STEP;
+#endif // VECTOR_SIZE > 7
+#endif // VECTOR_SIZE > 6
+#endif // VECTOR_SIZE > 5
+#endif // VECTOR_SIZE > 4
+#endif // VECTOR_SIZE > 3
+#endif // VECTOR_SIZE > 2
+#endif // VECTOR_SIZE > 1
+    VSTORE(VECTOR_SIZE)
+    (seq, 0, ((__global DATA_TYPE *)dst_ptr));
+#endif //VECTOR_SIZE == 1
+}
+
+#if defined(OFFSET_OUT) && defined(SCALE_OUT)
+
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+
+/** Generates a sequence of numbers starting from START and extending by increments of STEP, up to but not including END.
+ *
+ * @note The starting value of the sequence must be given as a preprocessor argument using -DSTART=value. e.g. -DSTART=0
+ * @note The difference between consecutive elements of the sequence must be given as a preprocessor argument using -DSTEP=value. e.g. -DSTEP=1
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note The vector size supported by the device must be given as a preprocessor argument using -DVECTOR_SIZE=value. e.g. -DVECTOR_SIZE=4
+ * @note The quantization offset of the output must be passed at compile time using -DOFFSET_OUT, e.g. -DOFFSET_OUT=10
+ * @note The quantization scale of the output must be passed at compile time using -DSCALE_OUT, e.g. -DSCALE_OUT=10
+ *
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: QASYMM8.
+ * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void range_quantized(
+    VECTOR_DECLARATION(out))
+{
+    size_t         id      = get_global_id(0) * VECTOR_SIZE;
+    __global void *dst_ptr = out_ptr + out_offset_first_element_in_bytes + id * sizeof(DATA_TYPE);
+#if VECTOR_SIZE == 1
+    float seq;
+    seq                          = (float)START + (float)id * (float)STEP;
+    seq                          = (DATA_TYPE)(int)(seq / ((float)SCALE_OUT) + (float)OFFSET_OUT);
+    seq                          = max(0.0f, min(seq, 255.0f));
+    *((__global uchar *)dst_ptr) = CONVERT_SAT(CONVERT_DOWN(seq, int), uchar);
+#else // VECTOR_SIZE == 1
+    VEC_DATA_TYPE(float, VECTOR_SIZE)
+    seq;
+    seq.s0 = (float)START + id * (float)STEP;
+#if VECTOR_SIZE > 1
+    seq.s1 = seq.s0 + (float)STEP;
+#if VECTOR_SIZE > 2
+    seq.s2 = seq.s1 + (float)STEP;
+#if VECTOR_SIZE > 3
+    seq.s3 = seq.s2 + (float)STEP;
+#if VECTOR_SIZE > 4
+    seq.s4 = seq.s3 + (float)STEP;
+#if VECTOR_SIZE > 5
+    seq.s5 = seq.s4 + (float)STEP;
+#if VECTOR_SIZE > 6
+    seq.s6 = seq.s5 + (float)STEP;
+#if VECTOR_SIZE > 7
+    seq.s7 = seq.s6 + (float)STEP;
+#endif // VECTOR_SIZE > 7
+#endif // VECTOR_SIZE > 6
+#endif // VECTOR_SIZE > 5
+#endif // VECTOR_SIZE > 4
+#endif // VECTOR_SIZE > 3
+#endif // VECTOR_SIZE > 2
+#endif // VECTOR_SIZE > 1
+    seq    = seq / ((VEC_DATA_TYPE(float, VECTOR_SIZE))((float)SCALE_OUT)) + ((VEC_DATA_TYPE(float, VECTOR_SIZE))((float)OFFSET_OUT));
+    seq    = max((VEC_DATA_TYPE(float, VECTOR_SIZE))(0.0f), min(seq, (VEC_DATA_TYPE(float, VECTOR_SIZE))(255.0f)));
+    VEC_DATA_TYPE(uchar, VECTOR_SIZE)
+    res = CONVERT_SAT(CONVERT_DOWN(seq, VEC_DATA_TYPE(int, VECTOR_SIZE)), VEC_DATA_TYPE(uchar, VECTOR_SIZE));
+    VSTORE(VECTOR_SIZE)
+    (res, 0, ((__global DATA_TYPE *)dst_ptr));
+#endif // VECTOR_SIZE == 1
+}
+#endif // defined(OFFSET_OUT) && defined(SCALE_OUT)
+
+#endif // defined(VECTOR_SIZE) && defined(START) && defined(STEP) && defined(DATA_TYPE)
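Per element, range_quantized generates START + id * STEP, requantizes with the output scale and offset, clamps to [0, 255] and converts to uchar with round-to-nearest-even. The scalar C sketch below follows the same steps; the start/step/scale/offset values are assumptions used only for illustration.

#include <math.h>
#include <stdio.h>

/* Scalar sketch of one element of range_quantized:
 * q = clamp(round_rte(value / scale + offset), 0, 255). */
static unsigned char range_element_qasymm8(unsigned int id, float start, float step,
                                           float scale, float offset)
{
    float v = start + (float)id * step;
    v       = v / scale + offset;
    v       = fmaxf(0.0f, fminf(v, 255.0f));
    return (unsigned char)lrintf(v); /* round to nearest, matching convert_.._rte */
}

int main(void)
{
    /* Illustrative values: START=1, STEP=2.5, SCALE_OUT=0.5, OFFSET_OUT=10. */
    for(unsigned int id = 0; id < 4; ++id)
    {
        printf("%u ", range_element_qasymm8(id, 1.0f, 2.5f, 0.5f, 10.0f));
    }
    printf("\n"); /* 12 17 22 27 */
    return 0;
}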
diff --git a/src/core/CL/cl_kernels/reduction_operation.cl b/src/core/CL/cl_kernels/reduction_operation.cl
index d76e12a..b4ede25 100644
--- a/src/core/CL/cl_kernels/reduction_operation.cl
+++ b/src/core/CL/cl_kernels/reduction_operation.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -61,11 +61,30 @@
     return (in.s0 + in.s1);
 }
 
+/** Calculate product of a vector
+ *
+ * @param[in] input Pointer to the first pixel.
+ *
+ * @return product of vector.
+ */
+inline DATA_TYPE product(__global const DATA_TYPE *input)
+{
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    in = vload16(0, input);
+
+    in.s01234567 *= in.s89ABCDEF;
+    in.s0123 *= in.s4567;
+    in.s01 *= in.s23;
+
+    return (in.s0 * in.s1);
+}
+#if defined(OPERATION)
 /** This kernel performs parallel reduction given an operation on x-axis.
  *
  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
  * @note The operation we want to perform must be passed at compile time using -DOPERATION e.g. -DOPERATION=square_sum
  * @note The mean flag must be passed at compile time using -DMEAN if we want to compute the mean value
+ * @note The product flag must be passed at compile time using -DPROD if we want to compute the product, otherwise sum will be used
  * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128 if we want to compute the mean value
  *
  * @param[in] src_ptr                                   Pointer to the source tensor. Supported data types: F16/F32
@@ -74,28 +93,28 @@
  * @param[in] src_stride_y                              Stride of the source tensor in Y dimension (in bytes)
  * @param[in] src_step_y                                src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in] src_offset_first_element_in_bytes         The offset of the first element in the source tensor
- * @param[in] partial_sum_ptr                           The local buffer to hold sumed values. Supported data types: same as @p src_ptt
- * @param[in] partial_sum_stride_x                      Stride of the output tensor in X dimension (in bytes)
- * @param[in] partial_sum_step_x                        partial_sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] partial_sum_stride_y                      Stride of the output tensor in Y dimension (in bytes)
- * @param[in] partial_sum_step_y                        partial_sum_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] partial_sum_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] local_sums                                Local buffer for storing the partial sum
+ * @param[in] partial_res_ptr                           The local buffer to hold partial result values. Supported data types: same as @p src_ptr
+ * @param[in] partial_res_stride_x                      Stride of the output tensor in X dimension (in bytes)
+ * @param[in] partial_res_step_x                        partial_res_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] partial_res_stride_y                      Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] partial_res_step_y                        partial_res_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] partial_res_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] local_results                             Local buffer for storing the partial result
  */
 __kernel void reduction_operation_x(
     IMAGE_DECLARATION(src),
-    IMAGE_DECLARATION(partial_sum),
-    __local DATA_TYPE *local_sums)
+    IMAGE_DECLARATION(partial_res),
+    __local DATA_TYPE *local_results)
 {
     Image src         = CONVERT_TO_IMAGE_STRUCT(src);
-    Image partial_sum = CONVERT_TO_IMAGE_STRUCT(partial_sum);
+    Image partial_res = CONVERT_TO_IMAGE_STRUCT(partial_res);
 
     unsigned int lsize = get_local_size(0);
     unsigned int lid   = get_local_id(0);
 
     for(unsigned int y = 0; y < get_local_size(1); ++y)
     {
-        local_sums[lid] = OPERATION((__global DATA_TYPE *)offset(&src, 0, y));
+        local_results[lid] = OPERATION((__global DATA_TYPE *)offset(&src, 0, y));
         barrier(CLK_LOCAL_MEM_FENCE);
 
         // Perform parallel reduction
@@ -103,7 +122,11 @@
         {
             if(lid < i)
             {
-                local_sums[lid] += local_sums[lid + i];
+#if defined(PROD)
+                local_results[lid] *= local_results[lid + i];
+#else  //!defined(PROD)
+                local_results[lid] += local_results[lid + i];
+#endif //defined(PROD)
             }
             barrier(CLK_LOCAL_MEM_FENCE);
         }
@@ -113,20 +136,24 @@
 #if defined(MEAN) && defined(WIDTH)
             if(y == get_local_size(1) - 1)
             {
-                local_sums[0] /= WIDTH;
+                local_results[0] /= WIDTH;
             }
 #endif /* defined(MEAN) && defined(WIDTH) */
-            ((__global DATA_TYPE *)offset(&partial_sum, get_group_id(0), y))[0] = local_sums[0];
+            ((__global DATA_TYPE *)offset(&partial_res, get_group_id(0), y))[0] = local_results[0];
         }
     }
 }
+#endif // defined(OPERATION)
 
 #if defined(WIDTH)
-/** This kernel performs reduction on x-axis. (QASYMM8)
+/** This kernel performs reduction on x-axis. (non-parallel)
  *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
  * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128
+ * @note The product flag must be passed at compile time using -DPROD if we want to compute the product, otherwise sum will be used
+ * @note In case of ARG_MIN and ARG_MAX the condition data type must be passed at compile time using -DCOND_DATA_TYPE e.g. -DCOND_DATA_TYPE=short
  *
- * @param[in] src_ptr                              Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] src_ptr                              Pointer to the source tensor. Supported data types: F16/F32 and QASYMM8 for operation MEAN
  * @param[in] src_stride_x                         Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_offset_first_element_in_bytes    The offset of the first element in the source tensor
@@ -135,33 +162,49 @@
  * @param[in] output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
  */
-__kernel void reduction_operation_quantized_x(
+__kernel void reduction_operation_non_parallel_x(
     VECTOR_DECLARATION(src),
     VECTOR_DECLARATION(output))
 {
     Vector src    = CONVERT_TO_VECTOR_STRUCT(src);
     Vector output = CONVERT_TO_VECTOR_STRUCT(output);
 
-    uint res = 0;
+    DATA_TYPE_PROMOTED res = *((__global DATA_TYPE *)vector_offset(&src, 0));
 
-    for(unsigned int x = 0; x < WIDTH; ++x)
+#if defined(ARG_MAX) || defined(ARG_MIN)
+    uint indx = 0;
+#endif // defined(ARG_MAX) || defined(ARG_MIN)
+
+    for(unsigned int x = 1; x < WIDTH; ++x)
     {
-        res += *((__global uchar *)vector_offset(&src, x));
+        DATA_TYPE_PROMOTED in = *((__global DATA_TYPE *)vector_offset(&src, x));
+#if defined(ARG_MAX)
+        indx = select(indx, x, isgreater(in, res));
+        res  = select(res, in, CONVERT(isgreater(in, res), COND_DATA_TYPE));
+#elif defined(ARG_MIN)
+        indx = select(indx, x, isless(in, res));
+        res  = select(res, in, CONVERT(isless(in, res), COND_DATA_TYPE));
+#else  // !(defined(ARG_MAX) || defined(ARG_MIN))
+        res += in;
+#endif // defined(ARG_MAX) || defined(ARG_MIN)
     }
 
+    // Store result
+#if defined(ARG_MAX) || defined(ARG_MIN)
+    *((__global uint *)output.ptr) = indx;
+#else // !(defined(ARG_MAX) || defined(ARG_MIN))
 #if defined(MEAN)
     res /= WIDTH;
-#endif /* defined(MEAN) */
-
-    // Store result
+#endif // defined(MEAN)
     *((__global uchar *)output.ptr) = convert_uchar(res);
+#endif // defined(ARG_MAX) || defined(ARG_MIN)
 }
-#endif /* defined(HEIGHT) */
+#endif /* defined(WIDTH) */
 
 #if defined(HEIGHT)
 /** This kernel performs reduction on y-axis.
  *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
  * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
  *
  * @param[in] src_ptr                              Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
@@ -185,24 +228,49 @@
     Image output = CONVERT_TO_IMAGE_STRUCT(output);
 
     VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
-    res = 0;
+    res = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
 
-    for(unsigned int y = 0; y < HEIGHT; ++y)
+#if defined(SUM_SQUARE)
+    res *= res;
+#endif // defined(SUM_SQUARE)
+
+#if defined(ARG_MAX) || defined(ARG_MIN)
+    uint16 indx = 0;
+#endif // defined(ARG_MAX) || defined(ARG_MIN)
+
+    for(unsigned int y = 1; y < HEIGHT; ++y)
     {
         VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
         in = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+#if defined(ARG_MAX)
+        uint16 cond_conv = CONVERT(isgreater(in, res), uint16);
+        indx             = select(indx, y, cond_conv);
+        res              = select(res, in, isgreater(in, res));
+#elif defined(ARG_MIN)
+        uint16  cond_conv           = CONVERT(isless(in, res), uint16);
+        indx                        = select(indx, y, cond_conv);
+        res                         = select(res, in, isless(in, res));
+#else // !(defined(ARG_MAX) || defined(ARG_MIN))
 #if defined(SUM_SQUARE)
         in *= in;
-#endif // SQRSUM
+#endif // defined(SUM_SQUARE)
+#if defined(PROD)
+        res *= in;
+#else  //!defined(PROD)
         res += in;
+#endif //defined(PROD)
+#endif // defined(ARG_MAX) || defined(ARG_MIN)
     }
 
+    // Store result
+#if defined(ARG_MAX) || defined(ARG_MIN)
+    vstore16(indx, 0, (__global uint *)output.ptr);
+#else // !(defined(ARG_MAX) || defined(ARG_MIN))
 #if defined(MEAN)
     res /= HEIGHT;
-#endif /* defined(MEAN) */
-
-    // Store result
+#endif // defined(MEAN)
     vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
+#endif // defined(ARG_MAX) || defined(ARG_MIN)
 }
 #endif /* defined(HEIGHT) */
 
@@ -237,24 +305,50 @@
     Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
 
     VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
-    res = 0;
+    res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
 
-    for(unsigned int z = 0; z < DEPTH; ++z)
+#if defined(SUM_SQUARE)
+    res *= res;
+#endif // defined(SUM_SQUARE)
+
+#if defined(ARG_MAX) || defined(ARG_MIN)
+    uint16 indx = 0;
+#endif // defined(ARG_MAX) || defined(ARG_MIN)
+
+    for(unsigned int z = 1; z < DEPTH; ++z)
     {
         VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
         in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+
+#if defined(ARG_MAX)
+        uint16 cond_conv = CONVERT(isgreater(in, res), uint16);
+        indx             = select(indx, z, cond_conv);
+        res              = select(res, in, isgreater(in, res));
+#elif defined(ARG_MIN)
+        uint16 cond_conv = CONVERT(isless(in, res), uint16);
+        indx             = select(indx, z, cond_conv);
+        res              = select(res, in, isless(in, res));
+#else // !(defined(ARG_MAX) || defined(ARG_MIN))
 #if defined(SUM_SQUARE)
         in *= in;
-#endif // SQRSUM
+#endif // defined(SUM_SQUARE)
+#if defined(PROD)
+        res *= in;
+#else  //!defined(PROD)
         res += in;
+#endif //defined(PROD)
+#endif // defined(ARG_MAX) || defined(ARG_MIN)
     }
 
+    // Store result
+#if defined(ARG_MAX) || defined(ARG_MIN)
+    vstore16(indx, 0, (__global uint *)output.ptr);
+#else // !(defined(ARG_MAX) || defined(ARG_MIN))
 #if defined(MEAN)
     res /= DEPTH;
-#endif /* defined(MEAN) */
-
-    // Store result
+#endif // defined(MEAN)
     vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
+#endif // defined(ARG_MAX) || defined(ARG_MIN)
 }
 #endif /* defined(DEPTH) */
 
@@ -294,23 +388,49 @@
     Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH);
 
     VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
-    res = 0;
+    res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
 
-    for(unsigned int w = 0; w < BATCH; ++w)
+#if defined(SUM_SQUARE)
+    res *= res;
+#endif // defined(SUM_SQUARE)
+
+#if defined(ARG_MAX) || defined(ARG_MIN)
+    uint16 indx = 0;
+#endif // defined(ARG_MAX) || defined(ARG_MIN)
+
+    for(unsigned int w = 1; w < BATCH; ++w)
     {
         VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
         in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+
+#if defined(ARG_MAX)
+        uint16 cond_conv = CONVERT(isgreater(in, res), uint16);
+        indx             = select(indx, w, cond_conv);
+        res              = select(res, in, isgreater(in, res));
+#elif defined(ARG_MIN)
+        uint16 cond_conv = CONVERT(isless(in, res), uint16);
+        indx             = select(indx, w, cond_conv);
+        res              = select(res, in, isless(in, res));
+#else // !(defined(ARG_MAX) || defined(ARG_MIN))
 #if defined(SUM_SQUARE)
         in *= in;
-#endif // SQRSUM
+#endif // defined(SUM_SQUARE)
+#if defined(PROD)
+        res *= in;
+#else  //!defined(PROD)
         res += in;
+#endif //defined(PROD)
+#endif // defined(ARG_MAX) || defined(ARG_MIN)
     }
 
+    // Store result
+#if defined(ARG_MAX) || defined(ARG_MIN)
+    vstore16(indx, 0, (__global uint *)output.ptr);
+#else // !(defined(ARG_MAX) || defined(ARG_MIN))
 #if defined(MEAN)
     res /= BATCH;
-#endif /* defined(MEAN) */
-
-    // Store result
+#endif // defined(MEAN)
     vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
+#endif // defined(ARG_MAX) || defined(ARG_MIN)
 }
-#endif /* defined(BATCH) && defined(DEPTH) */
\ No newline at end of file
+#endif /* defined(BATCH) && defined(DEPTH) */
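With -DARG_MAX or -DARG_MIN, the reduction kernels above keep a running best value together with its index and update both through select() on each comparison; ties keep the first occurrence because isgreater()/isless() are strict. A scalar C sketch of the same argmax pattern (illustrative only, not the kernel code):

#include <stdio.h>

/* Scalar equivalent of the ARG_MAX reduction pattern: track the best value
 * and its index, updating only when a new element compares strictly greater. */
static unsigned int argmax(const float *in, unsigned int len)
{
    float        best = in[0];
    unsigned int indx = 0;

    for(unsigned int i = 1; i < len; ++i)
    {
        if(in[i] > best)
        {
            best = in[i];
            indx = i;
        }
    }
    return indx;
}

int main(void)
{
    const float v[6] = { 0.5f, 3.0f, -1.0f, 3.0f, 2.0f, 1.5f };
    printf("argmax = %u\n", argmax(v, 6)); /* 1: ties keep the first occurrence */
    return 0;
}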
diff --git a/src/core/CL/cl_kernels/repeat.h b/src/core/CL/cl_kernels/repeat.h
new file mode 100644
index 0000000..691f7ae
--- /dev/null
+++ b/src/core/CL/cl_kernels/repeat.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_REPEAT_H
+#define ARM_COMPUTE_REPEAT_H
+
+/** Macros that help in loop unrolling */
+// Repeat macros with 3 params, excluding the implicit ID param
+#define REPEAT_3_1(P_X, P_A, P_B, P_C) P_X##_DEF(0, P_A, P_B, P_C)
+#define REPEAT_3_2(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(1, P_A, P_B, P_C);       \
+    REPEAT_3_1(P_X, P_A, P_B, P_C)
+#define REPEAT_3_3(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(2, P_A, P_B, P_C);       \
+    REPEAT_3_2(P_X, P_A, P_B, P_C)
+#define REPEAT_3_4(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(3, P_A, P_B, P_C);       \
+    REPEAT_3_3(P_X, P_A, P_B, P_C)
+#define REPEAT_3_5(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(4, P_A, P_B, P_C);       \
+    REPEAT_3_4(P_X, P_A, P_B, P_C)
+#define REPEAT_3_6(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(5, P_A, P_B, P_C);       \
+    REPEAT_3_5(P_X, P_A, P_B, P_C)
+#define REPEAT_3_7(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(6, P_A, P_B, P_C);       \
+    REPEAT_3_6(P_X, P_A, P_B, P_C)
+#define REPEAT_3_8(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(7, P_A, P_B, P_C);       \
+    REPEAT_3_7(P_X, P_A, P_B, P_C)
+#define REPEAT_3_9(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(8, P_A, P_B, P_C);       \
+    REPEAT_3_8(P_X, P_A, P_B, P_C)
+#define REPEAT_3_10(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(9, P_A, P_B, P_C);        \
+    REPEAT_3_9(P_X, P_A, P_B, P_C)
+#define REPEAT_3_11(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(A, P_A, P_B, P_C);        \
+    REPEAT_3_10(P_X, P_A, P_B, P_C)
+#define REPEAT_3_12(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(B, P_A, P_B, P_C);        \
+    REPEAT_3_11(P_X, P_A, P_B, P_C)
+#define REPEAT_3_13(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(C, P_A, P_B, P_C);        \
+    REPEAT_3_12(P_X, P_A, P_B, P_C)
+#define REPEAT_3_14(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(D, P_A, P_B, P_C);        \
+    REPEAT_3_13(P_X, P_A, P_B, P_C)
+#define REPEAT_3_15(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(E, P_A, P_B, P_C);        \
+    REPEAT_3_14(P_X, P_A, P_B, P_C)
+#define REPEAT_3_16(P_X, P_A, P_B, P_C) \
+    P_X##_DEF(F, P_A, P_B, P_C);        \
+    REPEAT_3_15(P_X, P_A, P_B, P_C)
+
+#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_3_##P_NUM(P_OP, P_A, P_B, P_C) //One level of indirection to ensure order of expansion does not affect preprocessing P_NUM
+#define REPEAT_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C)
+
+// Macro for initializing N variables: generates N statements of the form TYPE VAR##ID = VAL
+#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL
+#define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL) REPEAT_3_N(N, VAR_INIT_TO_CONST, TYPE, VAR, VAL)
+
+#endif // ARM_COMPUTE_REPEAT_H
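repeat.h builds unrolled statement lists by recursive macro expansion. The self-contained C snippet below copies the first three levels of the same pattern to show what REPEAT_VAR_INIT_TO_CONST(3, int, acc, 0) expands to; it is an illustration, not part of the header.

#include <stdio.h>

/* Minimal standalone copy of the repeat.h pattern above (three levels only).
 * REPEAT_VAR_INIT_TO_CONST(3, int, acc, 0) expands to:
 *   int acc2 = 0; int acc1 = 0; int acc0 = 0 */
#define REPEAT_3_1(P_X, P_A, P_B, P_C) P_X##_DEF(0, P_A, P_B, P_C)
#define REPEAT_3_2(P_X, P_A, P_B, P_C) \
    P_X##_DEF(1, P_A, P_B, P_C);       \
    REPEAT_3_1(P_X, P_A, P_B, P_C)
#define REPEAT_3_3(P_X, P_A, P_B, P_C) \
    P_X##_DEF(2, P_A, P_B, P_C);       \
    REPEAT_3_2(P_X, P_A, P_B, P_C)
#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_3_##P_NUM(P_OP, P_A, P_B, P_C)
#define REPEAT_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C)

#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL
#define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL) REPEAT_3_N(N, VAR_INIT_TO_CONST, TYPE, VAR, VAL)

int main(void)
{
    REPEAT_VAR_INIT_TO_CONST(3, int, acc, 0); /* declares acc0, acc1 and acc2 */
    printf("%d %d %d\n", acc0, acc1, acc2);   /* 0 0 0 */
    return 0;
}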
diff --git a/src/core/CL/cl_kernels/reverse.cl b/src/core/CL/cl_kernels/reverse.cl
new file mode 100644
index 0000000..6afd382
--- /dev/null
+++ b/src/core/CL/cl_kernels/reverse.cl
@@ -0,0 +1,102 @@
+/*
+* Copyright (c) 2018 ARM Limited.
+*
+* SPDX-License-Identifier: MIT
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to
+* deal in the Software without restriction, including without limitation the
+* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+* sell copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in all
+* copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(NUM_REVERSE_DIMS)
+
+#if NUM_REVERSE_DIMS > 4
+#error("Reversing more than 4 dimensions is not currently supported")
+#endif /* NUM_REVERSE_DIMS > 4 */
+
+/** Performs reverse along the specified axis.
+ *
+ * @note The data type must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=uint
+ * @note The number of dimensions to reverse must be given as a preprocessor argument using -DNUM_REVERSE_DIMS=num, e.g. -DNUM_REVERSE_DIMS=3
+ *
+ * @param[in]  src_ptr                            Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                       Stride of the first source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                         src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[in]  axis_ptr                           Pointer to the source vector. Supported data types: U32
+ * @param[in]  axis_stride_x                      Stride of the axis vector in X dimension (in bytes)
+ * @param[in]  axis_step_x                        axis_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  axis_offset_first_element_in_bytes The offset of the first element in the axis vector
+ * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                       Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                         output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ */
+__kernel void reverse(TENSOR4D_DECLARATION(src),
+                      VECTOR_DECLARATION(axis),
+                      TENSOR4D_DECLARATION(dst),
+                      const uint width,
+                      const uint height,
+                      const uint depth,
+                      const uint batches)
+{
+    Tensor4D src  = CONVERT_TO_TENSOR4D_STRUCT(src, depth);
+    Vector   axis = CONVERT_TO_VECTOR_STRUCT_NO_STEP(axis);
+    Tensor4D dst  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(dst, depth);
+
+    const uint x_in = get_global_id(0);
+    const uint y_in = get_global_id(1);
+    const uint z_in = get_global_id(2) % depth;
+    const uint w_in = get_global_id(2) / depth;
+
+    const uint4 dims       = (uint4)(0, 1, 2, 3);
+    int4        to_reverse = (int4)(0, 0, 0, 0);
+#if NUM_REVERSE_DIMS == 1
+    const uint index = *((__global uint *)axis.ptr);
+    to_reverse       = (uint4)index == dims;
+#elif NUM_REVERSE_DIMS == 2
+    const uint2 indices = vload2(0, (__global uint *)axis.ptr);
+    to_reverse          = ((uint4)indices.s0 == dims) || ((uint4)indices.s1 == dims);
+#elif NUM_REVERSE_DIMS == 3
+    const uint2 indices01 = vload2(0, (__global uint *)axis.ptr);
+    const uint index2     = *((__global uint *)axis.ptr + 2);
+    to_reverse            = ((uint4)indices01.s0 == dims) || ((uint4)indices01.s1 == dims) || ((uint4)index2 == dims);
+#else  /* NUM_REVERSE_DIMS == 3 */
+    const uint4 indices = vload4(0, (__global uint *)axis.ptr);
+    to_reverse          = ((uint4)indices.s0 == dims) || ((uint4)indices.s1 == dims) || ((uint4)indices.s2 == dims) || ((uint4)indices.s3 == dims);
+#endif /* NUM_REVERSE_DIMS == 1 */
+    const uint x_out = to_reverse.s0 ? width - x_in - 1 : x_in;
+    const uint y_out = to_reverse.s1 ? height - y_in - 1 : y_in;
+    const uint z_out = to_reverse.s2 ? depth - z_in - 1 : z_in;
+    const uint w_out = to_reverse.s3 ? batches - w_in - 1 : w_in;
+
+    *((__global DATA_TYPE *)tensor4D_offset(&dst, x_out, y_out, z_out, w_out)) = *((__global DATA_TYPE *)src.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(NUM_REVERSE_DIMS)
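The kernel reduces each output element to a coordinate mirror: an axis listed in the axis vector maps a coordinate x to dim - x - 1, every other axis passes through unchanged. A small stand-alone C sketch of that mapping (helper name and sizes are illustrative, not part of the library):

#include <stdio.h>

/* Mirror an input coordinate along the axes listed in `axes` (values in [0,3]),
 * matching the to_reverse / x_out logic of the reverse kernel above. */
static void reverse_coord(const unsigned in[4], const unsigned dims[4],
                          const unsigned *axes, unsigned num_axes, unsigned out[4])
{
    for(unsigned d = 0; d < 4; ++d)
    {
        int to_reverse = 0;
        for(unsigned a = 0; a < num_axes; ++a)
        {
            to_reverse |= (axes[a] == d);
        }
        out[d] = to_reverse ? dims[d] - in[d] - 1 : in[d];
    }
}

int main(void)
{
    const unsigned dims[4] = { 8, 4, 3, 2 }; /* width, height, depth, batches */
    const unsigned in[4]   = { 1, 0, 2, 0 };
    const unsigned axes[2] = { 0, 2 };       /* reverse along x and z */
    unsigned out[4];
    reverse_coord(in, dims, axes, 2, out);
    printf("(%u,%u,%u,%u)\n", out[0], out[1], out[2], out[3]); /* prints (6,0,0,0) */
    return 0;
}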
diff --git a/src/core/CL/cl_kernels/roi_align_layer.cl b/src/core/CL/cl_kernels/roi_align_layer.cl
index f52eb18..430369b 100644
--- a/src/core/CL/cl_kernels/roi_align_layer.cl
+++ b/src/core/CL/cl_kernels/roi_align_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -75,11 +75,17 @@
             const float w2 = hy * lx;
             const float w3 = ly * hx;
             const float w4 = ly * lx;
-
-            const DATA_TYPE data1 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_low, pz);
-            const DATA_TYPE data2 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_low, pz);
-            const DATA_TYPE data3 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_high, pz);
-            const DATA_TYPE data4 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_high, pz);
+#if defined(NHWC)
+            const DATA_TYPE data1 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_low, y_low);
+            const DATA_TYPE data2 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_high, y_low);
+            const DATA_TYPE data3 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_low, y_high);
+            const DATA_TYPE data4 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_high, y_high);
+#else  // !defined(NHWC)
+            const DATA_TYPE data1                 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_low, pz);
+            const DATA_TYPE data2                 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_low, pz);
+            const DATA_TYPE data3                 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_high, pz);
+            const DATA_TYPE data4                 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_high, pz);
+#endif // defined(NHWC)
             sum += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
         }
     }
@@ -133,9 +139,15 @@
     Image    rois   = CONVERT_TO_IMAGE_STRUCT_NO_STEP(rois);
     Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
 
-    const int px = get_global_id(0);
-    const int py = get_global_id(1);
-    const int pw = get_global_id(2);
+#if defined(NHWC)
+    const int px = get_global_id(1);
+    const int py = get_global_id(2);
+    const int pw = get_global_id(0);
+#else  // !defined(NHWC)
+    const int                                  px = get_global_id(0);
+    const int                                  py = get_global_id(1);
+    const int                                  pw = get_global_id(2);
+#endif // defined(NHWC)
 
     // Load roi parameters
     // roi is laid out as follows { batch_index, x1, y1, x2, y2 }
@@ -161,7 +173,7 @@
     const float2 roi_bin_grid = SAMPLING_RATIO;
 #else  // !defined(SAMPLING_RATIO)
     // Note that we subtract EPS_GRID before ceiling. This is to avoid situations where 1.000001 gets ceiled to 2.
-    const float2 roi_bin_grid = ceil(bin_size - EPS_GRID);
+    const float2   roi_bin_grid = ceil(bin_size - EPS_GRID);
 #endif // defined(SAMPLING_RATIO)
 
     // Move input and output pointer across the fourth dimension
@@ -169,15 +181,20 @@
     output.ptr += pw * output_stride_w;
     for(int pz = 0; pz < MAX_DIM_Z; ++pz)
     {
-        *(__global DATA_TYPE *)tensor3D_offset(&output, px, py, pz) = (__global DATA_TYPE)roi_align_1x1(&input,
-                                                                                                        region_start.x,
-                                                                                                        bin_size.x,
-                                                                                                        roi_bin_grid.x,
-                                                                                                        region_end.x,
-                                                                                                        region_start.y,
-                                                                                                        bin_size.y,
-                                                                                                        roi_bin_grid.y,
-                                                                                                        region_end.y, pz);
+#if defined(NHWC)
+        __global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, pz, px, py);
+#else  // !defined(NHWC)
+        __global DATA_TYPE *_output_ptr  = (__global DATA_TYPE *)tensor3D_offset(&output, px, py, pz);
+#endif // defined(NHWC)
+        *_output_ptr = (__global DATA_TYPE)roi_align_1x1(&input,
+                                                         region_start.x,
+                                                         bin_size.x,
+                                                         roi_bin_grid.x,
+                                                         region_end.x,
+                                                         region_start.y,
+                                                         bin_size.y,
+                                                         roi_bin_grid.y,
+                                                         region_end.y, pz);
     }
 }
 #endif // Check for compile time constants
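The NHWC branch does not change the interpolation itself; it only swaps which coordinate is passed as the fastest-moving argument of the offset helper. A short C sketch of the underlying byte-offset arithmetic (stride values are illustrative):

#include <stdio.h>

/* Byte offset of an element given per-dimension strides, in the spirit of
 * tensor3D_offset. For NHWC the channel index is passed as the fastest-moving
 * coordinate, so only the argument order changes, not the formula. */
static size_t offset3d(size_t stride_x, size_t stride_y, size_t stride_z,
                       size_t a, size_t b, size_t c)
{
    return a * stride_x + b * stride_y + c * stride_z;
}

int main(void)
{
    const size_t sx = 4, sy = 4 * 16, sz = 4 * 16 * 16; /* illustrative F32 strides */
    const size_t x = 3, y = 5, z = 2;
    printf("NCHW-style argument order: %zu bytes\n", offset3d(sx, sy, sz, x, y, z));
    printf("NHWC-style argument order: %zu bytes\n", offset3d(sx, sy, sz, z, x, y));
    return 0;
}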
diff --git a/src/core/CL/cl_kernels/roi_pooling_layer.cl b/src/core/CL/cl_kernels/roi_pooling_layer.cl
index 042b102..0cf296c 100644
--- a/src/core/CL/cl_kernels/roi_pooling_layer.cl
+++ b/src/core/CL/cl_kernels/roi_pooling_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -105,10 +105,12 @@
  * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the pooled region of the source image as specifed by ROI
- * @param[in]  rois_ptr                             Pointer to the rois array. Layout: {x, y, width, height, batch_indx}
- * @param[in]  rois_stride_x                        Stride of the rois array in X dimension (in bytes)
- * @param[in]  rois_step_x                          rois_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  rois_offset_first_element_in_bytes   The offset of the first element in the rois array
+ * @param[in]  rois_ptr                             Pointer to the ROIs tensor. Layout: { batch_index, x1, y1, x2, y2 }. Supported data types: same as @p input_ptr
+ * @param[in]  rois_stride_x                        Stride of the ROIs tensor in X dimension (in bytes)
+ * @param[in]  rois_step_x                          Step of the ROIs tensor in X dimension (in bytes)
+ * @param[in]  rois_stride_y                        Stride of the ROIs tensor in Y dimension (in bytes)
+ * @param[in]  rois_step_y                          Step of the ROIs tensor in Y dimension (in bytes)
+ * @param[in]  rois_offset_first_element_in_bytes   The offset of the first element in the ROIs tensor
  * @param[out] output_ptr                           Pointer to the destination image. Supported data types: F16, F32
  * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
  * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
@@ -122,13 +124,13 @@
  */
 __kernel void roi_pooling_layer(
     TENSOR3D_DECLARATION(input),
-    VECTOR_DECLARATION(rois),
+    IMAGE_DECLARATION(rois),
     TENSOR3D_DECLARATION(output),
     unsigned int input_stride_w, unsigned int output_stride_w)
 {
     // Get pixels pointer
     Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
-    Vector   rois   = CONVERT_TO_VECTOR_STRUCT_NO_STEP(rois);
+    Image    rois   = CONVERT_TO_IMAGE_STRUCT_NO_STEP(rois);
     Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
 
     const int px = get_global_id(0);
@@ -136,12 +138,12 @@
     const int pw = get_global_id(2);
 
     // Load roi parameters
-    // roi is laid out as follows:
-    // { x, y, width, height, batch_index }
-    const ushort4 roi      = vload4(0, (__global ushort *)vector_offset(&rois, pw));
-    const ushort roi_batch = *((__global ushort *)vector_offset(&rois, pw) + 4);
-    const int2 roi_anchor  = convert_int2_sat(round(convert_float2(roi.s01) * (float)SPATIAL_SCALE));
-    const int2 roi_dims    = convert_int2_sat(fmax(round(convert_float2(roi.s23) * (float)SPATIAL_SCALE), 1.f));
+    // roi is laid out as follows { batch_index, x1, y1, x2, y2 }
+    const ushort roi_batch = (ushort) * ((__global DATA_TYPE *)offset(&rois, 0, pw));
+    const VEC_DATA_TYPE(DATA_TYPE, 4)
+    roi               = vload4(0, (__global DATA_TYPE *)offset(&rois, 1, pw));
+    const int2 roi_anchor = convert_int2_sat(round(convert_float2(roi.s01) * (float)SPATIAL_SCALE));
+    const int2 roi_dims   = convert_int2_sat(fmax(round(convert_float2(roi.s23 - roi.s01) * (float)SPATIAL_SCALE), 1.f));
 
     // Calculate pooled region start and end
     const float2 spatial_indx     = (float2)(px, py);
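With the ROI tensor now laid out as { batch_index, x1, y1, x2, y2 }, the pooled region size has to be recovered from the corner coordinates before the spatial scale is applied. A small C sketch of that arithmetic (rounding and clamping simplified, values illustrative):

#include <math.h>
#include <stdio.h>

/* Recover the ROI anchor and dimensions from the { x1, y1, x2, y2 } corners,
 * mirroring roi_anchor / roi_dims in the kernel above (spatial scale applied,
 * dimensions clamped to at least 1). */
static void roi_from_corners(const float roi[4], float spatial_scale,
                             int anchor[2], int dims[2])
{
    anchor[0] = (int)roundf(roi[0] * spatial_scale);
    anchor[1] = (int)roundf(roi[1] * spatial_scale);
    dims[0]   = (int)fmaxf(roundf((roi[2] - roi[0]) * spatial_scale), 1.f);
    dims[1]   = (int)fmaxf(roundf((roi[3] - roi[1]) * spatial_scale), 1.f);
}

int main(void)
{
    const float roi[4] = { 16.f, 32.f, 80.f, 96.f }; /* x1, y1, x2, y2 */
    int anchor[2], dims[2];
    roi_from_corners(roi, 0.0625f, anchor, dims);
    printf("anchor=(%d,%d) dims=(%d,%d)\n", anchor[0], anchor[1], dims[0], dims[1]);
    return 0;
}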
diff --git a/src/core/CL/cl_kernels/scale.cl b/src/core/CL/cl_kernels/scale.cl
index 744f28a..5ac6443 100644
--- a/src/core/CL/cl_kernels/scale.cl
+++ b/src/core/CL/cl_kernels/scale.cl
@@ -134,9 +134,11 @@
     vstore4(bilinear_interpolate_with_border(&in, tc, input_width, input_height, BORDER_SIZE), 0, (__global DATA_TYPE *)out.ptr);
 }
 
+#if defined(DEPTH_OUT)
 /** Performs scale on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel F32. (NHWC)
  *
  * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size, e.g. -DDEPTH_OUT=16
  *
  * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8/S16/F16/F32.
  * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
@@ -160,28 +162,29 @@
  * @param[in]  scale_y                           The scale factor along y dimension
  */
 __kernel void scale_nearest_neighbour_nhwc(
-    TENSOR3D_DECLARATION(in),
-    TENSOR3D_DECLARATION(out),
+    TENSOR4D_DECLARATION(in),
+    TENSOR4D_DECLARATION(out),
     const float input_width,
     const float input_height,
     const float scale_x,
     const float scale_y)
 {
-    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(in);
-    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
 
     const float new_x     = (get_global_id(1) + 0.5f) * scale_x;
-    const float new_y     = (get_global_id(2) + 0.5f) * scale_y;
+    const float new_y     = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y;
     const float clamped_x = clamp(new_x, 0.0f, input_width - 1);
     const float clamped_y = clamp(new_y, 0.0f, input_height - 1);
 
-    *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y)));
+    *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT)));
 }
 
 /** Performs scale on an image interpolating with the BILINEAR method. (NHWC)
  *
  * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
  * @note If border mode replicate is used, is should be passed as -DBORDER_MODE_REPLICATE
+ * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size, e.g. -DDEPTH_OUT=16
  *
  * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8/S16/F16/F32.
  * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
@@ -205,22 +208,22 @@
  * @param[in]  scale_y                           The scale factor along y dimension
  */
 __kernel void scale_bilinear_nhwc(
-    TENSOR3D_DECLARATION(in),
-    TENSOR3D_DECLARATION(out),
+    TENSOR4D_DECLARATION(in),
+    TENSOR4D_DECLARATION(out),
     const float input_width,
     const float input_height,
     const float scale_x,
     const float scale_y)
 {
-    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(in);
-    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
 
 #ifdef SAMPLING_POLICY_TOP_LEFT
     const float new_x = get_global_id(1) * scale_x;
-    const float new_y = get_global_id(2) * scale_y;
+    const float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y;
 #elif SAMPLING_POLICY_CENTER
     const float new_x = (get_global_id(1) + 0.5f) * scale_x - 0.5f;
-    const float new_y = (get_global_id(2) + 0.5f) * scale_y - 0.5f;
+    const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y - 0.5f;
 #else /* SAMPLING_POLICY */
 #error("Unsupported sampling policy");
 #endif /* SAMPLING_POLICY */
@@ -241,10 +244,10 @@
     clamped_x1_ = select(clamped_x1_, 0.0f - BORDER_SIZE, new_xf + 1 < 0.f || new_xf + 1 > input_width - 1 || new_yf < 0.f || new_yf > input_height - 1);
 #endif /* BORDER_MODE_REPLICATE */
 
-    float4 ins = (float4)(*((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y))),
-                          *((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x1_), convert_int(clamped_y))),
-                          *((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x_), convert_int(clamped_y1))),
-                          *((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1))));
+    float4 ins = (float4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
+                          *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1_), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
+                          *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x_), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))),
+                          *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))));
 
     const float a  = new_x - new_xf;
     const float b  = 1.f - a;
@@ -254,3 +257,4 @@
 
     *((__global DATA_TYPE *)out.ptr) = CONVERT(fr, DATA_TYPE);
 }
+#endif /* defined(DEPTH_OUT) */
\ No newline at end of file
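Because the NHWC kernels now take TENSOR4D arguments, the third dispatch dimension carries depth and batch together; the kernels recover them with a modulo/division pair on get_global_id(2). A tiny C sketch of that decomposition (the DEPTH_OUT value is illustrative):

#include <stdio.h>

int main(void)
{
    /* The kernels above fold depth and batch into one dispatch dimension:
     * gid2 = batch * DEPTH_OUT + z, so z = gid2 % DEPTH_OUT and
     * batch = gid2 / DEPTH_OUT. */
    const int DEPTH_OUT = 16; /* illustrative value of -DDEPTH_OUT */
    for(int gid2 = 0; gid2 < 2 * DEPTH_OUT; gid2 += 7)
    {
        printf("gid2=%2d -> z=%2d batch=%d\n", gid2, gid2 % DEPTH_OUT, gid2 / DEPTH_OUT);
    }
    return 0;
}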
diff --git a/src/core/CL/cl_kernels/scale_quantized.cl b/src/core/CL/cl_kernels/scale_quantized.cl
index 3211e7e..86dbf60 100644
--- a/src/core/CL/cl_kernels/scale_quantized.cl
+++ b/src/core/CL/cl_kernels/scale_quantized.cl
@@ -85,12 +85,14 @@
     vstore4(bilinear_interpolate_with_border_quantized(&in, tc, input_width, input_height, BORDER_SIZE, SCALE, OFFSET), 0, (__global DATA_TYPE *)out.ptr);
 }
 
+#if defined(DEPTH_OUT)
 /** Performs scale on an image interpolating with the BILINEAR method. (NHWC)
  *
  * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
  * @note Scale value for QASYMM8 data type to used is passed as -DSCALE=<VALUE> e.g. -DSCALE=0.5
  * @note Offset value for QASYMM8 data type to used is passed as -DOFFSET=<VALUE> e.g. -DOFFSET=1
  * @note If border mode replicate is used, is should be passed as -DBORDER_MODE_REPLICATE
+ * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size, e.g. -DDEPTH_OUT=16
  *
  * @param[in]  in_ptr                            Pointer to the source image. Supported data types: QASYMM8.
  * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
@@ -114,22 +116,22 @@
  * @param[in]  scale_y                           The scale factor along y dimension
  */
 __kernel void scale_bilinear_quantized_nhwc(
-    TENSOR3D_DECLARATION(in),
-    TENSOR3D_DECLARATION(out),
+    TENSOR4D_DECLARATION(in),
+    TENSOR4D_DECLARATION(out),
     const float input_width,
     const float input_height,
     const float scale_x,
     const float scale_y)
 {
-    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(in);
-    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
 
 #ifdef SAMPLING_POLICY_TOP_LEFT
     const float new_x = get_global_id(1) * scale_x;
-    const float new_y = get_global_id(2) * scale_y;
+    const float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y;
 #elif SAMPLING_POLICY_CENTER
     const float new_x = (get_global_id(1) + 0.5f) * scale_x - 0.5f;
-    const float new_y = (get_global_id(2) + 0.5f) * scale_y - 0.5f;
+    const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y - 0.5f;
 #else /* SAMPLING_POLICY */
 #error("Unsupported sampling policy");
 #endif /* SAMPLING_POLICY */
@@ -150,10 +152,10 @@
     clamped_x1_ = select(clamped_x1_, 0.0f - BORDER_SIZE, new_xf + 1 < 0.f || new_xf + 1 > input_width - 1 || new_yf < 0.f || new_yf > input_height - 1);
 #endif /* BORDER_MODE_REPLICATE */
 
-    int4 ins = (int4)(*((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y))),
-                      *((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x1_), convert_int(clamped_y))),
-                      *((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x_), convert_int(clamped_y1))),
-                      *((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1))));
+    int4 ins = (int4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
+                      *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1_), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
+                      *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x_), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))),
+                      *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))));
 
     const float  a      = new_x - new_xf;
     const float  b      = 1.f - a;
@@ -167,3 +169,4 @@
 
     *((__global DATA_TYPE *)out.ptr) = res;
 }
+#endif /* defined(DEPTH_OUT) */
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/select.cl b/src/core/CL/cl_kernels/select.cl
new file mode 100644
index 0000000..d783ae2
--- /dev/null
+++ b/src/core/CL/cl_kernels/select.cl
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(SELECT_DATA_TYPE) && defined(VEC_SIZE)
+/** This function performs a select operation between two tensors when the condition tensor has the same rank.
+ *
+ * @attention The data_type needs to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar
+ * @attention The select operation data_type needs to be passed at compile time using -DSELECT_DATA_TYPE: e.g. -DSELECT_DATA_TYPE=uchar
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in]  c_ptr                             Pointer to the source tensor. Supported data types: U8
+ * @param[in]  c_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  c_step_x                          c_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  c_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  c_step_y                          c_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  c_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  c_step_z                          c_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  c_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[in]  x_ptr                             Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in]  x_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  x_step_x                          x_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  x_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  x_step_y                          x_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  x_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  x_step_z                          x_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  x_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[in]  y_ptr                             Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in]  y_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  y_step_x                          y_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  y_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  y_step_y                          y_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  y_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  y_step_z                          y_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  y_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void select_same_rank(
+    TENSOR3D_DECLARATION(c),
+    TENSOR3D_DECLARATION(x),
+    TENSOR3D_DECLARATION(y),
+    TENSOR3D_DECLARATION(out))
+{
+    // Get pixels pointer
+    Tensor3D c_t   = CONVERT_TO_TENSOR3D_STRUCT(c);
+    Tensor3D x_t   = CONVERT_TO_TENSOR3D_STRUCT(x);
+    Tensor3D y_t   = CONVERT_TO_TENSOR3D_STRUCT(y);
+    Tensor3D out_t = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+    // Load values
+    VEC_DATA_TYPE(SELECT_DATA_TYPE, VEC_SIZE)
+    in_c = CONVERT((VLOAD(VEC_SIZE)(0, (__global uchar *)c_t.ptr)), VEC_DATA_TYPE(SELECT_DATA_TYPE, VEC_SIZE));
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    in_x = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)x_t.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    in_y = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)y_t.ptr);
+
+    // Calculate and store result
+    VSTORE(VEC_SIZE)
+    (select(in_y, in_x, in_c > (SELECT_DATA_TYPE)0), 0, (__global DATA_TYPE *)out_t.ptr);
+}
+
+/** This function performs a select operation between two tensors when the condition tensor has a different rank.
+ *
+ * @attention The data_type needs to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar
+ * @attention The select operation data_type needs to be passed at compile time using -DSELECT_DATA_TYPE: e.g. -DSELECT_DATA_TYPE=uchar
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in]  c_ptr                             Pointer to the source tensor. Supported data types: U8
+ * @param[in]  c_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  c_step_x                          c_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  c_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[in]  x_ptr                             Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in]  x_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  x_step_x                          x_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  x_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  x_step_y                          x_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  x_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  x_step_z                          x_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  x_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[in]  y_ptr                             Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in]  y_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  y_step_x                          y_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  y_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  y_step_y                          y_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  y_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  y_step_z                          y_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  y_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void select_different_rank_2(
+    VECTOR_DECLARATION(c),
+    TENSOR3D_DECLARATION(x),
+    TENSOR3D_DECLARATION(y),
+    TENSOR3D_DECLARATION(out))
+{
+    const int c_idx = get_global_id(1);
+
+    // Get pixels pointer
+    Vector   c_t   = CONVERT_TO_VECTOR_STRUCT_NO_STEP(c);
+    Tensor3D x_t   = CONVERT_TO_TENSOR3D_STRUCT(x);
+    Tensor3D y_t   = CONVERT_TO_TENSOR3D_STRUCT(y);
+    Tensor3D out_t = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+    // Load values
+    VEC_DATA_TYPE(SELECT_DATA_TYPE, VEC_SIZE)
+    in_c = *((__global uchar *)(c_t.ptr + c_idx * c_t.stride_x));
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    in_x = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)x_t.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    in_y = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)y_t.ptr);
+
+    // Calculate and store result
+    VSTORE(VEC_SIZE)
+    (select(in_y, in_x, in_c > (SELECT_DATA_TYPE)0), 0, (__global DATA_TYPE *)out_t.ptr);
+}
+#endif /* defined(DATA_TYPE) && defined(SELECT_DATA_TYPE) && defined(VEC_SIZE) */
+
+#if defined(DATA_TYPE) && defined(SELECT_DATA_TYPE) && defined(VEC_SIZE) && defined(DEPTH_SIZE)
+/** This function performs a select operation between two tensors when the condition tensor has a different rank.
+ *
+ * @attention The data_type needs to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar
+ * @attention The select operation data_type needs to be passed at compile time using -DSELECT_DATA_TYPE: e.g. -DSELECT_DATA_TYPE=uchar
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in]  c_ptr                             Pointer to the source tensor. Supported data types: U8
+ * @param[in]  c_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  c_step_x                          c_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  c_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[in]  x_ptr                             Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in]  x_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  x_step_x                          x_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  x_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  x_step_y                          x_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  x_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  x_step_z                          x_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  x_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[in]  y_ptr                             Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in]  y_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  y_step_x                          y_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  y_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  y_step_y                          y_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  y_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  y_step_z                          y_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  y_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void select_different_rank_n(
+    VECTOR_DECLARATION(c),
+    TENSOR3D_DECLARATION(x),
+    TENSOR3D_DECLARATION(y),
+    TENSOR3D_DECLARATION(out))
+{
+    const int c_idx = get_global_id(2) / DEPTH_SIZE;
+
+    // Get pixels pointer
+    Vector   c_t   = CONVERT_TO_VECTOR_STRUCT_NO_STEP(c);
+    Tensor3D x_t   = CONVERT_TO_TENSOR3D_STRUCT(x);
+    Tensor3D y_t   = CONVERT_TO_TENSOR3D_STRUCT(y);
+    Tensor3D out_t = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+    // Load values
+    VEC_DATA_TYPE(SELECT_DATA_TYPE, VEC_SIZE)
+    in_c = *((__global uchar *)(c_t.ptr + c_idx * c_t.stride_x));
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    in_x = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)x_t.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    in_y = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)y_t.ptr);
+
+    // Calculate and store result
+    VSTORE(VEC_SIZE)
+    (select(in_y, in_x, in_c > (SELECT_DATA_TYPE)0), 0, (__global DATA_TYPE *)out_t.ptr);
+}
+#endif /* defined(DATA_TYPE) && defined(SELECT_DATA_TYPE) && defined(VEC_SIZE) && defined(DEPTH_SIZE) */
\ No newline at end of file
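All three select kernels compute an element-wise out = condition ? x : y; the different-rank variants merely broadcast one condition value per outer index (get_global_id(1) for rank 2, get_global_id(2) / DEPTH_SIZE for rank n). A minimal C sketch of the broadcast form (shapes and data illustrative):

#include <stdio.h>

/* Broadcast select: one condition value per row decides whether the whole row
 * is taken from x or from y, as in select_different_rank_2 above. */
static void select_rows(const unsigned char *c, const float *x, const float *y,
                        float *out, int rows, int cols)
{
    for(int r = 0; r < rows; ++r)
    {
        for(int col = 0; col < cols; ++col)
        {
            out[r * cols + col] = (c[r] > 0) ? x[r * cols + col] : y[r * cols + col];
        }
    }
}

int main(void)
{
    const unsigned char c[2] = { 1, 0 };
    const float x[4] = { 1.f, 2.f, 3.f, 4.f };
    const float y[4] = { 9.f, 8.f, 7.f, 6.f };
    float out[4];
    select_rows(c, x, y, out, 2, 2);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* prints 1 2 7 6 */
    return 0;
}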
diff --git a/src/core/CL/cl_kernels/slice_ops.cl b/src/core/CL/cl_kernels/slice_ops.cl
index bc3df47..97decee 100644
--- a/src/core/CL/cl_kernels/slice_ops.cl
+++ b/src/core/CL/cl_kernels/slice_ops.cl
@@ -64,7 +64,9 @@
     int offset = 0;
 
     // Offset X
-#if defined(START_0) && defined(STRIDE_0) && defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+#if defined(SHRINK_0)
+    input.ptr += (int)START_0 * input_stride_x;
+#elif defined(START_0) && defined(STRIDE_0) && defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
     // Check if access on width gets out of bounds
     // If it does shift access vector to access elements within bounds
     const int xi = (int)(get_global_id(0) * VEC_SIZE);
@@ -77,20 +79,46 @@
 #endif // defined(START_0) && defined(STRIDE_0)
 
     // Offset Y
-#if defined(START_1) && defined(STRIDE_1)
+#if defined(SHRINK_1)
+    input.ptr += (int)START_1 * input_stride_y;
+#elif defined(START_1) && defined(STRIDE_1)
+#if defined(SHRINK_0)
+    offset = (int)START_1 + (int)get_global_id(0) * (int)STRIDE_1;
+#else  // defined(SHRINK_0)
     offset = (int)START_1 + (int)get_global_id(1) * (int)STRIDE_1;
+#endif // defined(SHRINK_0)
     input.ptr += offset * input_stride_y;
 #endif // defined(START_1) && defined(STRIDE_1)
 
     // Offset Z
-#if defined(START_2) && defined(STRIDE_2)
+#if defined(SHRINK_2)
+    input.ptr += (int)START_2 * input_stride_z;
+#elif defined(START_2) && defined(STRIDE_2)
+
+#if defined(SHRINK_1) && defined(SHRINK_0)
+    offset = (int)START_2 + (int)get_global_id(0) * (int)STRIDE_2;
+#elif defined(SHRINK_1) || defined(SHRINK_0)
+    offset = (int)START_2 + (int)get_global_id(1) * (int)STRIDE_2;
+#else  // defined(SHRINK_1) && defined(SHRINK_0)
     offset = (int)START_2 + ((int)get_global_id(2) % (int)DST_DEPTH) * (int)STRIDE_2;
+#endif // defined(SHRINK_1) && defined(SHRINK_0)
+
     input.ptr += offset * input_stride_z;
 #endif // defined(START_2) && defined(STRIDE_2)
 
     // Offset depth
-#if defined(START_3) && defined(STRIDE_3)
+#if defined(SHRINK_3)
+    input.ptr += (int)START_3 * input_stride_w;
+#elif defined(START_3) && defined(STRIDE_3)
+#if defined(SHRINK_2) && defined(SHRINK_1) && defined(SHRINK_0)
+    offset = (int)START_3 + (int)get_global_id(0) * (int)STRIDE_3;
+#elif !defined(SHRINK_2) && !defined(SHRINK_1) && !defined(SHRINK_0)
     offset = (int)START_3 + ((int)get_global_id(2) / (int)DST_DEPTH) * (int)STRIDE_3;
+#elif(defined(SHRINK_0) && defined(SHRINK_1)) || (defined(SHRINK_1) && defined(SHRINK_2)) || (defined(SHRINK_0) && defined(SHRINK_2))
+    offset = (int)START_3 + (int)get_global_id(1) * (int)STRIDE_3;
+#else  // defined(SHRINK_2) && defined(SHRINK_1) && defined(SHRINK_0)
+    offset = (int)START_3 + ((int)get_global_id(2) % (int)DST_DEPTH) * (int)STRIDE_3;
+#endif // defined(SHRINK_2) && defined(SHRINK_1) && defined(SHRINK_0)
     input.ptr += offset * input_stride_w;
 #endif // defined(START_3) && defined(STRIDE_3)
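The effect of the SHRINK_n branches is that a shrunk axis is pinned to its START value while the surviving axes consume the global ids in order. A compact C sketch of the same bookkeeping (flags, strides and coordinates are illustrative):

#include <stdio.h>

/* For each input axis: a shrunk axis is pinned to its start coordinate, every
 * other axis consumes the next output coordinate, scaled by its stride and
 * shifted by its start (mirroring the SHRINK/START/STRIDE ladder above). */
static long slice_offset(const int shrink[4], const int start[4], const int stride[4],
                         const long byte_stride[4], const int out_coord[4])
{
    long offset = 0;
    int  next   = 0; /* index of the next unconsumed output coordinate */
    for(int axis = 0; axis < 4; ++axis)
    {
        const int coord = shrink[axis] ? start[axis]
                                       : start[axis] + out_coord[next++] * stride[axis];
        offset += (long)coord * byte_stride[axis];
    }
    return offset;
}

int main(void)
{
    const int  shrink[4]      = { 0, 1, 0, 0 }; /* axis 1 is shrunk away */
    const int  start[4]       = { 0, 3, 1, 0 };
    const int  stride[4]      = { 1, 1, 2, 1 };
    const long byte_stride[4] = { 4, 64, 1024, 8192 }; /* illustrative */
    const int  out_coord[4]   = { 5, 2, 1, 0 };        /* only three are consumed */

    printf("input byte offset = %ld\n",
           slice_offset(shrink, start, stride, byte_stride, out_coord));
    return 0;
}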
 
diff --git a/src/core/CL/cl_kernels/space_to_batch.cl b/src/core/CL/cl_kernels/space_to_batch.cl
index d42a79d..79343d4 100644
--- a/src/core/CL/cl_kernels/space_to_batch.cl
+++ b/src/core/CL/cl_kernels/space_to_batch.cl
@@ -23,7 +23,7 @@
  */
 #include "helpers.h"
 
-#if defined(BATCH_SIZE) && defined(DATA_TYPE)
+#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(WIDTH_IN) && defined(HEIGHT_IN)
 /** Calculate the space to batch conversion.
  *
  * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
@@ -83,12 +83,15 @@
     const int out_y = get_global_id(1);
     const int z     = get_global_id(2);
 
-    if((out_x >= pad_left_x && out_x < WIDTH_OUT - pad_right_x) && (out_y >= pad_left_y && out_y < HEIGHT_OUT - pad_right_y))
+    const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
+    const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
+
+    if(((pos_y >= pad_left_y) && (pos_y < pad_left_y + HEIGHT_IN) && (pos_x >= pad_left_x) && (pos_x < pad_left_x + WIDTH_IN)))
     {
-        const int r                      = (BATCH_SIZE / (block_x * block_y));
-        const int w                      = batch_id % r;
-        const int in_x                   = (out_x - pad_left_x) * block_x + (batch_id / r) % block_x;
-        const int in_y                   = (out_y - pad_left_y) * block_y + (batch_id / r) / block_x;
+        const int w    = batch_id % BATCH_IN;
+        const int in_x = pos_x - pad_left_x;
+        const int in_y = pos_y - pad_left_y;
+
         *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, w));
     }
 }
@@ -151,18 +154,21 @@
     const int out_y = get_global_id(2);
     const int z     = get_global_id(0);
 
-    if((out_x >= pad_left_x && out_x < WIDTH_OUT - pad_right_x) && (out_y >= pad_left_y && out_y < HEIGHT_OUT - pad_right_y))
+    const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
+    const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
+
+    if(((pos_y >= pad_left_y) && (pos_y < pad_left_y + HEIGHT_IN) && (pos_x >= pad_left_x) && (pos_x < pad_left_x + WIDTH_IN)))
     {
-        const int r                      = (BATCH_SIZE / (block_x * block_y));
-        const int w                      = batch_id % r;
-        const int in_x                   = (out_x - pad_left_x) * block_x + (batch_id / r) % block_x;
-        const int in_y                   = (out_y - pad_left_y) * block_y + (batch_id / r) / block_x;
+        const int w    = batch_id % BATCH_IN;
+        const int in_x = pos_x - pad_left_x;
+        const int in_y = pos_y - pad_left_y;
+
         *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, w));
     }
 }
-#endif // defined(BATCH_SIZE) && defined(DATA_TYPE)
+#endif // defined(BATCH_SIZE) && defined(DATA_TYPE)  && defined(WIDTH_IN) && defined(HEIGHT_IN)
 
-#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y)
+#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN)
 /** Calculate the space to batch conversion.
  *
  * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
@@ -207,12 +213,15 @@
     const int out_y = get_global_id(1);
     const int z     = get_global_id(2);
 
-    if((out_x >= PAD_LEFT_X && out_x < WIDTH_OUT - PAD_RIGHT_X) && (out_y >= PAD_LEFT_Y && out_y < HEIGHT_OUT - PAD_RIGHT_Y))
+    const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
+    const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
+
+    if(pos_y >= PAD_LEFT_Y && pos_y < PAD_LEFT_Y + HEIGHT_IN && pos_x >= PAD_LEFT_X && pos_x < PAD_LEFT_X + WIDTH_IN)
     {
-        const int r                      = (BATCH_SIZE / (block_x * block_y));
-        const int w                      = batch_id % r;
-        const int in_x                   = (out_x - PAD_LEFT_X) * block_x + (batch_id / r) % block_x;
-        const int in_y                   = (out_y - PAD_LEFT_Y) * block_y + (batch_id / r) / block_x;
+        const int w    = batch_id % BATCH_IN;
+        const int in_x = pos_x - PAD_LEFT_X;
+        const int in_y = pos_y - PAD_LEFT_Y;
+
         *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, w));
     }
 }
@@ -260,13 +269,16 @@
     const int out_y = get_global_id(2);
     const int z     = get_global_id(0);
 
-    if((out_x >= PAD_LEFT_X && out_x < WIDTH_OUT - PAD_RIGHT_X) && (out_y >= PAD_LEFT_Y && out_y < HEIGHT_OUT - PAD_RIGHT_Y))
+    const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
+    const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
+
+    if(pos_y >= PAD_LEFT_Y && pos_y < PAD_LEFT_Y + HEIGHT_IN && pos_x >= PAD_LEFT_X && pos_x < PAD_LEFT_X + WIDTH_IN)
     {
-        const int r                      = (BATCH_SIZE / (block_x * block_y));
-        const int w                      = batch_id % r;
-        const int in_x                   = (out_x - PAD_LEFT_X) * block_x + (batch_id / r) % block_x;
-        const int in_y                   = (out_y - PAD_LEFT_Y) * block_y + (batch_id / r) / block_x;
+        const int w    = batch_id % BATCH_IN;
+        const int in_x = pos_x - PAD_LEFT_X;
+        const int in_y = pos_y - PAD_LEFT_Y;
+
         *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, w));
     }
 }
-#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y)
+#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y)  && defined(WIDTH_IN) && defined(HEIGHT_IN)
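The rewritten guard works forwards from the output element: each output batch corresponds to one (x, y) phase of the block, the padded-input position is pos = out * block + phase, and the copy happens only when pos lands inside the un-padded input window. A short C sketch of that mapping (sizes illustrative):

#include <stdio.h>

int main(void)
{
    /* Illustrative sizes; in the kernel these come from BATCH_IN, the block
     * shape arguments and the padding values. */
    const int BATCH_IN = 2, block_x = 2, block_y = 2;
    const int pad_left_x = 1, pad_left_y = 1, WIDTH_IN = 4, HEIGHT_IN = 4;

    const int out_x = 1, out_y = 1, batch_id = 5;

    /* Which (x, y) phase of the block this output batch corresponds to. */
    const int phase = batch_id / BATCH_IN;
    const int pos_x = out_x * block_x + (phase % block_x);
    const int pos_y = out_y * block_y + (phase / block_x);

    const int inside = (pos_y >= pad_left_y) && (pos_y < pad_left_y + HEIGHT_IN) &&
                       (pos_x >= pad_left_x) && (pos_x < pad_left_x + WIDTH_IN);
    if(inside)
    {
        printf("reads input element (x=%d, y=%d, batch=%d)\n",
               pos_x - pad_left_x, pos_y - pad_left_y, batch_id % BATCH_IN);
    }
    else
    {
        printf("falls inside the padding, nothing is copied\n");
    }
    return 0;
}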
diff --git a/src/core/CL/cl_kernels/stack_layer.cl b/src/core/CL/cl_kernels/stack_layer.cl
new file mode 100644
index 0000000..bed6266
--- /dev/null
+++ b/src/core/CL/cl_kernels/stack_layer.cl
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(AXIS) && defined(SRC_DIM2) && defined(DST_DIM3)
+
+#if AXIS == 0
+#define X_DST (idx_input)
+#define Y_DST (x_src)
+#define Z_DST (y_src)
+#define W_DST (z_src)
+#define K_DST (w_src)
+#elif AXIS == 1 // AXIS == 1
+#define X_DST (x_src)
+#define Y_DST (idx_input)
+#define Z_DST (y_src)
+#define W_DST (z_src)
+#define K_DST (w_src)
+#elif AXIS == 2 // AXIS == 2
+#define X_DST (x_src)
+#define Y_DST (y_src)
+#define Z_DST (idx_input)
+#define W_DST (z_src)
+#define K_DST (w_src)
+#elif AXIS == 3 // AXIS == 3
+#define X_DST (x_src)
+#define Y_DST (y_src)
+#define Z_DST (z_src)
+#define W_DST (idx_input)
+#define K_DST (w_src)
+#elif AXIS == 4 // AXIS == 4
+#define X_DST (x_src)
+#define Y_DST (y_src)
+#define Z_DST (z_src)
+#define W_DST (w_src)
+#define K_DST (idx_input)
+#else // AXIS not supported
+#error "Not supported axis"
+#endif // AXIS == 0
+
+/** OpenCL kernel to stack a rank-R tensor into one with rank-(R+1) along the axis dimension
+ *
+ * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
+ * @note The dimension to stack the tensors along has to be passed at compile time using -DAXIS. i.e. -DAXIS=1
+ * @note Dimension 2 of the input tensor must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM2=112)
+ * @note Dimension 3 of the output tensor must be passed at compile time using -DDST_DIM3 (e.g. -DDST_DIM3=112)
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  idx_input                         Index of the input tensor in the list of tensors to stack
+ */
+__kernel void stack_layer(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
+    unsigned int idx_input)
+{
+    uint x_src = get_global_id(0);
+    uint y_src = get_global_id(1);
+    uint z_src = (get_global_id(2) % SRC_DIM2);
+    uint w_src = (get_global_id(2) / SRC_DIM2);
+
+    __global DATA_TYPE *src = (__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + x_src * sizeof(DATA_TYPE) + y_src * src_stride_y + z_src * src_stride_z + w_src * src_stride_w);
+
+    __global DATA_TYPE *dst = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + X_DST * sizeof(DATA_TYPE) + Y_DST * dst_stride_y + Z_DST * dst_stride_z + W_DST * dst_stride_w + K_DST *
+                                                     dst_stride_w * (uint)DST_DIM3);
+
+    *dst = *src;
+}
+
+#undef X_DST
+#undef Y_DST
+#undef Z_DST
+#undef W_DST
+#undef K_DST
+#endif // defined(DATA_TYPE) && defined(AXIS) && defined(SRC_DIM2) && defined(DST_DIM3)
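The AXIS-selected X_DST..K_DST macros implement a coordinate insertion: idx_input is placed at position AXIS in the output coordinate tuple and the remaining source coordinates shift up by one. A plain C sketch of the same remapping (helper name illustrative):

#include <stdio.h>

/* Build the rank-5 destination coordinate by inserting the tensor index
 * `idx_input` at position `axis` into the rank-4 source coordinate, as the
 * X_DST..K_DST macro blocks above do for each supported AXIS value. */
static void stack_coord(const unsigned src[4], unsigned axis, unsigned idx_input, unsigned dst[5])
{
    unsigned s = 0;
    for(unsigned d = 0; d < 5; ++d)
    {
        dst[d] = (d == axis) ? idx_input : src[s++];
    }
}

int main(void)
{
    const unsigned src[4] = { 3, 7, 1, 0 }; /* x, y, z, w of the source element */
    unsigned dst[5];
    stack_coord(src, 2, 4, dst); /* AXIS == 2, fifth input tensor */
    printf("(%u,%u,%u,%u,%u)\n", dst[0], dst[1], dst[2], dst[3], dst[4]); /* (3,7,4,1,0) */
    return 0;
}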
diff --git a/src/core/CL/cl_kernels/tile.cl b/src/core/CL/cl_kernels/tile.cl
new file mode 100644
index 0000000..ae625d9
--- /dev/null
+++ b/src/core/CL/cl_kernels/tile.cl
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#if defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH) && defined(DST_DEPTH)
+/** Perform a tile operation on an input tensor.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Can only take floating point data types.
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void tile(
+    TENSOR4D_DECLARATION(input),
+    TENSOR4D_DECLARATION(output))
+{
+    Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH);
+    Tensor4D input  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, SRC_DEPTH);
+
+    // For all coordinates but x, wrap around the input extent so that each output tile reads from the input
+    const int y     = get_global_id(1);
+    const int z     = get_global_id(2) % DST_DEPTH;
+    const int batch = get_global_id(2) / DST_DEPTH;
+
+#if defined(VEC_SIZE) && defined(OFFSET)
+    // If we are loading/storing multiple elements at a time, we must not
+    // exceed the input boundaries. The last thread of each tile has to backtrack
+    // by OFFSET elements, and those offsets accumulate across the previous tiles
+    const int id = (int)(get_global_id(0));
+    int       x  = id * VEC_SIZE;
+
+    // Shift x based on the previous offsets
+    const int tile_number = x / SRC_WIDTH;
+    x -= (tile_number) * OFFSET;
+    int x_input = x % SRC_WIDTH;
+
+    // Shift x again if this vector would read past the end of the tile
+    const int last_tile = (int)(x_input + VEC_SIZE > SRC_WIDTH);
+    x -= last_tile * OFFSET;
+    x_input = x % SRC_WIDTH;
+    output.ptr -= (tile_number + last_tile) * OFFSET * output_stride_x;
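+    // Worked example (illustrative, assuming the host sets OFFSET = ceil(SRC_WIDTH / VEC_SIZE) * VEC_SIZE - SRC_WIDTH):
+    // with SRC_WIDTH = 10, VEC_SIZE = 4, OFFSET = 2, the work-item with id = 2 starts at x = 8; since 8 + 4 > 10
+    // it backtracks to x = 6 and re-writes elements 6-9, while id = 3 (x = 12) lands on x_input = 0 of the next tile.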
+
+    // Update the input pointer
+    input.ptr = tensor4D_offset(&input, x_input, y % SRC_HEIGHT, z % SRC_DEPTH, batch % SRC_BATCHES);
+
+    // Copy the data
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
+
+    VSTORE(VEC_SIZE)
+    (data, 0, (__global DATA_TYPE *)output.ptr);
+#else  // !defined(VEC_SIZE) || !defined(OFFSET)
+    const int x = get_global_id(0);
+
+    // Update the input pointer
+    input.ptr = tensor4D_offset(&input, x % SRC_WIDTH, y % SRC_HEIGHT, z % SRC_DEPTH, batch % SRC_BATCHES);
+
+    *((__global DATA_TYPE *)(output.ptr)) = *((__global DATA_TYPE *)(input.ptr));
+#endif // defined(VEC_SIZE) && defined(OFFSET)
+}
+#endif // defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH) && defined(DST_DEPTH)
diff --git a/src/core/CL/cl_kernels/winograd_output_transform.cl b/src/core/CL/cl_kernels/winograd_output_transform.cl
index f52b027..e979978 100644
--- a/src/core/CL/cl_kernels/winograd_output_transform.cl
+++ b/src/core/CL/cl_kernels/winograd_output_transform.cl
@@ -23,7 +23,15 @@
  */
 #include "helpers.h"
 
+#if defined(FUSED_ACTIVATION)
+#include "activation_layer.cl"
+#define ACTIVATION_FUNC(x) ACTIVATION_OP(FUSED_ACTIVATION, x)
+#else /* defined(FUSED_ACTIVATION) */
+#define ACTIVATION_FUNC(x) (x)
+#endif /* defined(FUSED_ACTIVATION) */
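+// For example, building with -DFUSED_ACTIVATION=relu makes ACTIVATION_FUNC(x) expand to ACTIVATION_OP(relu, x),
+// dispatching to the activation implementation pulled in from activation_layer.cl (assumed to define ACTIVATION_OP);
+// without FUSED_ACTIVATION the macro is the identity and the transform results are stored unmodified.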
+
 #if defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
+#if defined(VEC_SIZE) && VEC_SIZE == 2
 /** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 3x3/3x1 or 1x3 and the data layout is NCHW
  *
  * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
@@ -32,6 +40,10 @@
  * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
  * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
  * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ * @note It is possible to select the activation function to apply using -DFUSED_ACTIVATION e.g. -DFUSED_ACTIVATION=relu
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. Accepted values are -DVEC_SIZE=2 (for output_tile_size 2x2, 2x1, 1x2) and -DVEC_SIZE=4 (for output_tile_size 4x4, 4x1, 1x4)
+ * @note The select data type should also be passed using -DSELECT_DATA_TYPE, e.g. -DSELECT_DATA_TYPE=int
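+ * @note Illustrative option set for a fused-ReLU 2x2 build (combination assumed, not exhaustive): -DNUM_TILES_X=16 -DDATA_TYPE=float -DSELECT_DATA_TYPE=int -DVEC_SIZE=2 -DFUSED_ACTIVATION=relu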
  *
  * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
@@ -86,6 +98,7 @@
     float out00 = d00 + d01 + d02;
     float out01 = d01 - d02 - d03;
 #else  // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
     DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
     DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
     DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
@@ -150,10 +163,12 @@
 
     // Store the output tile
 #if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-    *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = (DATA_TYPE)out00;
-    *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = (DATA_TYPE)out01;
+    const VEC_DATA_TYPE(DATA_TYPE, 2)
+    out0_dt                                            = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)));
+    *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
+    *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
 #else  // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-    vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(out00, out01), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
+    vstore2(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2))), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
 #endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
 
 #if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
@@ -162,11 +177,12 @@
     out10 += (DATA_TYPE)b;
     out11 += (DATA_TYPE)b;
 #endif // defined(HAS_BIAS)
-
-    vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))((DATA_TYPE)out10, (DATA_TYPE)out11), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
+    vstore2(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 2))(out10, out11), VEC_DATA_TYPE(DATA_TYPE, 2))), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
 #endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
 }
+#endif // defined(VEC_SIZE) && VEC_SIZE == 2
 
+#if defined(VEC_SIZE) && VEC_SIZE == 4
 /** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, the filter size 3x3 and the data layout is NCHW
  *
  * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
@@ -230,6 +246,7 @@
     float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;
     float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;
 #else  // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
     DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
     DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
     DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
@@ -351,12 +368,14 @@
 
     // Store the output tile
 #if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-    *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = (DATA_TYPE)out00;
-    *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = (DATA_TYPE)out01;
-    *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = (DATA_TYPE)out02;
-    *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = (DATA_TYPE)out03;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    out0_dt                                                = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));
+    *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
+    *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
+    *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;
+    *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;
 #else  // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-    vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))((DATA_TYPE)out00, (DATA_TYPE)out01, (DATA_TYPE)out02, (DATA_TYPE)out03), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
+    vstore4(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4))), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
 #endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
 
 #if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
@@ -377,9 +396,9 @@
     out32 += (float)b;
     out33 += (float)b;
 #endif // defined(HAS_BIAS)
-    vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))((DATA_TYPE)out10, (DATA_TYPE)out11, (DATA_TYPE)out12, (DATA_TYPE)out13), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
-    vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))((DATA_TYPE)out20, (DATA_TYPE)out21, (DATA_TYPE)out22, (DATA_TYPE)out23), 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
-    vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))((DATA_TYPE)out30, (DATA_TYPE)out31, (DATA_TYPE)out32, (DATA_TYPE)out33), 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
+    vstore4(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), VEC_DATA_TYPE(DATA_TYPE, 4))), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
+    vstore4(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), VEC_DATA_TYPE(DATA_TYPE, 4))), 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
+    vstore4(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33), VEC_DATA_TYPE(DATA_TYPE, 4))), 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
 #endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
 }
 
@@ -579,25 +598,29 @@
 #if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
 #if defined(SRC_DEPTH)
     int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
-#else /* defined(SRC_DEPTH) */
+#else                                                                               /* defined(SRC_DEPTH) */
     int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
-#endif /* defined(SRC_DEPTH) */
+#endif                                                                              /* defined(SRC_DEPTH) */
     offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
 
     // Store the 1x4 output tile
-    *((__global DATA_TYPE *)(dst_ptr + offset.s0)) = (DATA_TYPE)out00;
-    *((__global DATA_TYPE *)(dst_ptr + offset.s1)) = (DATA_TYPE)out01;
-    *((__global DATA_TYPE *)(dst_ptr + offset.s2)) = (DATA_TYPE)out02;
-    *((__global DATA_TYPE *)(dst_ptr + offset.s3)) = (DATA_TYPE)out03;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    out0_dt                                        = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));
+    *((__global DATA_TYPE *)(dst_ptr + offset.s0)) = out0_dt.s0;
+    *((__global DATA_TYPE *)(dst_ptr + offset.s1)) = out0_dt.s1;
+    *((__global DATA_TYPE *)(dst_ptr + offset.s2)) = out0_dt.s2;
+    *((__global DATA_TYPE *)(dst_ptr + offset.s3)) = out0_dt.s3;
 #elif defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
     // Store the 4x1 output tile
     int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
     int mult_y = min(dst_size - offset, 1);
 
-    *((__global DATA_TYPE *)(dst_ptr + mult_y * 0 * dst_stride_y + offset)) = (DATA_TYPE)out00;
-    *((__global DATA_TYPE *)(dst_ptr + mult_y * 1 * dst_stride_y + offset)) = (DATA_TYPE)out01;
-    *((__global DATA_TYPE *)(dst_ptr + mult_y * 2 * dst_stride_y + offset)) = (DATA_TYPE)out02;
-    *((__global DATA_TYPE *)(dst_ptr + mult_y * 3 * dst_stride_y + offset)) = (DATA_TYPE)out03;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    out0_dt                                                                 = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));
+    *((__global DATA_TYPE *)(dst_ptr + mult_y * 0 * dst_stride_y + offset)) = out0_dt.s0;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y * 1 * dst_stride_y + offset)) = out0_dt.s1;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y * 2 * dst_stride_y + offset)) = out0_dt.s2;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y * 3 * dst_stride_y + offset)) = out0_dt.s3;
 #else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
     // Get output address
 #if defined(SRC_DEPTH)
@@ -609,22 +632,30 @@
     int4 mult_y = min((int4)dst_size - offset, (int4)1);                                 // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
 
     // Store the 4x4 output tile
-    *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s0)) = (DATA_TYPE)out00;
-    *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s0)) = (DATA_TYPE)out01;
-    *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s0)) = (DATA_TYPE)out02;
-    *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s0)) = (DATA_TYPE)out03;
-    *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * dst_stride_y + offset.s1)) = (DATA_TYPE)out10;
-    *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * dst_stride_y + offset.s1)) = (DATA_TYPE)out11;
-    *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * dst_stride_y + offset.s1)) = (DATA_TYPE)out12;
-    *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * dst_stride_y + offset.s1)) = (DATA_TYPE)out13;
-    *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * dst_stride_y + offset.s2)) = (DATA_TYPE)out20;
-    *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * dst_stride_y + offset.s2)) = (DATA_TYPE)out21;
-    *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * dst_stride_y + offset.s2)) = (DATA_TYPE)out22;
-    *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * dst_stride_y + offset.s2)) = (DATA_TYPE)out23;
-    *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * dst_stride_y + offset.s3)) = (DATA_TYPE)out30;
-    *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * dst_stride_y + offset.s3)) = (DATA_TYPE)out31;
-    *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * dst_stride_y + offset.s3)) = (DATA_TYPE)out32;
-    *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * dst_stride_y + offset.s3)) = (DATA_TYPE)out33;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    out1_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), VEC_DATA_TYPE(DATA_TYPE, 4)));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    out2_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), VEC_DATA_TYPE(DATA_TYPE, 4)));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    out3_dt                                                                       = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33), VEC_DATA_TYPE(DATA_TYPE, 4)));
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s0)) = out0_dt.s0;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s0)) = out0_dt.s1;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s0)) = out0_dt.s2;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s0)) = out0_dt.s3;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * dst_stride_y + offset.s1)) = out1_dt.s0;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * dst_stride_y + offset.s1)) = out1_dt.s1;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * dst_stride_y + offset.s1)) = out1_dt.s2;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * dst_stride_y + offset.s1)) = out1_dt.s3;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * dst_stride_y + offset.s2)) = out2_dt.s0;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * dst_stride_y + offset.s2)) = out2_dt.s1;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * dst_stride_y + offset.s2)) = out2_dt.s2;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * dst_stride_y + offset.s2)) = out2_dt.s3;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * dst_stride_y + offset.s3)) = out3_dt.s0;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * dst_stride_y + offset.s3)) = out3_dt.s1;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * dst_stride_y + offset.s3)) = out3_dt.s2;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * dst_stride_y + offset.s3)) = out3_dt.s3;
 
 #endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
 }
@@ -690,6 +721,7 @@
     Tensor4D       src             = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
     const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
 #else  /* defined(SRC_DEPTH) */
+
     Tensor3D       src             = CONVERT_TO_TENSOR3D_STRUCT(src);
     const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
 #endif /* defined(SRC_DEPTH) */
@@ -706,6 +738,7 @@
 #if defined(SRC_DEPTH)
     __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
 #else  /* defined(SRC_DEPTH) */
+
     __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
 #endif /* defined(SRC_DEPTH) */
 
@@ -740,15 +773,18 @@
 
     // Store the output tile
 #if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-    *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out00;
-    *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out01;
-    *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out02;
-    *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out03;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    out0_dt                                                = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));
+    *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
+    *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
+    *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;
+    *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;
 #else  // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-    vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out00, out01, out02, out03), 0, (__global DATA_TYPE *)(dst_addr));
+    vstore4(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4))), 0, (__global DATA_TYPE *)(dst_addr));
 #endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
 
 #else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
     DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
     DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
     DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
@@ -859,10 +895,10 @@
 #endif // defined(HAS_BIAS)
 
     // Store the output tile
-    vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))((DATA_TYPE)out_col0.s0, (DATA_TYPE)out_col1.s0, (DATA_TYPE)out_col2.s0, (DATA_TYPE)out_col3.s0), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
-    vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))((DATA_TYPE)out_col0.s1, (DATA_TYPE)out_col1.s1, (DATA_TYPE)out_col2.s1, (DATA_TYPE)out_col3.s1), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
-    vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))((DATA_TYPE)out_col0.s2, (DATA_TYPE)out_col1.s2, (DATA_TYPE)out_col2.s2, (DATA_TYPE)out_col3.s2), 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
-    vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))((DATA_TYPE)out_col0.s3, (DATA_TYPE)out_col1.s3, (DATA_TYPE)out_col2.s3, (DATA_TYPE)out_col3.s3), 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
+    vstore4(ACTIVATION_FUNC((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s0, out_col1.s0, out_col2.s0, out_col3.s0)), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
+    vstore4(ACTIVATION_FUNC((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s1, out_col1.s1, out_col2.s1, out_col3.s1)), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
+    vstore4(ACTIVATION_FUNC((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s2, out_col1.s2, out_col2.s2, out_col3.s2)), 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
+    vstore4(ACTIVATION_FUNC((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s3, out_col1.s3, out_col2.s3, out_col3.s3)), 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
 #endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
 }
 
@@ -960,18 +996,21 @@
 #endif                                                                              /* defined(SRC_DEPTH) */
     offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
 
-    *(__global DATA_TYPE *)(dst_ptr + offset.s0) = (DATA_TYPE)out00;
-    *(__global DATA_TYPE *)(dst_ptr + offset.s1) = (DATA_TYPE)out01;
-    *(__global DATA_TYPE *)(dst_ptr + offset.s2) = (DATA_TYPE)out02;
-    *(__global DATA_TYPE *)(dst_ptr + offset.s3) = (DATA_TYPE)out03;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    out0_dt                                      = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));
+    *(__global DATA_TYPE *)(dst_ptr + offset.s0) = out0_dt.s0;
+    *(__global DATA_TYPE *)(dst_ptr + offset.s1) = out0_dt.s1;
+    *(__global DATA_TYPE *)(dst_ptr + offset.s2) = out0_dt.s2;
+    *(__global DATA_TYPE *)(dst_ptr + offset.s3) = out0_dt.s3;
 #else  // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
     // Get output address
     int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
-
-    *(__global DATA_TYPE *)(dst_ptr + 0 * dst_stride_y + offset) = (DATA_TYPE)out00;
-    *(__global DATA_TYPE *)(dst_ptr + 1 * dst_stride_y + offset) = (DATA_TYPE)out01;
-    *(__global DATA_TYPE *)(dst_ptr + 2 * dst_stride_y + offset) = (DATA_TYPE)out02;
-    *(__global DATA_TYPE *)(dst_ptr + 3 * dst_stride_y + offset) = (DATA_TYPE)out03;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    out0_dt                                                      = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));
+    *(__global DATA_TYPE *)(dst_ptr + 0 * dst_stride_y + offset) = out0_dt.s0;
+    *(__global DATA_TYPE *)(dst_ptr + 1 * dst_stride_y + offset) = out0_dt.s1;
+    *(__global DATA_TYPE *)(dst_ptr + 2 * dst_stride_y + offset) = out0_dt.s2;
+    *(__global DATA_TYPE *)(dst_ptr + 3 * dst_stride_y + offset) = out0_dt.s3;
 #endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
 
 #else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
@@ -1094,26 +1133,37 @@
     int4 mult_y = min((int4)dst_size - offset, (int4)1);                                 // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
 
     // Store the output tile
-    *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * (int)dst_stride_y + offset.s0) = (DATA_TYPE)out_col0.s0;
-    *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * (int)dst_stride_y + offset.s0) = (DATA_TYPE)out_col1.s0;
-    *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * (int)dst_stride_y + offset.s0) = (DATA_TYPE)out_col2.s0;
-    *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * (int)dst_stride_y + offset.s0) = (DATA_TYPE)out_col3.s0;
-    *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * (int)dst_stride_y + offset.s1) = (DATA_TYPE)out_col0.s1;
-    *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * (int)dst_stride_y + offset.s1) = (DATA_TYPE)out_col1.s1;
-    *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * (int)dst_stride_y + offset.s1) = (DATA_TYPE)out_col2.s1;
-    *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * (int)dst_stride_y + offset.s1) = (DATA_TYPE)out_col3.s1;
-    *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * (int)dst_stride_y + offset.s2) = (DATA_TYPE)out_col0.s2;
-    *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * (int)dst_stride_y + offset.s2) = (DATA_TYPE)out_col1.s2;
-    *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * (int)dst_stride_y + offset.s2) = (DATA_TYPE)out_col2.s2;
-    *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * (int)dst_stride_y + offset.s2) = (DATA_TYPE)out_col3.s2;
-    *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * (int)dst_stride_y + offset.s3) = (DATA_TYPE)out_col0.s3;
-    *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * (int)dst_stride_y + offset.s3) = (DATA_TYPE)out_col1.s3;
-    *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * (int)dst_stride_y + offset.s3) = (DATA_TYPE)out_col2.s3;
-    *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * (int)dst_stride_y + offset.s3) = (DATA_TYPE)out_col3.s3;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    out_col0_dt = ACTIVATION_FUNC(CONVERT(out_col0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    out_col1_dt = ACTIVATION_FUNC(CONVERT(out_col1, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    out_col2_dt = ACTIVATION_FUNC(CONVERT(out_col2, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    out_col3_dt = ACTIVATION_FUNC(CONVERT(out_col3, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));
+
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * (int)dst_stride_y + offset.s0) = out_col0_dt.s0;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * (int)dst_stride_y + offset.s0) = out_col1_dt.s0;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * (int)dst_stride_y + offset.s0) = out_col2_dt.s0;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * (int)dst_stride_y + offset.s0) = out_col3_dt.s0;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * (int)dst_stride_y + offset.s1) = out_col0_dt.s1;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * (int)dst_stride_y + offset.s1) = out_col1_dt.s1;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * (int)dst_stride_y + offset.s1) = out_col2_dt.s1;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * (int)dst_stride_y + offset.s1) = out_col3_dt.s1;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * (int)dst_stride_y + offset.s2) = out_col0_dt.s2;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * (int)dst_stride_y + offset.s2) = out_col1_dt.s2;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * (int)dst_stride_y + offset.s2) = out_col2_dt.s2;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * (int)dst_stride_y + offset.s2) = out_col3_dt.s2;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * (int)dst_stride_y + offset.s3) = out_col0_dt.s3;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * (int)dst_stride_y + offset.s3) = out_col1_dt.s3;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * (int)dst_stride_y + offset.s3) = out_col2_dt.s3;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * (int)dst_stride_y + offset.s3) = out_col3_dt.s3;
 #endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
 }
+#endif // defined(VEC_SIZE) && VEC_SIZE == 4
 
 #if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
+#if defined(VEC_SIZE) && VEC_SIZE == 2
 /** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 3x1 and the data layout is NCHW
  *
  * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
@@ -1181,7 +1231,9 @@
 #endif // defined(HAS_BIAS)
                                           );
 }
+#endif // defined(VEC_SIZE) && VEC_SIZE == 2
 
+#if defined(VEC_SIZE) && VEC_SIZE == 4
 /** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NCHW
  *
  * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
@@ -1449,9 +1501,11 @@
 #endif // defined(HAS_BIAS)
                                            dst_size);
 }
+#endif // defined(VEC_SIZE) && VEC_SIZE == 4
 #endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
 
 #if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+#if defined(VEC_SIZE) && VEC_SIZE == 2
 /** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x3 and the data layout is NCHW
  *
  * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
@@ -1519,7 +1573,9 @@
 #endif // defined(HAS_BIAS)
                                           );
 }
+#endif // defined(VEC_SIZE) && VEC_SIZE == 2
 
+#if defined(VEC_SIZE) && VEC_SIZE == 4
 /** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NCHW
  *
  * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
@@ -1787,5 +1843,6 @@
 #endif // defined(HAS_BIAS)
                                            dst_size);
 }
+#endif // defined(VEC_SIZE) && VEC_SIZE == 4
 #endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
 #endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp
index 73a4d7d..100184d 100644
--- a/src/core/CL/kernels/CLActivationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -129,24 +129,25 @@
         b_const_int = input->info()->quantization_info().quantize(b_const, RoundingPolicy::TO_NEAREST_UP);
     }
 
+    const bool is_logistic_activation_quantized = is_data_type_quantized_asymmetric(dt) && act_info.activation() == ActivationLayerInfo::ActivationFunction::LOGISTIC;
     // Set build options
-    std::set<std::string> build_opts;
-    build_opts.emplace(("-DACT=" + lower_string(string_from_activation_func(act_info.activation()))));
-    build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
-    build_opts.emplace(("-DSELECT_DATA_TYPE=" + get_cl_select_type_from_data_type(dt)));
-    build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+    CLBuildOptions build_opts;
+    build_opts.add_option_if(!is_logistic_activation_quantized, "-DACT=" + lower_string(string_from_activation_func(act_info.activation())));
+    build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
+    build_opts.add_option(("-DSELECT_DATA_TYPE=" + get_cl_select_type_from_data_type(dt)));
+    build_opts.add_option(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
 
     if(is_data_type_quantized(dt))
     {
-        build_opts.emplace(("-DA_VAL=" + support::cpp11::to_string(a_const_int)));
-        build_opts.emplace(("-DB_VAL=" + support::cpp11::to_string(b_const_int)));
+        build_opts.add_option(("-DA_VAL=" + support::cpp11::to_string(a_const_int)));
+        build_opts.add_option(("-DB_VAL=" + support::cpp11::to_string(b_const_int)));
 
         const int   o1 = input->info()->quantization_info().offset;
         const float s1 = input->info()->quantization_info().scale;
         // Quantized value of 0 corresponds to the offset o1
-        build_opts.emplace(("-DCONST_0=" + support::cpp11::to_string(o1)));
-        build_opts.emplace(("-DS1_VAL=" + float_to_string_with_full_precision(s1)));
-        build_opts.emplace(("-DO1_VAL=" + support::cpp11::to_string(o1)));
+        build_opts.add_option(("-DCONST_0=" + support::cpp11::to_string(o1)));
+        build_opts.add_option(("-DS1_VAL=" + float_to_string_with_full_precision(s1)));
+        build_opts.add_option(("-DO1_VAL=" + support::cpp11::to_string(o1)));
 
         // Set scale and offset of the input and output if they have different quantization info
         if(is_data_type_quantized_asymmetric(dt) && output != nullptr)
@@ -156,22 +157,26 @@
 
             if(o1 != o2 || s1 != s2)
             {
-                build_opts.emplace(("-DS2_VAL=" + float_to_string_with_full_precision(s2)));
-                build_opts.emplace(("-DO2_VAL=" + support::cpp11::to_string(o2)));
+                build_opts.add_option(("-DS2_VAL=" + float_to_string_with_full_precision(s2)));
+                build_opts.add_option(("-DO2_VAL=" + support::cpp11::to_string(o2)));
             }
         }
     }
     else
     {
-        build_opts.emplace(("-DA_VAL=" + float_to_string_with_full_precision(a_const)));
-        build_opts.emplace(("-DB_VAL=" + float_to_string_with_full_precision(b_const)));
+        build_opts.add_option(("-DA_VAL=" + float_to_string_with_full_precision(a_const)));
+        build_opts.add_option(("-DB_VAL=" + float_to_string_with_full_precision(b_const)));
     }
 
-    build_opts.emplace((_run_in_place) ? "-DIN_PLACE" : "");
+    build_opts.add_option_if(_run_in_place, "-DIN_PLACE");
 
     // Create kernel
-    std::string kernel_name = is_data_type_quantized_asymmetric(dt) ? std::string("activation_layer_qa8") : std::string("activation_layer");
-    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+    std::string kernel_name = std::string("activation_layer");
+    if(is_data_type_quantized_asymmetric(dt))
+    {
+        kernel_name += is_logistic_activation_quantized ? std::string("_logistic_qa8") : std::string("_qa8");
+    }
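+    // Resulting kernel names (from the logic above): "activation_layer" for float types,
+    // "activation_layer_qa8" for QASYMM8, and "activation_layer_logistic_qa8" for QASYMM8 with LOGISTIC.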
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
     // Make sure _kernel is initialized before calling the parent's configure
     _input  = input;
diff --git a/src/core/CL/kernels/CLArithmeticDivisionKernel.cpp b/src/core/CL/kernels/CLArithmeticDivisionKernel.cpp
deleted file mode 100644
index e995ba1..0000000
--- a/src/core/CL/kernels/CLArithmeticDivisionKernel.cpp
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLArithmeticDivisionKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input1);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
-
-    const TensorShape out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
-
-    // Validate in case of configured output
-    if(output->total_size() > 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
-                                        "Wrong shape for output");
-    }
-
-    return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
-{
-    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
-    const TensorShape &out_shape    = broadcast_pair.first;
-    const ValidRegion &valid_region = broadcast_pair.second;
-
-    // Auto initialize output if not initialized
-    {
-        set_shape_if_empty(*output, out_shape);
-
-        if(input1->data_type() == DataType::F16 && input2->data_type() == DataType::F16)
-        {
-            set_format_if_unknown(*output, Format::F16);
-        }
-        else if(input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32)
-        {
-            set_format_if_unknown(*output, Format::F32);
-        }
-    }
-
-    Window win        = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
-    Window win_input1 = win.broadcast_if_dimension_le_one(*input1);
-    Window win_input2 = win.broadcast_if_dimension_le_one(*input2);
-
-    AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
-    bool window_changed = update_window_and_padding(win_input1, input1_access)
-                          || update_window_and_padding(win_input2, input2_access)
-                          || update_window_and_padding(win, output_access);
-
-    output_access.set_valid_region(win, valid_region);
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
-} // namespace
-
-CLArithmeticDivisionKernel::CLArithmeticDivisionKernel()
-    : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void CLArithmeticDivisionKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info()));
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-
-    _input1 = input1;
-    _input2 = input2;
-    _output = output;
-
-    // Set kernel build options
-    std::set<std::string> build_opts;
-    build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
-    build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
-    build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
-
-    // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("arithmetic_div", build_opts));
-
-    ICLKernel::configure_internal(win_config.second);
-}
-
-Status CLArithmeticDivisionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
-
-    return Status{};
-}
-
-void CLArithmeticDivisionKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    const TensorShape &in_shape1 = _input1->info()->tensor_shape();
-    const TensorShape &in_shape2 = _input2->info()->tensor_shape();
-    const TensorShape &out_shape = _output->info()->tensor_shape();
-
-    bool can_collapse = true;
-    if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
-    {
-        can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
-        for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
-        {
-            can_collapse = (in_shape1[d] == in_shape2[d]);
-        }
-    }
-
-    bool   has_collapsed = false;
-    Window collapsed     = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
-
-    const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
-    const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
-
-    Window slice        = collapsed.first_slice_window_3D();
-    Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
-    Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
-
-    do
-    {
-        unsigned int idx = 0;
-
-        add_3D_tensor_argument(idx, _input1, slice_input1);
-        add_3D_tensor_argument(idx, _input2, slice_input2);
-        add_3D_tensor_argument(idx, _output, slice);
-
-        enqueue(queue, *this, slice);
-
-        collapsed.slide_window_slice_3D(slice_input1);
-        collapsed.slide_window_slice_3D(slice_input2);
-    }
-    while(collapsed.slide_window_slice_3D(slice));
-}
-
-BorderSize CLArithmeticDivisionKernel::border_size() const
-{
-    const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
-    const unsigned int border        = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
-    return BorderSize(0, border, 0, 0);
-}
diff --git a/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp b/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
deleted file mode 100644
index 95d2011..0000000
--- a/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Window.h"
-
-#include <set>
-#include <string>
-
-namespace arm_compute
-{
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ConvertPolicy policy)
-{
-    ARM_COMPUTE_UNUSED(policy);
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input1);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input2);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
-    const bool is_qasymm = is_data_type_quantized_asymmetric(input1.data_type()) || is_data_type_quantized_asymmetric(input2.data_type());
-    if(is_qasymm)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2);
-    }
-
-    const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
-
-    // Validate in case of configured output
-    if(output.total_size() > 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&output);
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG((output.data_type() == DataType::U8) && ((input1.data_type() != DataType::U8) || (input2.data_type() != DataType::U8)),
-                                        "Output can only be U8 if both inputs are U8");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
-                                        "Wrong shape for output");
-        if(is_qasymm)
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &output);
-        }
-    }
-
-    return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
-{
-    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2);
-    const TensorShape &out_shape    = broadcast_pair.first;
-    const ValidRegion &valid_region = broadcast_pair.second;
-
-    // Auto initialize output if not initialized
-    {
-        set_shape_if_empty(output, out_shape);
-
-        if(input1.data_type() == DataType::S16 || input2.data_type() == DataType::S16)
-        {
-            set_format_if_unknown(output, Format::S16);
-        }
-        else if(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16)
-        {
-            set_format_if_unknown(output, Format::F16);
-        }
-        else if(input1.data_type() == DataType::F32 || input2.data_type() == DataType::F32)
-        {
-            set_format_if_unknown(output, Format::F32);
-        }
-    }
-
-    Window win        = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
-    Window win_input1 = win.broadcast_if_dimension_le_one(input1);
-    Window win_input2 = win.broadcast_if_dimension_le_one(input2);
-
-    AccessWindowHorizontal input1_access(&input1, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal input2_access(&input2, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration);
-
-    bool window_changed = update_window_and_padding(win_input1, input1_access)
-                          || update_window_and_padding(win_input2, input2_access)
-                          || update_window_and_padding(win, output_access);
-
-    output_access.set_valid_region(win, valid_region);
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
-} // namespace
-
-CLArithmeticSubtractionKernel::CLArithmeticSubtractionKernel()
-    : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void CLArithmeticSubtractionKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info(), policy));
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(*input1->info(), *input2->info(), *output->info());
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-
-    _input1 = input1;
-    _input2 = input2;
-    _output = output;
-
-    bool has_float_out = is_data_type_float(output->info()->data_type());
-
-    // Setup kernel
-    std::string kernel_name = "arithmetic_sub";
-
-    // Set kernel build options
-    CLBuildOptions build_opts;
-    build_opts.add_option_if_else(policy == ConvertPolicy::WRAP || has_float_out, "-DWRAP", "-DSATURATE");
-    build_opts.add_option("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
-    build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
-    build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
-    if(is_data_type_quantized_asymmetric(input1->info()->data_type()))
-    {
-        build_opts.add_option("-DOFFSET_IN1=" + support::cpp11::to_string(input1->info()->quantization_info().offset));
-        build_opts.add_option("-DOFFSET_IN2=" + support::cpp11::to_string(input2->info()->quantization_info().offset));
-        build_opts.add_option("-DOFFSET_OUT=" + support::cpp11::to_string(output->info()->quantization_info().offset));
-        build_opts.add_option("-DSCALE_IN1=" + support::cpp11::to_string(input1->info()->quantization_info().scale));
-        build_opts.add_option("-DSCALE_IN2=" + support::cpp11::to_string(input2->info()->quantization_info().scale));
-        build_opts.add_option("-DSCALE_OUT=" + support::cpp11::to_string(output->info()->quantization_info().scale));
-        kernel_name += "_quantized";
-    }
-
-    // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
-
-    // Configure kernel window
-    ICLKernel::configure_internal(win_config.second);
-}
-
-Status CLArithmeticSubtractionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
-
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, policy));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);
-
-    return Status{};
-}
-
-void CLArithmeticSubtractionKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    const TensorShape &in_shape1 = _input1->info()->tensor_shape();
-    const TensorShape &in_shape2 = _input2->info()->tensor_shape();
-    const TensorShape &out_shape = _output->info()->tensor_shape();
-
-    // Collapse only if broadcast dimensions is less than 2, or in case of no broadcasting
-    bool can_collapse = true;
-    if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
-    {
-        can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
-        for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
-        {
-            can_collapse = (in_shape1[d] == in_shape2[d]);
-        }
-    }
-
-    bool   has_collapsed = false;
-    Window collapsed     = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
-
-    const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
-    const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
-
-    Window slice        = collapsed.first_slice_window_3D();
-    Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
-    Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
-
-    do
-    {
-        unsigned int idx = 0;
-
-        add_3D_tensor_argument(idx, _input1, slice_input1);
-        add_3D_tensor_argument(idx, _input2, slice_input2);
-        add_3D_tensor_argument(idx, _output, slice);
-
-        enqueue(queue, *this, slice);
-
-        collapsed.slide_window_slice_3D(slice_input1);
-        collapsed.slide_window_slice_3D(slice_input2);
-    }
-    while(collapsed.slide_window_slice_3D(slice));
-}
-
-BorderSize CLArithmeticSubtractionKernel::border_size() const
-{
-    const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
-    const unsigned int border        = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
-    return BorderSize(0, border, 0, 0);
-}
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
index bff28e3..7c30a94 100644
--- a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
+++ b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,6 +42,7 @@
 Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(boxes);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(boxes, DataType::F32, DataType::F16);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(deltas, DataType::F32, DataType::F16);
     ARM_COMPUTE_RETURN_ERROR_ON(deltas->tensor_shape()[1] != boxes->tensor_shape()[1]);
diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
index 53a5456..f232f6c 100644
--- a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
+++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -56,6 +56,7 @@
     if(output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
 
diff --git a/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp b/src/core/CL/kernels/CLComparisonKernel.cpp
similarity index 67%
rename from src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
rename to src/core/CL/kernels/CLComparisonKernel.cpp
index 10d7fd4..f5f5a0f 100644
--- a/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
+++ b/src/core/CL/kernels/CLComparisonKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,49 +21,55 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h"
+#include "arm_compute/core/CL/kernels/CLComparisonKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 
-using namespace arm_compute;
+#include <map>
 
+namespace arm_compute
+{
 namespace
 {
-constexpr unsigned int num_elems_processed_per_iteration = 8;
-
-Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ConvertPolicy policy)
+// Create supported comparisons map
+const std::map<ComparisonOperation, std::string> supported_comparison_ops =
 {
-    ARM_COMPUTE_UNUSED(policy);
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input1);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input2);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
+    { ComparisonOperation::Equal, "EQUAL" },
+    { ComparisonOperation::NotEqual, "NOTEQUAL" },
+    { ComparisonOperation::Greater, "GREATER" },
+    { ComparisonOperation::GreaterEqual, "GREATEREQUAL" },
+    { ComparisonOperation::Less, "LESS" },
+    { ComparisonOperation::LessEqual, "LESSEQUAL" },
+};
 
-    const bool is_qasymm = is_data_type_quantized_asymmetric(input1.data_type()) || is_data_type_quantized_asymmetric(input2.data_type());
-    if(is_qasymm)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2);
-    }
+int calculate_num_elems_processed_per_iteration(const ITensorInfo &input)
+{
+    return 16 / input.element_size();
+}
+
+Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ComparisonOperation operation)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input1);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1,
+                                                         1,
+                                                         DataType::U8, DataType::S8, DataType::QASYMM8,
+                                                         DataType::U16, DataType::S16,
+                                                         DataType::U32, DataType::S32,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2);
+    ARM_COMPUTE_RETURN_ERROR_ON(supported_comparison_ops.count(operation) == 0);
 
     const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
-
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
 
     // Validate in case of configured output
     if(output.total_size() > 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&output);
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG((output.data_type() == DataType::U8) && ((input1.data_type() != DataType::U8) || (input2.data_type() != DataType::U8)),
-                                        "Output can only be U8 if both inputs are U8");
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8);
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
                                         "Wrong shape for output");
-        if(is_qasymm)
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &output);
-        }
     }
 
     return Status{};
@@ -75,23 +81,10 @@
     const TensorShape &out_shape    = broadcast_pair.first;
     const ValidRegion &valid_region = broadcast_pair.second;
 
-    // Auto initialize output if not initialized
-    {
-        set_shape_if_empty(output, out_shape);
+    const unsigned int num_elems_processed_per_iteration = calculate_num_elems_processed_per_iteration(input1);
 
-        if(input1.data_type() == DataType::S16 || input2.data_type() == DataType::S16)
-        {
-            set_format_if_unknown(output, Format::S16);
-        }
-        else if(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16)
-        {
-            set_format_if_unknown(output, Format::F16);
-        }
-        else if(input1.data_type() == DataType::F32 || input2.data_type() == DataType::F32)
-        {
-            set_format_if_unknown(output, Format::F32);
-        }
-    }
+    // Auto initialize output if not initialized
+    auto_init_if_empty(output, out_shape, 1, DataType::U8, QuantizationInfo());
 
     Window win        = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
     Window win_input1 = win.broadcast_if_dimension_le_one(input1);
@@ -112,15 +105,15 @@
 }
 } // namespace
 
-CLArithmeticAdditionKernel::CLArithmeticAdditionKernel()
+CLComparisonKernel::CLComparisonKernel()
     : _input1(nullptr), _input2(nullptr), _output(nullptr)
 {
 }
 
-void CLArithmeticAdditionKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
+void CLComparisonKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info(), policy));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info(), operation));
 
     // Configure kernel window
     auto win_config = validate_and_configure_window(*input1->info(), *input2->info(), *output->info());
@@ -130,25 +123,21 @@
     _input2 = input2;
     _output = output;
 
-    const bool has_float_out = is_data_type_float(output->info()->data_type());
-
-    std::string kernel_name = "arithmetic_add";
+    const std::string &operation_name = supported_comparison_ops.at(operation);
+    std::string        kernel_name    = "compare_" + lower_string(operation_name);
 
     // Set kernel build options
     std::set<std::string> build_opts;
-    build_opts.emplace((policy == ConvertPolicy::WRAP || has_float_out) ? "-DWRAP" : "-DSATURATE");
-    build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
-    build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
-    build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
-    build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type()));
+    build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(calculate_num_elems_processed_per_iteration(*input1->info())));
+    build_opts.emplace("-DOP=" + operation_name);
+    build_opts.emplace("-DOP_NAME=" + lower_string(operation_name));
     if(is_data_type_quantized_asymmetric(input1->info()->data_type()))
     {
         build_opts.emplace("-DOFFSET_IN1=" + support::cpp11::to_string(input1->info()->quantization_info().offset));
         build_opts.emplace("-DOFFSET_IN2=" + support::cpp11::to_string(input2->info()->quantization_info().offset));
-        build_opts.emplace("-DOFFSET_OUT=" + support::cpp11::to_string(output->info()->quantization_info().offset));
-        build_opts.emplace("-DSCALE_IN1=" + support::cpp11::to_string(input1->info()->quantization_info().scale));
-        build_opts.emplace("-DSCALE_IN2=" + support::cpp11::to_string(input2->info()->quantization_info().scale));
-        build_opts.emplace("-DSCALE_OUT=" + support::cpp11::to_string(output->info()->quantization_info().scale));
+        build_opts.emplace("-DSCALE_IN1=" + float_to_string_with_full_precision(input1->info()->quantization_info().scale));
+        build_opts.emplace("-DSCALE_IN2=" + float_to_string_with_full_precision(input2->info()->quantization_info().scale));
         kernel_name += "_quantized";
     }
 
@@ -165,21 +154,20 @@
     _config_id += support::cpp11::to_string(output->info()->dimension(0));
     _config_id += "_";
     _config_id += support::cpp11::to_string(output->info()->dimension(1));
-    _config_id += (policy == ConvertPolicy::WRAP) ? "_wrap_" : "_saturate_";
     _config_id += lower_string(string_from_data_layout(input1->info()->data_layout()));
 }
 
-Status CLArithmeticAdditionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+Status CLComparisonKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
 
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, policy));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, operation));
     ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);
 
     return Status{};
 }
 
-void CLArithmeticAdditionKernel::run(const Window &window, cl::CommandQueue &queue)
+void CLComparisonKernel::run(const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
@@ -225,9 +213,12 @@
     while(collapsed.slide_window_slice_3D(slice));
 }
 
-BorderSize CLArithmeticAdditionKernel::border_size() const
+BorderSize CLComparisonKernel::border_size() const
 {
+    const int num_elems_processed_per_iteration = calculate_num_elems_processed_per_iteration(*_input1->info());
+
     const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
     const unsigned int border        = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
     return BorderSize(0, border, 0, 0);
 }
+} // namespace arm_compute
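The renamed kernel composes its OpenCL program name from the comparison-operation map plus an optional "_quantized" suffix. A minimal standard-C++ sketch of that naming, with a stand-in enum for arm_compute::ComparisonOperation, is shown below; it is an illustration, not library code.

#include <algorithm>
#include <cctype>
#include <iostream>
#include <map>
#include <string>

// Hypothetical stand-in for the ComparisonOperation enum used above.
enum class Comparison { Equal, NotEqual, Greater, GreaterEqual, Less, LessEqual };

// Reproduces the composition "compare_" + lowercase(operation) + optional "_quantized".
std::string comparison_kernel_name(Comparison op, bool is_quantized)
{
    static const std::map<Comparison, std::string> names =
    {
        { Comparison::Equal, "EQUAL" }, { Comparison::NotEqual, "NOTEQUAL" },
        { Comparison::Greater, "GREATER" }, { Comparison::GreaterEqual, "GREATEREQUAL" },
        { Comparison::Less, "LESS" }, { Comparison::LessEqual, "LESSEQUAL" },
    };
    std::string lower = names.at(op);
    std::transform(lower.begin(), lower.end(), lower.begin(), [](unsigned char c) { return std::tolower(c); });
    return "compare_" + lower + (is_quantized ? "_quantized" : "");
}

int main()
{
    std::cout << comparison_kernel_name(Comparison::GreaterEqual, true) << "\n"; // compare_greaterequal_quantized
    return 0;
}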
diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
index dd7d790..70337be 100644
--- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
+++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
@@ -42,9 +43,10 @@
                                                     const PadStrideInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
 
     const DataLayout data_layout = input->data_layout();
 
diff --git a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
index 4002394..3fccc04 100644
--- a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
@@ -113,6 +113,13 @@
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(input->info()->data_type()));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    if(is_data_type_quantized_asymmetric(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info())
+    {
+        build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(input->info()->quantization_info().offset));
+        build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().offset));
+        build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(input->info()->quantization_info().scale));
+        build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().scale));
+    }
 
     // Create kernel
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_depth", build_opts.options()));
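The OFFSET/SCALE build options added above are only emitted when the input and output quantization info differ, which suggests the concatenation kernel requantizes values on the fly. A hedged scalar sketch of the affine requantization this implies (parameters are illustrative, not taken from the kernel source):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Assumption: a standard dequantize / requantize / saturate-to-U8 round trip.
std::uint8_t requantize(std::uint8_t v, int offset_in, float scale_in, int offset_out, float scale_out)
{
    const float real = (static_cast<int>(v) - offset_in) * scale_in;                  // dequantize
    const int   q    = static_cast<int>(std::lround(real / scale_out)) + offset_out;  // requantize
    return static_cast<std::uint8_t>(std::min(255, std::max(0, q)));                  // saturate to U8
}

int main()
{
    std::printf("%u\n", requantize(200, 10, 0.5f, 0, 0.25f)); // (200-10)*0.5/0.25 = 380 -> saturates to 255
    return 0;
}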
diff --git a/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp b/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp
index ffbd295..e188ee9 100644
--- a/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,8 +37,8 @@
 #include <set>
 #include <string>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
@@ -46,42 +46,20 @@
     ARM_COMPUTE_UNUSED(policy);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON(input == output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16,
-                                                         DataType::U16, DataType::U32, DataType::S32,
-                                                         DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16,
-                                                         DataType::U16, DataType::U32, DataType::S32,
-                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input,
+                                                         1,
+                                                         DataType::U8, DataType::S8, DataType::S16,
+                                                         DataType::U16, DataType::U32, DataType::S32, DataType::F16,
+                                                         DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output,
+                                                         1,
+                                                         DataType::U8, DataType::S8, DataType::S16,
+                                                         DataType::U16, DataType::U32, DataType::S32, DataType::F16,
+                                                         DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == output->data_type(), "Input and output data types must be different");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_float(input->data_type()) && shift != 0, "Shift is used only with integer inputs");
     ARM_COMPUTE_RETURN_ERROR_ON(shift >= 8);
 
-    // Check if convertion is supported
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::U8 && (output->data_type() != DataType::U16 && output->data_type() != DataType::S16
-                                                                           && output->data_type() != DataType::U32 && output->data_type() != DataType::S32),
-                                    "Only data types supported [in] U8 -> [out] U16, S16, U32, S32");
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::U16 && (output->data_type() != DataType::U8 && output->data_type() != DataType::U32
-                                                                            && output->data_type() != DataType::S32),
-                                    "Only data types supported [in] U16 ->  [out] U8, U32, S32");
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::S16 && (output->data_type() != DataType::U8 && output->data_type() != DataType::U32
-                                                                            && output->data_type() != DataType::S32),
-                                    "Only data types supported [in] S16 ->  [out] U8, U32, S32");
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::U32 && (output->data_type() != DataType::U8 && output->data_type() != DataType::U16
-                                                                            && output->data_type() != DataType::S16),
-                                    "Only data types supported [in] U32 ->  [out] U8, U16, S16");
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::S32 && (output->data_type() != DataType::U8 && output->data_type() != DataType::U16
-                                                                            && output->data_type() != DataType::S16),
-                                    "Only data types supported [in] S32 ->  [out] U8, U16, S16");
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F16 && output->data_type() != DataType::F32,
-                                    "Only data types supported [in] F16 ->  [out] F32");
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F32 && output->data_type() != DataType::F16,
-                                    "Only data types supported [in] F32 ->  [out] F16");
-
     // Validate in case of configured output
     if(output->total_size() > 0)
     {
@@ -105,25 +83,33 @@
     const size_t input_size  = data_size_from_type(input->info()->data_type());
     const size_t output_size = data_size_from_type(output->info()->data_type());
 
+    // Get number of elements to process per iterations
+    const unsigned int num_elems_processed_per_iteration = 16;
+
     // Set build options
     CLBuildOptions build_opts;
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
     build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
     build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
-    // Down conversions from float always SATURATE as out-of-bounds conversion from float->integer is implementation defined
-    build_opts.add_option_if(input_size > output_size, ((policy == ConvertPolicy::WRAP) && !is_data_type_float(input->info()->data_type())) ? "-DWRAP" : "-DSATURATE");
-    build_opts.add_option_if(is_data_type_float(input->info()->data_type()), "-DIS_DATA_TYPE_FLOAT");
+    // Conversions from float always SATURATE as out-of-bounds conversion from float->integer is implementation defined
+    build_opts.add_option_if(is_data_type_float(input->info()->data_type()) || policy == ConvertPolicy::SATURATE, "-DSATURATE");
+    build_opts.add_option_if(is_data_type_float(input->info()->data_type()) || is_data_type_float(output->info()->data_type()), "-DIS_DATA_TYPE_FLOAT");
 
     // Create kernel
-    const std::string kernel_name = (input_size > output_size) ? "convert_depth_down" : "convert_depth_up";
+    const std::string kernel_name = (input_size >= output_size) ? "convert_depth_down" : "convert_depth_up";
     _kernel                       = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
     // Set shift arg
-    unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
+    unsigned int idx = 2 * num_arguments_per_3D_tensor(); //Skip the input and output parameters
     _kernel.setArg(idx++, shift);
 
     // Configure kernel
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-    ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration);
+    ICLSimple3DKernel::configure(input, output, num_elems_processed_per_iteration);
+
+    // Collapse window
+    const Window &full_window      = window();
+    Window        collapsed_window = full_window.collapse_if_possible(full_window, Window::DimZ);
+    ICLKernel::configure_internal(collapsed_window);
 }
 
 Status CLDepthConvertLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
@@ -132,3 +118,4 @@
 
     return Status{};
 }
+} // namespace arm_compute
\ No newline at end of file
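Kernel selection now compares element sizes with >=, so same-size conversions (for example U8 to S8) also map to the "down" variant. A small standard-C++ sketch of that selection, for illustration only:

#include <cstddef>
#include <iostream>
#include <string>

// Mirrors the selection above: equal element sizes pick the "down" conversion kernel.
std::string depth_convert_kernel(std::size_t input_size, std::size_t output_size)
{
    return (input_size >= output_size) ? "convert_depth_down" : "convert_depth_up";
}

int main()
{
    std::cout << depth_convert_kernel(1, 1) << "\n"; // e.g. U8 -> S8  : convert_depth_down
    std::cout << depth_convert_kernel(2, 4) << "\n"; // e.g. S16 -> S32: convert_depth_up
    return 0;
}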
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
index 1fce14f..5e5a35c 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,9 +37,8 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 
-using namespace arm_compute;
-using namespace arm_compute::misc::shape_calculator;
-
+namespace arm_compute
+{
 namespace
 {
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
@@ -54,9 +53,24 @@
                                     "For QASYMM8 only logistic, relu, lower bounded relu and lower-upper bounded relu are supported"); //COMPMID-1317 add fused activation for F32
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
     ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier > 1); // COMPMID-1071 Add depth multiplier support for NHWC
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) != 3 || weights->dimension(2) != 3);
 
-    const bool is_qasymm = is_data_type_quantized_asymmetric(input->data_type());
+    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().first < 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(std::max(conv_info.pad_top(), conv_info.pad_bottom()) > 1);
+
+    const bool   is_qasymm      = is_data_type_quantized_asymmetric(input->data_type());
+    const size_t weights_width  = 3;
+    const size_t weights_height = 3;
+
+    if(is_qasymm)
+    {
+        DepthwiseConvolutionReshapeInfo info;
+        info.c0 = 4;
+        ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(0) / info.c0) != weights_width * weights_height);
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(1) != weights_width) || (weights->dimension(2) != weights_height));
+    }
 
     if(biases != nullptr)
     {
@@ -66,15 +80,16 @@
         }
         else
         {
+            ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0));
             ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
         }
-        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0));
+
         ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
     }
 
     if(output->total_size() != 0)
     {
-        const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+        const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, weights_width, weights_height, conv_info, depth_multiplier);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
     }
 
@@ -82,10 +97,13 @@
 }
 
 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *output,
-                                                        const PadStrideInfo &conv_info)
+                                                        const PadStrideInfo &conv_info, unsigned int depth_multiplier)
 {
+    const size_t weights_width  = 3;
+    const size_t weights_height = 3;
+
     // Get convolved dimensions
-    const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, 1 /* depth_multiplier */);
+    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, weights_width, weights_height, conv_info, depth_multiplier);
 
     // Output auto inizialitation if not yet initialized
     auto_init_if_empty(*output,
@@ -110,10 +128,19 @@
 
     AccessWindowStatic input_access(input, 0, -border_size.top, ceil_to_multiple(input->dimension(0), num_elems_accessed_per_iteration),
                                     ceil_to_multiple(input->dimension(1) + border_size.bottom, num_rows_read_per_iteration));
-    AccessWindowRectangle  output_access(output, 0, 0, num_elems_accessed_per_iteration, num_rows_written_per_iteration);
-    AccessWindowHorizontal weights_access(weights, 0, num_elems_accessed_per_iteration);
+    AccessWindowRectangle output_access(output, 0, 0, num_elems_accessed_per_iteration, num_rows_written_per_iteration);
 
-    bool window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
+    bool window_changed = false;
+
+    if(is_qasymm)
+    {
+        window_changed = update_window_and_padding(win, input_access, output_access);
+    }
+    else
+    {
+        AccessWindowStatic    weights_access(weights, 0, 0, ceil_to_multiple(weights->dimension(0), num_elems_accessed_per_iteration), weights->dimension(1));
+        window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
+    }
 
     if(bias != nullptr)
     {
@@ -142,22 +169,9 @@
                                                          unsigned int depth_multiplier, ActivationLayerInfo act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-
-    // Get convolved dimensions
-    const TensorShape output_shape = compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
-
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(),
-                       output_shape,
-                       1,
-                       input->info()->data_type(),
-                       input->info()->quantization_info());
-
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info));
-
-    const unsigned int conv_stride_x = conv_info.stride().first;
-    ARM_COMPUTE_ERROR_ON(conv_stride_x < 1 || conv_stride_x > 2);
-    ARM_COMPUTE_ERROR_ON(std::max(conv_info.pad_top(), conv_info.pad_bottom()) > 1);
+    auto win_config = validate_and_configure_window(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
 
     const bool is_qasymm         = is_data_type_quantized_asymmetric(input->info()->data_type());
     const bool is_stride_1       = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
@@ -243,7 +257,7 @@
     }
     else
     {
-        build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(conv_stride_x));
+        build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(conv_info.stride().first));
         build_opts.add_option("-DCONV_STRIDE_Y=" + support::cpp11::to_string(_conv_stride_y));
     }
     build_opts.add_option_if(_input->info()->tensor_shape().total_size_upper(3) > 1,
@@ -252,13 +266,8 @@
     // Create kernel
     std::string kernel_name = std::string("depthwise_convolution_3x3") + (is_qasymm ? std::string("_quantized") + ((is_dot8_supported
                                                                                                                     && is_stride_1) ? "_dot8" : "") : "") + "_nhwc" + (is_stride_1 ? "_stride1" : "");
-
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), conv_info);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure_internal(win_config.second);
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
     // Set config_id for enabling LWS tuning
     _config_id = kernel_name;
@@ -283,7 +292,7 @@
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, act_info));
     ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(),
                                                               biases != nullptr ? biases->clone().get() : nullptr,
-                                                              output->clone().get(), conv_info)
+                                                              output->clone().get(), conv_info, depth_multiplier)
                                 .first);
 
     return Status{};
@@ -297,6 +306,7 @@
     // Collapse window
     Window       window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
     const size_t total_batches    = _input->info()->tensor_shape().total_size_upper(3);
+    const bool   is_qasymm        = is_data_type_quantized_asymmetric(_input->info()->data_type());
 
     Window win = window_collapsed;
     win.set(Window::DimZ, Window::Dimension(0, std::ceil(_output->info()->dimension(2) / static_cast<float>(_num_planes_processed_per_iteration)) * total_batches, 1));
@@ -311,7 +321,7 @@
     Window slice_in  = win_in.first_slice_window_4D();
     Window slice_out = win.first_slice_window_4D();
 
-    unsigned int idx = 2 * num_arguments_per_4D_tensor() + num_arguments_per_3D_tensor();
+    unsigned int idx = 2 * num_arguments_per_4D_tensor() + (is_qasymm ? num_arguments_per_2D_tensor() : num_arguments_per_3D_tensor());
 
     if(_biases != nullptr)
     {
@@ -330,9 +340,16 @@
         unsigned int idx = 0;
         add_4D_tensor_argument(idx, _input, slice_in);
         add_4D_tensor_argument(idx, _output, slice_out);
-        add_3D_tensor_argument(idx, _weights, slice_out);
-
+        if(is_qasymm)
+        {
+            add_2D_tensor_argument(idx, _weights, slice_out);
+        }
+        else
+        {
+            add_3D_tensor_argument(idx, _weights, slice_out);
+        }
         enqueue(queue, *this, slice_out, lws_hint());
     }
     while(win.slide_window_slice_4D(slice_out) && win_in.slide_window_slice_4D(slice_in));
 }
+} // namespace arm_compute
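For the quantized NHWC path the weights are now expected pre-reshaped with c0 = 4, so validation divides dimension(0) by c0 and compares it with 3x3. A minimal sketch of that check (values in main() are made up):

#include <cstdio>

// Hedged sketch: reshaped 3x3 weights packed c0 (= 4) channels at a time,
// so dim0 / c0 must equal 9.
bool qasymm_weights_shape_ok(unsigned int weights_dim0, unsigned int c0 = 4U)
{
    const unsigned int weights_width  = 3U;
    const unsigned int weights_height = 3U;
    return (weights_dim0 / c0) == weights_width * weights_height;
}

int main()
{
    std::printf("%d\n", qasymm_weights_shape_ok(36U)); // 36 / 4 == 9 -> accepted
    std::printf("%d\n", qasymm_weights_shape_ok(32U)); // 32 / 4 == 8 -> rejected
    return 0;
}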
diff --git a/src/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel.cpp
similarity index 85%
rename from src/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.cpp
rename to src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel.cpp
index 683dda8..b73ccf5 100644
--- a/src/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.h"
+#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
@@ -49,6 +49,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input->data_type()) && (biases != nullptr));
     ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_c) != output->dimension(1));
     ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != (input->dimension(idx_w) * input->dimension(idx_h) + ((biases != nullptr) ? 1 : 0)));
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
 
     if(biases != nullptr)
     {
@@ -61,12 +62,12 @@
 }
 } // namespace
 
-CLDepthwiseWeightsReshapeKernel::CLDepthwiseWeightsReshapeKernel()
+CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel::CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel()
     : _input(nullptr), _biases(nullptr), _output(nullptr)
 {
 }
 
-void CLDepthwiseWeightsReshapeKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *biases)
+void CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *biases)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), (biases != nullptr) ? biases->info() : nullptr));
@@ -88,23 +89,23 @@
         build_opts.emplace("-DHAS_BIAS");
     }
 
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("depthwise_weights_reshape", build_opts));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("depthwise_convolution_reshape_weights_generic", build_opts));
 
     // Configure  kernel window
     Window win = calculate_max_window(*input->info(), Steps());
-    // The CLDepthwiseWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped
+    // The CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel doesn't need padding so update_window_and_padding() can be skipped
     output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
 
     ICLKernel::configure_internal(win);
 }
 
-Status CLDepthwiseWeightsReshapeKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *biases)
+Status CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *biases)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, biases));
     return Status{};
 }
 
-void CLDepthwiseWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
+void CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel::run(const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp
new file mode 100644
index 0000000..6b6438a
--- /dev/null
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const DepthwiseConvolutionReshapeInfo &info)
+{
+    const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
+    ARM_COMPUTE_RETURN_ERROR_ON(info.c0 != 4);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_h) != 3);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_w) != 3);
+
+    if(output->total_size() != 0)
+    {
+        auto reshaped_weights_shape = arm_compute::misc::shape_calculator::compute_reshaped_depthwise_weights_shape(*input, info);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), reshaped_weights_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const DepthwiseConvolutionReshapeInfo &info)
+{
+    auto reshaped_input_shape = arm_compute::misc::shape_calculator::compute_reshaped_depthwise_weights_shape(*input, info);
+    auto_init_if_empty(*output, reshaped_input_shape, 1, input->data_type(), input->quantization_info());
+
+    Window                 win = calculate_max_window(*input, Steps(info.c0));
+    AccessWindowHorizontal weights_access(input, 0, info.c0);
+    const bool             window_changed = update_window_and_padding(win, weights_access);
+
+    output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+CLDepthwiseConvolutionLayerReshapeWeightsKernel::CLDepthwiseConvolutionLayerReshapeWeightsKernel()
+    : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLDepthwiseConvolutionLayerReshapeWeightsKernel::configure(const ICLTensor *input, ICLTensor *output, const DepthwiseConvolutionReshapeInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), info));
+    auto win_config = validate_and_configure_window(input->info(), output->info(), info);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
+    ICLKernel::configure_internal(win_config.second);
+
+    _input  = input;
+    _output = output;
+
+    // Build the kernel
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(_input->info()->data_type()));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(info.c0));
+    build_opts.add_option("-DDST_WIDTH=" + support::cpp11::to_string(_output->info()->dimension(0)));
+    build_opts.add_option_if(info.transpose, "-DTRANSPOSE");
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("depthwise_convolution_reshape_weights", build_opts.options()));
+}
+
+Status CLDepthwiseConvolutionLayerReshapeWeightsKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const DepthwiseConvolutionReshapeInfo &info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), info).first);
+    return Status{};
+}
+
+void CLDepthwiseConvolutionLayerReshapeWeightsKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    unsigned int idx = 0;
+    add_3D_tensor_argument(idx, _input, window);
+    add_2D_tensor_argument(idx, _output, window);
+    enqueue(queue, *this, window);
+}
+} // namespace arm_compute
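The new reshape kernel steps its window by info.c0 and declares a horizontal access of c0 elements, so the source width is effectively handled in chunks rounded up to a multiple of c0. A small sketch of that ceil-to-multiple relationship, for illustration only:

#include <cstdio>

// Rounds value up to the next multiple, as implied by stepping the window by c0.
unsigned int ceil_to_multiple(unsigned int value, unsigned int multiple)
{
    return ((value + multiple - 1U) / multiple) * multiple;
}

int main()
{
    std::printf("%u\n", ceil_to_multiple(10U, 4U)); // 12: a width of 10 is covered by three chunks of c0 = 4
    return 0;
}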
diff --git a/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp b/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
index d5c333a..56e9db5 100644
--- a/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -56,6 +56,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input->data_type()) && has_bias);
     ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(idx_c) * depth_multiplier) != output->dimension(2));
     ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != (kernel_dims.width * kernel_dims.height + ((has_bias) ? 1 : 0)));
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
 
     return Status{};
 }
diff --git a/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp b/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp
index cdc27e8..2dad729 100644
--- a/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -49,6 +49,7 @@
         TensorShape output_shape = compute_vector_to_tensor_output_shape(input->tensor_shape(), conv_w, conv_h, output->data_layout());
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
     }
 
     return Status{};
diff --git a/src/core/CL/kernels/CLDirectConvolutionOutputStageKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionOutputStageKernel.cpp
index 5f4dacb..22149b4 100644
--- a/src/core/CL/kernels/CLDirectConvolutionOutputStageKernel.cpp
+++ b/src/core/CL/kernels/CLDirectConvolutionOutputStageKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,7 +42,7 @@
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32, DataType::F16,
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32, DataType::F16,
                                                          DataType::F32);
 
     if(bias != nullptr)
diff --git a/src/core/CL/kernels/CLElementWiseUnaryLayerKernel.cpp b/src/core/CL/kernels/CLElementWiseUnaryLayerKernel.cpp
new file mode 100644
index 0000000..be3c7e2
--- /dev/null
+++ b/src/core/CL/kernels/CLElementWiseUnaryLayerKernel.cpp
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo &input, const ITensorInfo &output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::F16, DataType::F32);
+
+    // Validate in case of configured output
+    if(output.total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&output);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::F16, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input, &output);
+    }
+
+    return Status{};
+}
+} // namespace
+
+void CLElementWiseUnaryLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const ElementWiseUnary &op)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input->info(), *output->info()));
+
+    // Configure kernel window
+    _input  = input;
+    _output = output;
+
+    const std::string kernel_name    = "elementwise_unary";
+    const int         vec_size_x     = 16 / output->info()->element_size();
+    const int         output_width_x = output->info()->tensor_shape().x();
+    const bool        multi_access_x = (output_width_x / vec_size_x > 0);
+
+    Window win = calculate_max_window(*output->info());
+    if(multi_access_x)
+    {
+        win.set(Window::DimX,
+                Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
+    }
+    ICLKernel::configure_internal(win);
+
+    // Set kernel build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+    build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
+    switch(op)
+    {
+        case ElementWiseUnary::RSQRT:
+            build_opts.add_option("-DOPERATION=inverse_sqrt");
+            break;
+        case ElementWiseUnary::EXP:
+            build_opts.add_option("-DOPERATION=exponential");
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Not implemented");
+    }
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+}
+
+Status CLElementWiseUnaryLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ElementWiseUnary &op)
+{
+    ARM_COMPUTE_UNUSED(op);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *output));
+
+    return Status{};
+}
+
+void CLElementWiseUnaryLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimX);
+
+    do
+    {
+        unsigned int idx = 0;
+        add_1D_tensor_argument(idx, _input, collapsed);
+        add_1D_tensor_argument(idx, _output, collapsed);
+        enqueue(queue, *this, collapsed);
+    }
+    while(window.slide_window_slice_1D(collapsed));
+}
\ No newline at end of file
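The unary kernel only enables vectorized access when at least one full vector fits in the output row, and clamps the final vector load so it stays inside the row. A hedged standard-C++ sketch of that decision (inputs in main() are illustrative):

#include <algorithm>
#include <cstdio>

// vec_size = 16 / element_size; multi-element access only when a full vector fits;
// the last access start is clamped to row_width - vec_size (never negative).
void unary_vectorization(int row_width, int element_size)
{
    const int  vec_size       = 16 / element_size;
    const bool multi_access_x = (row_width / vec_size) > 0;
    const int  last_accessed  = std::max(row_width - vec_size, 0);
    std::printf("vec_size=%d multi_access=%d LAST_ACCESSED_X=%d\n", vec_size, multi_access_x, last_accessed);
}

int main()
{
    unary_vectorization(35, 4); // F32 row of 35: vec_size 4, last vector access starts at 31
    unary_vectorization(3, 4);  // too narrow for a full float4 vector
    return 0;
}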
diff --git a/src/core/CL/kernels/CLElementwiseOperationKernel.cpp b/src/core/CL/kernels/CLElementwiseOperationKernel.cpp
new file mode 100644
index 0000000..37eeeb7
--- /dev/null
+++ b/src/core/CL/kernels/CLElementwiseOperationKernel.cpp
@@ -0,0 +1,399 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include <map>
+
+namespace arm_compute
+{
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+std::map<ArithmeticOperation, std::string> supported_arithmetic_ops =
+{
+    { ArithmeticOperation::ADD, "ADD" },
+    { ArithmeticOperation::SUB, "SUB" },
+    { ArithmeticOperation::DIV, "DIV" },
+    { ArithmeticOperation::SQUARED_DIFF, "SQUARED_DIFF" },
+    { ArithmeticOperation::MIN, "MIN" },
+    { ArithmeticOperation::MAX, "MAX" },
+};
+
+std::map<ArithmeticOperation, std::string> supported_sat_arithmetic_ops =
+{
+    { ArithmeticOperation::ADD, "ADD" },
+    { ArithmeticOperation::SUB, "SUB" },
+};
+
+std::string generate_id_for_tuning_common(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output)
+{
+    std::string config_id;
+    // Set config_id for enabling LWS tuning
+    config_id = kernel_name;
+    config_id += "_";
+    config_id += lower_string(string_from_data_type(input1.data_type()));
+    config_id += "_";
+    config_id += support::cpp11::to_string(output.dimension(0));
+    config_id += "_";
+    config_id += support::cpp11::to_string(output.dimension(1));
+    return config_id;
+}
+
+Status validate_arguments_with_division_rules(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(&input1, &input2, &output);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input1);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2);
+
+    const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+
+    // Validate in case of configured output
+    if(output.total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::F16, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
+                                        "Wrong shape for output");
+    }
+
+    return Status{};
+}
+
+Status validate_arguments_with_arithmetic_rules(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input1);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input2);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
+
+    const bool is_qasymm = is_data_type_quantized_asymmetric(input1.data_type()) || is_data_type_quantized_asymmetric(input2.data_type());
+    if(is_qasymm)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2);
+    }
+
+    const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+
+    // Validate in case of configured output
+    if(output.total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&output);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG((output.data_type() == DataType::U8) && ((input1.data_type() != DataType::U8) || (input2.data_type() != DataType::U8)),
+                                        "Output can only be U8 if both inputs are U8");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
+                                        "Wrong shape for output");
+        if(is_qasymm)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &output);
+        }
+    }
+    return Status{};
+}
+
+CLBuildOptions generate_build_options_with_arithmetic_rules(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, const std::string &operation_string)
+{
+    CLBuildOptions build_opts;
+
+    build_opts.add_option("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1.data_type()));
+    build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2.data_type()));
+    build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output.data_type()));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DOP=" + operation_string);
+    if(is_data_type_quantized_asymmetric(input1.data_type()))
+    {
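+        // Pass the quantization offsets and scales of both inputs and the output so the quantized kernel variant can map values to and from the quantized domain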
+        build_opts.add_option("-DOFFSET_IN1=" + support::cpp11::to_string(input1.quantization_info().offset));
+        build_opts.add_option("-DOFFSET_IN2=" + support::cpp11::to_string(input2.quantization_info().offset));
+        build_opts.add_option("-DOFFSET_OUT=" + support::cpp11::to_string(output.quantization_info().offset));
+        build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(input1.quantization_info().scale));
+        build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(input2.quantization_info().scale));
+        build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(output.quantization_info().scale));
+    }
+    return build_opts;
+}
+
+std::pair<Status, Window> configure_window_arithmetic_common(const ValidRegion &valid_region, ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
+{
+    Window win        = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+    Window win_input1 = win.broadcast_if_dimension_le_one(input1);
+    Window win_input2 = win.broadcast_if_dimension_le_one(input2);
+
+    AccessWindowHorizontal input1_access(&input1, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal input2_access(&input2, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win_input1, input1_access)
+                          || update_window_and_padding(win_input2, input2_access)
+                          || update_window_and_padding(win, output_access);
+
+    output_access.set_valid_region(win, valid_region);
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+
+std::pair<Status, Window> validate_and_configure_window_for_arithmetic_operators(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
+{
+    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2);
+    const TensorShape &out_shape    = broadcast_pair.first;
+    const ValidRegion &valid_region = broadcast_pair.second;
+
+    set_shape_if_empty(output, out_shape);
+
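+    // Infer the output format from the input data types when it has not been set yet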
+    if(input1.data_type() == DataType::S16 || input2.data_type() == DataType::S16)
+    {
+        set_format_if_unknown(output, Format::S16);
+    }
+    else if(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16)
+    {
+        set_format_if_unknown(output, Format::F16);
+    }
+    else if(input1.data_type() == DataType::F32 || input2.data_type() == DataType::F32)
+    {
+        set_format_if_unknown(output, Format::F32);
+    }
+
+    return configure_window_arithmetic_common(valid_region, input1, input2, output);
+}
+
+std::pair<Status, Window> validate_and_configure_window_for_division(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
+{
+    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2);
+    const TensorShape &out_shape    = broadcast_pair.first;
+    const ValidRegion &valid_region = broadcast_pair.second;
+    auto_init_if_empty(output, out_shape, 1, input1.data_type());
+    return configure_window_arithmetic_common(valid_region, input1, input2, output);
+}
+} // namespace
+
+CLElementwiseOperationKernel::CLElementwiseOperationKernel()
+    : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void CLElementwiseOperationKernel::configure_common(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info()));
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(*input1->info(), *input2->info(), *output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
+    _input1 = input1;
+    _input2 = input2;
+    _output = output;
+
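+    // The OpenCL kernel name is built from the operation name, with a "_quantized" suffix for QASYMM8 inputs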
+    std::string kernel_name = "elementwise_operation_" + name();
+    if(is_data_type_quantized_asymmetric(input1->info()->data_type()))
+    {
+        kernel_name += "_quantized";
+    }
+
+    // Set kernel build options
+    CLBuildOptions build_opts = generate_build_options(*input1->info(), *input2->info(), *output->info());
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+    ICLKernel::configure_internal(win_config.second);
+
+    _config_id = generate_id_for_tuning(kernel_name, *input1->info(), *output->info());
+}
+
+void CLElementwiseOperationKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const TensorShape &in_shape1 = _input1->info()->tensor_shape();
+    const TensorShape &in_shape2 = _input2->info()->tensor_shape();
+    const TensorShape &out_shape = _output->info()->tensor_shape();
+
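+    // The window can be collapsed beyond DimZ only when one input is a scalar or vector, or when both inputs have matching dimensions from DimZ upwards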
+    bool       can_collapse = true;
+    const bool is_vector    = in_shape1.num_dimensions() == 1 || in_shape2.num_dimensions() == 1;
+    if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector)
+    {
+        can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+        for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+        {
+            can_collapse = (in_shape1[d] == in_shape2[d]);
+        }
+    }
+
+    bool   has_collapsed = false;
+    Window collapsed     = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
+
+    const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+    const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+    Window slice        = collapsed.first_slice_window_3D();
+    Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+    Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+    do
+    {
+        unsigned int idx = 0;
+
+        add_3D_tensor_argument(idx, _input1, slice_input1);
+        add_3D_tensor_argument(idx, _input2, slice_input2);
+        add_3D_tensor_argument(idx, _output, slice);
+
+        enqueue(queue, *this, slice, lws_hint());
+
+        collapsed.slide_window_slice_3D(slice_input1);
+        collapsed.slide_window_slice_3D(slice_input2);
+    }
+    while(collapsed.slide_window_slice_3D(slice));
+}
+
+BorderSize CLElementwiseOperationKernel::border_size() const
+{
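+    // A broadcast input along X is narrower than the output; request a right border of up to one vector width so its replicated values cover the extra accesses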
+    const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+    const unsigned int border        = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
+    return BorderSize(0, border, 0, 0);
+}
+
+/** Arithmetic operations with saturation */
+
+void CLSaturatedArithmeticOperationKernel::configure(ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ConvertPolicy &policy)
+{
+    _policy = policy;
+    _op     = op;
+    configure_common(input1, input2, output);
+}
+
+Status CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ConvertPolicy &policy)
+{
+    ARM_COMPUTE_UNUSED(op, policy);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*input1, *input2, *output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_arithmetic_operators(*input1->clone(), *input2->clone(), *output->clone()).first);
+
+    return Status{};
+}
+
+std::pair<Status, Window> CLSaturatedArithmeticOperationKernel::validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
+{
+    return validate_and_configure_window_for_arithmetic_operators(input1, input2, output);
+}
+
+Status CLSaturatedArithmeticOperationKernel::validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
+{
+    return validate_arguments_with_arithmetic_rules(input1, input2, output);
+}
+
+CLBuildOptions CLSaturatedArithmeticOperationKernel::generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
+{
+    const bool has_float_out = is_data_type_float(output.data_type());
+    auto       build_options = generate_build_options_with_arithmetic_rules(input1, input2, output, name());
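+    // Saturation only applies to integer outputs; floating-point outputs always take the wrap path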
+    build_options.add_option((_policy == ConvertPolicy::WRAP || has_float_out) ? "-DWRAP" : "-DSATURATE");
+    return build_options;
+}
+
+std::string CLSaturatedArithmeticOperationKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output)
+{
+    auto config_id = generate_id_for_tuning_common(kernel_name, input1, output);
+    config_id += (_policy == ConvertPolicy::WRAP) ? "_wrap_" : "_saturate_";
+    config_id += lower_string(string_from_data_layout(input1.data_layout()));
+    return config_id;
+}
+
+std::string CLSaturatedArithmeticOperationKernel::name()
+{
+    return supported_sat_arithmetic_ops[_op];
+}
+
+/** Arithmetic operations */
+
+void CLArithmeticOperationKernel::configure(ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+    _op = op;
+    configure_common(input1, input2, output);
+}
+
+Status CLArithmeticOperationKernel::validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
+    if(op == ArithmeticOperation::DIV)
+    {
+        // Division doesn't support integer arithmetic
+        ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_division_rules(*input1, *input2, *output));
+        ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_division(*input1->clone(), *input2->clone(), *output->clone()).first);
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*input1, *input2, *output));
+        ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_arithmetic_operators(*input1->clone(), *input2->clone(), *output->clone()).first);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> CLArithmeticOperationKernel::validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
+{
+    if(_op == ArithmeticOperation::DIV)
+    {
+        // Division doesn't support integer arithmetic
+        return validate_and_configure_window_for_division(input1, input2, output);
+    }
+    else
+    {
+        return validate_and_configure_window_for_arithmetic_operators(input1, input2, output);
+    }
+}
+
+Status CLArithmeticOperationKernel::validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
+{
+    if(_op == ArithmeticOperation::DIV)
+    {
+        // Division doesn't support integer arithmetic
+        return validate_arguments_with_division_rules(input1, input2, output);
+    }
+    else
+    {
+        return validate_arguments_with_arithmetic_rules(input1, input2, output);
+    }
+}
+
+CLBuildOptions CLArithmeticOperationKernel::generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
+{
+    return generate_build_options_with_arithmetic_rules(input1, input2, output, name());
+}
+
+std::string CLArithmeticOperationKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output)
+{
+    return generate_id_for_tuning_common(kernel_name, input1, output);
+}
+
+std::string CLArithmeticOperationKernel::name()
+{
+    return supported_arithmetic_ops[_op];
+}
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp
index 6920667..5fdb826 100644
--- a/src/core/CL/kernels/CLFillBorderKernel.cpp
+++ b/src/core/CL/kernels/CLFillBorderKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -75,25 +75,18 @@
     // Select appropriate kernel
     std::string kernel_name = "fill_image_borders_" + lower_string(string_from_border_mode(border_mode));
 
-    // Define select type required by replicate border > 1
-    const DataType dt          = tensor->info()->data_type();
-    std::string    select_type = get_underlying_cl_type_from_data_type(dt);
-    if(is_data_type_float(dt))
-    {
-        select_type = (DataType::F32 == dt) ? "int" : "short";
-    }
+    const DataType dt = tensor->info()->data_type();
 
     // Define build options
-    std::set<std::string> build_opts;
-    build_opts.emplace(("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(dt)));
-    build_opts.emplace(("-DSELECT_TYPE=" + select_type));
-    build_opts.emplace(("-DBORDER_SIZE_TOP=" + support::cpp11::to_string(border_size.top)));
-    build_opts.emplace(("-DBORDER_SIZE_BOTTOM=" + support::cpp11::to_string(border_size.bottom)));
-    build_opts.emplace(("-DBORDER_SIZE_LEFT=" + support::cpp11::to_string(border_size.left)));
-    build_opts.emplace(("-DBORDER_SIZE_RIGHT=" + support::cpp11::to_string(border_size.right)));
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(dt));
+    build_opts.add_option("-DBORDER_SIZE_TOP=" + support::cpp11::to_string(border_size.top));
+    build_opts.add_option("-DBORDER_SIZE_BOTTOM=" + support::cpp11::to_string(border_size.bottom));
+    build_opts.add_option("-DBORDER_SIZE_LEFT=" + support::cpp11::to_string(border_size.left));
+    build_opts.add_option("-DBORDER_SIZE_RIGHT=" + support::cpp11::to_string(border_size.right));
 
     // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
     _tensor = tensor;
 
     // Create static kernel arguments
@@ -141,8 +134,9 @@
                 set_constant_border<float>(idx, constant_border_value);
                 break;
             case DataType::F16:
+                static_assert(sizeof(cl_half) == sizeof(half), "Half must be same size as cl_half");
                 static_assert(sizeof(cl_half) == 2, "Half must be 16 bit");
-                set_constant_border<cl_half>(idx, constant_border_value);
+                set_constant_border<half>(idx, constant_border_value);
                 break;
             default:
                 ARM_COMPUTE_ERROR("Not handled");
diff --git a/src/core/CL/kernels/CLFlattenLayerKernel.cpp b/src/core/CL/kernels/CLFlattenLayerKernel.cpp
index 5c38568..ef47d20 100644
--- a/src/core/CL/kernels/CLFlattenLayerKernel.cpp
+++ b/src/core/CL/kernels/CLFlattenLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -59,6 +59,7 @@
 
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
     }
 
     return Status{};
diff --git a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
index f333c1b..0857702 100644
--- a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
+++ b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -54,6 +54,7 @@
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_interleaved_shape(*input, mult_interleave4x4_height, reinterpret_input_as_3d));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
     }
 
     return Status{};
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
index b2fb3e0..1a1a4b7 100644
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -71,11 +71,25 @@
     }
     else
     {
-        const int m                         = reshape_info.m();
-        const int n                         = reshape_info.n();
-        const int k                         = reshape_info.k();
-        const int mult_transpose1xW_width   = reshape_info.mult_transpose1xW_width();
-        const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
+        GEMMRHSMatrixInfo rhs_info;
+        GEMMLHSMatrixInfo lhs_info;
+        const int         m                         = reshape_info.m();
+        const int         n                         = reshape_info.n();
+        const int         k                         = reshape_info.k();
+        const int         mult_transpose1xW_width   = reshape_info.mult_transpose1xW_width();
+        const int         mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
+        const bool        unroll_block              = dot8_supported(CLKernelLibrary::get().get_device());
+
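+        // Map the legacy interleave4x4 / transpose1xW parameters onto the new LHS/RHS matrix descriptors used to compute the expected reshaped shapes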
+        rhs_info.n0         = 16 / input1->element_size();
+        rhs_info.k0         = 1;
+        rhs_info.h0         = mult_transpose1xW_width;
+        rhs_info.interleave = false;
+        rhs_info.transpose  = false;
+        lhs_info.m0         = 4;
+        lhs_info.k0         = 4;
+        lhs_info.v0         = mult_interleave4x4_height;
+        lhs_info.interleave = true;
+        lhs_info.transpose  = !unroll_block;
 
         TensorShape tensor_shape0{ input0->tensor_shape() };
         tensor_shape0.set(0, k);
@@ -88,8 +102,8 @@
         const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);
         const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
 
-        const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_interleaved_shape(tensor_info0, mult_interleave4x4_height));
-        const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(tensor_info1, mult_transpose1xW_width));
+        const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info));
+        const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
 
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp
new file mode 100644
index 0000000..e9be1a6
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp
@@ -0,0 +1,308 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "support/ToolchainSupport.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+using ElementsProcessed = Steps;
+
+Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
+                          const GEMMReshapeInfo &gemm_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose);
+    ARM_COMPUTE_RETURN_ERROR_ON(!rhs_info.transpose);
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
+
+    const int m = gemm_info.m();
+    const int n = gemm_info.n();
+    const int k = gemm_info.k();
+
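+    // Rebuild the original (un-reshaped) LHS and RHS shapes from m, n and k, then verify that the inputs match the expected reshaped shapes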
+    TensorShape tensor_shape0{ input0->tensor_shape() };
+    tensor_shape0.set(0, k);
+    tensor_shape0.set(1, m);
+
+    TensorShape tensor_shape1{ input1->tensor_shape() };
+    tensor_shape1.set(0, n);
+    tensor_shape1.set(1, k);
+
+    const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);
+    const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
+
+    const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info));
+    const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
+
+    if(output->total_size() != 0)
+    {
+        const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
+                                                        const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed)
+{
+    unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
+    unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+    bool          reinterpret_output_as_3d            = (gemm_info.depth_output_gemm3d() != 0);
+
+    Window win{};
+    Window win_out{};
+    bool   window_changed = false;
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)).set_data_type(DataType::S32));
+
+    TensorInfo tmp_info(*output);
+
+    if(reinterpret_output_as_3d)
+    {
+        // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
+        // the window needs to be constructed on the 2D collapsed version of the tensor
+        TensorShape tmp_shape(output->tensor_shape());
+        tmp_shape.collapse(2U, 1U);
+        tmp_info.set_tensor_shape(tmp_shape);
+    }
+
+    // Configure kernel window
+    num_elems_processed_per_iteration_x = rhs_info.n0;
+    num_elems_processed_per_iteration_y = lhs_info.m0;
+
+    // Note: bottom paddings are calculated manually as the output can be reinterpreted as a 3D tensor
+    // The only way to set the paddings properly is to set them explicitly through AccessWindowStatic
+    const int m          = gemm_info.m();
+    const int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y;
+
+    win     = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+    win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+    AccessWindowStatic input0_access(input0, 0, 0,
+                                     ceil_to_multiple(input0->dimension(0), num_elems_processed_per_iteration_y),
+                                     input0->dimension(1));
+    AccessWindowStatic input1_access(input1, 0, 0,
+                                     ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
+                                     input1->dimension(1));
+    AccessWindowStatic output_access(output, 0, 0,
+                                     ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),
+                                     output->dimension(1) + bottom_pad);
+
+    window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
+                     update_window_and_padding(win_out, output_access);              // window used to update the padding requirements of output tensor
+
+    output_access.set_valid_region(win_out, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+
+    // Collapse along the Z direction
+    // This collapse needs to be here in order to tune the Z dimension of LWS
+    Window             collapsed             = win;
+    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
+    collapsed                                = win.collapse(win, dimension_to_collapse);
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, collapsed);
+}
+} // namespace
+
+CLGEMMLowpMatrixMultiplyReshapedKernel::CLGEMMLowpMatrixMultiplyReshapedKernel()
+    : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_output_as_3d(false), _k(1)
+{
+}
+
+void CLGEMMLowpMatrixMultiplyReshapedKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
+                                                       const GEMMReshapeInfo &gemm_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info));
+
+    _input0                   = input0;
+    _input1                   = input1;
+    _output                   = output;
+    _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
+    _k                        = gemm_info.k();
+
+    // Check if we need to slide the matrix B
+    const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
+    _slide_matrix_b                          = (_input1->info()->num_dimensions() >= num_dimensions_input0);
+
+    ElementsProcessed num_elements_processed{};
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info, num_elements_processed);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure_internal(win_config.second);
+
+    // Create build options
+    CLBuildOptions build_opts;
+    build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
+    build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1)));
+    build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2)));
+    build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
+    build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE");
+    build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
+    build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
+    build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
+    build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
+    build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0));
+    build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
+
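+    // Select the kernel variant according to the LHS/RHS transposition and whether the device supports the int8 dot product extension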
+    std::string kernel_name("gemmlowp_mm_reshaped_");
+    kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_";
+    kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt";
+    kernel_name += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : "";
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
+    _config_id += support::cpp11::to_string(output->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(gemm_info.k());
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.m0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.n0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.k0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.v0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.h0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.interleave);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.interleave);
+}
+
+Status CLGEMMLowpMatrixMultiplyReshapedKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info,
+                                                        const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
+{
+    ElementsProcessed num_elements_processed{};
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, lhs_info, rhs_info, gemm_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
+                                                              input1->clone().get(),
+                                                              output->clone().get(),
+                                                              lhs_info,
+                                                              rhs_info,
+                                                              gemm_info,
+                                                              num_elements_processed)
+                                .first);
+
+    return Status{};
+}
+
+void CLGEMMLowpMatrixMultiplyReshapedKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    if(_input1->info()->num_dimensions() < 3)
+    {
+        // The stride_z for matrix B must be zero if we do not slice
+        ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
+    }
+
+    Window slice          = window.first_slice_window_3D();
+    Window slice_matrix_b = slice;
+
+    slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
+    slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+    if(_reinterpret_output_as_3d)
+    {
+        // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
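+        // The pad argument follows the three 2D tensor arguments and the four cl_uint arguments (k and the three cross-plane strides) set inside the loop below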
+        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 4;
+        const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
+        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+    }
+
+    do
+    {
+        Window slice_b = slice;
+        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+        if(!_slide_matrix_b)
+        {
+            slice_b = slice_matrix_b;
+        }
+
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, _input0, slice);
+        add_2D_tensor_argument(idx, _input1, slice_b);
+        add_2D_tensor_argument(idx, _output, slice);
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_k));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
+        enqueue(queue, *this, slice, lws_hint());
+    }
+    while(window.slide_window_slice_3D(slice));
+}
\ No newline at end of file
diff --git a/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
index 825d7fb..803ed30 100644
--- a/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -86,14 +86,13 @@
     _input  = input;
     _output = output;
 
-    std::ostringstream ma_arguments;
-    ma_arguments << "-DBETA=" << beta;
-    std::set<std::string> build_opts;
-    build_opts.emplace(ma_arguments.str());
+    // Create build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DBETA=" + float_to_string_with_full_precision(beta));
 
     // Create kernel
     std::string data_type_name = lower_string(string_from_data_type(input->info()->data_type()));
-    _kernel                    = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(("gemm_ma_" + data_type_name), build_opts));
+    _kernel                    = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(("gemm_ma_" + data_type_name), build_opts.options()));
 
     // Configure kernel window
     auto win_config = validate_and_configure_window(input->info(), output->info());
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index c9ed776..2b004c2 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -40,15 +40,16 @@
 #include <set>
 #include <string>
 
-using namespace arm_compute;
+namespace arm_compute
+{
 using namespace arm_compute::misc::shape_calculator;
 
 namespace
 {
 using ElementsProcessed = Steps;
 
-inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info,
-                                 bool fp_mixed_precision)
+inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float beta,
+                                 bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, bool fp_mixed_precision)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input0);
@@ -60,17 +61,40 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_interleaved_transposed && reshape_info.reinterpret_input_as_3d(), "The input tensor cannot be reinterpreted as 3D if is_interleaved_transposed is true");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 2 && reshape_info.reinterpret_input_as_3d(), "The input1 tensor cannot have more than 2 dimensions if input0 has to be reinterpreted as 3D");
 
+    const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f;
+    const bool has_vec_c   = input2 != nullptr && beta != 0.f;
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(has_vec_c && !is_beta_one, "Adding input2 is only supported for beta equal to 1");
+
     if(!is_interleaved_transposed)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
+
+        if(has_vec_c)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input2);
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(input2->num_dimensions() > 1, "input2 must be a 1D tensor");
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(input2->dimension(0) != input1->dimension(0), "Length of Vector C must match the number of columns of matrix B");
+        }
     }
     else
     {
-        const int m                         = reshape_info.m();
-        const int n                         = reshape_info.n();
-        const int k                         = reshape_info.k();
-        const int mult_transpose1xW_width   = reshape_info.mult_transpose1xW_width();
-        const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
+        GEMMRHSMatrixInfo rhs_info;
+        GEMMLHSMatrixInfo lhs_info;
+        const int         m                         = reshape_info.m();
+        const int         n                         = reshape_info.n();
+        const int         k                         = reshape_info.k();
+        const int         mult_transpose1xW_width   = reshape_info.mult_transpose1xW_width();
+        const int         mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
+        rhs_info.n0                                 = 16 / input1->element_size();
+        rhs_info.k0                                 = 1;
+        rhs_info.h0                                 = mult_transpose1xW_width;
+        rhs_info.interleave                         = false;
+        rhs_info.transpose                          = false;
+        lhs_info.m0                                 = 4;
+        lhs_info.k0                                 = 4;
+        lhs_info.v0                                 = mult_interleave4x4_height;
+        lhs_info.interleave                         = true;
+        lhs_info.transpose                          = true;
 
         TensorShape tensor_shape0{ input0->tensor_shape() };
         tensor_shape0.set(0, k);
@@ -83,11 +107,17 @@
         const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);
         const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
 
-        const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_interleaved_shape(tensor_info0, mult_interleave4x4_height));
-        const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(tensor_info1, mult_transpose1xW_width));
+        const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info));
+        const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
 
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
+
+        if(has_vec_c)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input2);
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(input2->num_dimensions() > 1, "input2 must be a 1D tensor");
+        }
     }
 
     if(output->total_size() != 0)
@@ -100,10 +130,11 @@
     return Status{};
 }
 
-inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output,
-                                                               bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target,
+inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output,
+                                                               float beta, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target,
                                                                ElementsProcessed &num_elements_processed)
 {
+    ARM_COMPUTE_UNUSED(beta);
     bool   window_changed = false;
     Window win{};
     Window win_out{};
@@ -113,6 +144,7 @@
     unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
     bool           reinterpret_input_as_3d             = reshape_info.reinterpret_input_as_3d();
     bool           reinterpret_output_as_3d            = (reshape_info.depth_output_gemm3d() != 0);
+    const bool     has_vec_c                           = input2 != nullptr && beta != 0.f;
 
     // In case both input and output have to be reinterpreted as 3D tensors,
     // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
@@ -153,16 +185,21 @@
         win     = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
         win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
-        AccessWindowRectangle input0_access(input0, 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f);
-        AccessWindowStatic    input1_access(input1, 0, 0,
-                                            ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
-                                            ceil_to_multiple(input1->dimension(1), num_elems_processed_per_iteration_y));
+        AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0), input0->dimension(1));
+        AccessWindowStatic input1_access(input1, 0, 0,
+                                         ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
+                                         ceil_to_multiple(input1->dimension(1), num_elems_processed_per_iteration_y));
         AccessWindowStatic output_access(output, 0, 0,
                                          ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),
                                          output->dimension(1) + bottom_pad);
 
         window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
                          update_window_and_padding(win_out, output_access);              // window used to update the padding requirements of output tensor
+        if(has_vec_c)
+        {
+            AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration_x);
+            window_changed = window_changed || update_window_and_padding(win, input2_access);
+        }
 
         output_access.set_valid_region(win_out, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
     }
@@ -196,6 +233,11 @@
 
         window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
                          update_window_and_padding(win_out, output_access);              // window used to update the padding requirements of output tensor
+        if(has_vec_c)
+        {
+            AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration_x);
+            window_changed = window_changed || update_window_and_padding(win, input2_access);
+        }
 
         Coordinates coord;
         coord.set_num_dimensions(output->num_dimensions());
@@ -214,20 +256,22 @@
 } // namespace
 
 CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel()
-    : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false)
+    : _input0(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _has_vec_c(false)
 {
 }
 
-void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info,
-                                           bool fp_mixed_precision)
+void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta,
+                                           bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, bool fp_mixed_precision)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
 
     // Perform validate step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info, fp_mixed_precision));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), (input2 != nullptr) ? input2->info() : nullptr, output->info(), beta,
+                                                  is_interleaved_transposed, reshape_info, fp_mixed_precision));
 
     _input0                   = input0;
     _input1                   = input1;
+    _input2                   = input2;
     _output                   = output;
     _reinterpret_input_as_3d  = reshape_info.reinterpret_input_as_3d();
     _reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0);
@@ -253,7 +297,8 @@
     ElementsProcessed num_elements_processed{};
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info, gpu_target, num_elements_processed);
+    auto win_config = validate_and_configure_window(input0->info(), input1->info(), (input2 != nullptr) ? input2->info() : nullptr, output->info(), beta, is_interleaved_transposed, reshape_info,
+                                                    gpu_target, num_elements_processed);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure_internal(win_config.second);
 
@@ -275,6 +320,8 @@
 
     const bool is_bifrost = get_arch_from_target(gpu_target) == GPUTarget::BIFROST;
 
+    _has_vec_c = input2 != nullptr && beta != 0.f;
+
     std::string kernel_name;
     if(is_interleaved_transposed)
     {
@@ -338,6 +385,9 @@
         build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elements_processed.x()));
     }
 
+    // Configure matrix C addition if necessary
+    build_opts.add_option_if(_has_vec_c, "-DADD_VEC_C");
+
     // Create kernel
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
@@ -360,16 +410,18 @@
     _config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));
 }
 
-Status CLGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed,
-                                            const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, bool fp_mixed_precision)
+Status CLGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta,
+                                            bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, bool fp_mixed_precision)
 {
     // Note: num_elements_processed will be set in validate_and_configure_window()
     ElementsProcessed num_elements_processed{};
     ARM_COMPUTE_UNUSED(alpha);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed, reshape_info, fp_mixed_precision));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, input2, output, beta, is_interleaved_transposed, reshape_info, fp_mixed_precision));
     ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
                                                               input1->clone().get(),
+                                                              (input2 != nullptr) ? input2->clone().get() : nullptr,
                                                               output->clone().get(),
+                                                              beta,
                                                               is_interleaved_transposed,
                                                               reshape_info,
                                                               gpu_target,
@@ -396,10 +448,12 @@
     slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
     slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
 
+    const unsigned int num_arguments_vec_c = (_has_vec_c) ? num_arguments_per_1D_tensor() : 0;
+
     if(_reinterpret_input_as_3d)
     {
         // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
-        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3;
+        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3 + num_arguments_vec_c;
         const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;
         _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
     }
@@ -407,7 +461,7 @@
     if(_reinterpret_output_as_3d)
     {
         // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
-        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
+        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0) + num_arguments_vec_c;
         const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
         _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
     }
@@ -425,6 +479,10 @@
         unsigned int idx = 0;
         add_2D_tensor_argument(idx, _input0, slice);
         add_2D_tensor_argument(idx, _input1, slice_b);
+        if(_has_vec_c)
+        {
+            add_1D_tensor_argument(idx, _input2, slice);
+        }
         add_2D_tensor_argument(idx, _output, slice);
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
@@ -433,3 +491,4 @@
     }
     while(window.slide_window_slice_3D(slice));
 }
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
new file mode 100644
index 0000000..b6816ac
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
@@ -0,0 +1,314 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "support/ToolchainSupport.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+using ElementsProcessed = Steps;
+
+Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
+                          const GEMMReshapeInfo &gemm_info)
+{
+    ARM_COMPUTE_UNUSED(alpha);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input0);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32, DataType::F16);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose);
+    ARM_COMPUTE_RETURN_ERROR_ON(!rhs_info.transpose);
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
+
+    const int m = gemm_info.m();
+    const int n = gemm_info.n();
+    const int k = gemm_info.k();
+
+    TensorShape tensor_shape0{ input0->tensor_shape() };
+    tensor_shape0.set(0, k);
+    tensor_shape0.set(1, m);
+
+    TensorShape tensor_shape1{ input1->tensor_shape() };
+    tensor_shape1.set(0, n);
+    tensor_shape1.set(1, k);
+
+    const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);
+    const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
+
+    const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info));
+    const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
+
+    if(output->total_size() != 0)
+    {
+        const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
+    }
+
+    return Status{};
+}
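
The k0/n0 checks above rely on the bit trick `x & (x - 1)`, which is zero exactly when x is a power of two; combining it with the explicit `!= 3` exception and the separate upper-bound check yields the documented set {2,3,4,8,16}. A minimal standalone sketch (not part of this patch) of that acceptance test:

    #include <iostream>

    // Hypothetical helper mirroring the validation above: accept a block size
    // only if it is a power of two or 3, and no larger than 16.
    static bool is_supported_block_size(unsigned int x)
    {
        const bool power_of_two = (x != 0) && ((x & (x - 1)) == 0);
        return (power_of_two || x == 3) && x >= 2 && x <= 16;
    }

    int main()
    {
        for(unsigned int x = 1; x <= 17; ++x)
        {
            std::cout << x << ": " << (is_supported_block_size(x) ? "accepted" : "rejected") << "\n";
        }
        return 0; // accepted only for 2, 3, 4, 8 and 16
    }
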
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
+                                                        const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed)
+{
+    unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
+    unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+    bool          reinterpret_output_as_3d            = (gemm_info.depth_output_gemm3d() != 0);
+
+    Window win{};
+    Window win_out{};
+    bool   window_changed = false;
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)));
+
+    TensorInfo tmp_info(*output);
+
+    if(reinterpret_output_as_3d)
+    {
+        // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
+        // the window needs to be constructed on the 2D collapsed version of the tensor
+        TensorShape tmp_shape(output->tensor_shape());
+        tmp_shape.collapse(2U, 1U);
+        tmp_info.set_tensor_shape(tmp_shape);
+    }
+
+    // Configure kernel window
+    num_elems_processed_per_iteration_x = rhs_info.n0;
+    num_elems_processed_per_iteration_y = lhs_info.m0;
+
+    // Note: bottom paddings are calculated manually as the output can be reinterpreted as a 3D tensor
+    // The only way to set the paddings properly is to set them explicitly through the AccessWindowStatic
+    const int m          = gemm_info.m();
+    const int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y;
+
+    win     = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+    win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+    AccessWindowStatic input0_access(input0, 0, 0,
+                                     ceil_to_multiple(input0->dimension(0), num_elems_processed_per_iteration_y),
+                                     input0->dimension(1));
+    AccessWindowStatic input1_access(input1, 0, 0,
+                                     ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
+                                     input1->dimension(1));
+    AccessWindowStatic output_access(output, 0, 0,
+                                     ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),
+                                     output->dimension(1) + bottom_pad);
+
+    window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
+                     update_window_and_padding(win_out, output_access);              // window used to update the padding requirements of output tensor
+
+    output_access.set_valid_region(win_out, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+
+    // Collapse along the Z direction
+    // This collapse needs to be here in order to tune the Z dimension of LWS
+    Window             collapsed             = win;
+    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
+    collapsed                                = win.collapse(win, dimension_to_collapse);
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, collapsed);
+}
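
The bottom padding computed above, `(m0 - (m % m0)) % m0`, is the number of extra rows needed to round m up to the next multiple of m0 (zero when m is already a multiple), and it matches the `ceil_to_multiple(m, m0) - m` form used by the reshape kernels. A small illustrative sketch with a worked value; `ceil_to_multiple` here is a local re-implementation, not the library helper:

    #include <cassert>

    // Rows of padding needed so that m becomes a multiple of the block size m0.
    static int bottom_pad(int m, int m0)
    {
        return (m0 - (m % m0)) % m0;
    }

    // Equivalent formulation via rounding up to the next multiple.
    static int ceil_to_multiple(int value, int divisor)
    {
        return ((value + divisor - 1) / divisor) * divisor;
    }

    int main()
    {
        assert(bottom_pad(23, 4) == 1);  // 23 rows of output -> padded to 24
        assert(bottom_pad(24, 4) == 0);  // already a multiple of m0
        assert(ceil_to_multiple(23, 4) - 23 == bottom_pad(23, 4));
        return 0;
    }
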
+} // namespace
+
+CLGEMMMatrixMultiplyReshapedKernel::CLGEMMMatrixMultiplyReshapedKernel()
+    : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_output_as_3d(false), _k(1)
+{
+}
+
+void CLGEMMMatrixMultiplyReshapedKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, const GEMMLHSMatrixInfo &lhs_info,
+                                                   const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), alpha, lhs_info, rhs_info, gemm_info));
+
+    _input0                   = input0;
+    _input1                   = input1;
+    _output                   = output;
+    _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
+    _k                        = gemm_info.k();
+
+    // Check if we need to slide the matrix B
+    const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
+    _slide_matrix_b                          = (_input1->info()->num_dimensions() >= num_dimensions_input0);
+
+    ElementsProcessed num_elements_processed{};
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info, num_elements_processed);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure_internal(win_config.second);
+
+    // Create build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
+    build_opts.add_option_if(std::abs(1.0f - alpha) > 0.00001f, "-DALPHA=" + float_to_string_with_full_precision(alpha));
+    build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
+    build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1)));
+    build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2)));
+    build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
+    build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE");
+    build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
+    build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
+    build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
+    build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
+    build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0));
+    build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
+
+    std::string kernel_name("gemm_mm_reshaped_");
+    kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_";
+    kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt";
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
+    _config_id += lower_string(string_from_data_type(input0->info()->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(gemm_info.k());
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.m0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.n0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.k0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.v0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.h0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.interleave);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.interleave);
+}
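
The kernel name above is assembled from the transposition flags, so the only combination accepted by validate_arguments() (LHS not transposed, RHS transposed) selects "gemm_mm_reshaped_lhs_nt_rhs_t". A minimal sketch of the same string composition; the helper name is illustrative, not the library API:

    #include <iostream>
    #include <string>

    static std::string reshaped_kernel_name(bool lhs_transposed, bool rhs_transposed)
    {
        std::string name("gemm_mm_reshaped_");
        name += lhs_transposed ? "lhs_t_" : "lhs_nt_";
        name += rhs_transposed ? "rhs_t" : "rhs_nt";
        return name;
    }

    int main()
    {
        // The configuration accepted by validate_arguments(): !lhs_info.transpose && rhs_info.transpose
        std::cout << reshaped_kernel_name(false, true) << "\n"; // gemm_mm_reshaped_lhs_nt_rhs_t
        return 0;
    }
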
+
+Status CLGEMMMatrixMultiplyReshapedKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, const GEMMLHSMatrixInfo &lhs_info,
+                                                    const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
+{
+    ElementsProcessed num_elements_processed{};
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, alpha, lhs_info, rhs_info, gemm_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
+                                                              input1->clone().get(),
+                                                              output->clone().get(),
+                                                              lhs_info,
+                                                              rhs_info,
+                                                              gemm_info,
+                                                              num_elements_processed)
+                                .first);
+
+    return Status{};
+}
+
+void CLGEMMMatrixMultiplyReshapedKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    if(_input1->info()->num_dimensions() < 3)
+    {
+        // The stride_z for matrix B must be zero if we do not slice
+        ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
+    }
+
+    Window slice          = window.first_slice_window_3D();
+    Window slice_matrix_b = slice;
+
+    slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
+    slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+    if(_reinterpret_output_as_3d)
+    {
+        // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
+        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 4;
+        const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
+        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+    }
+
+    do
+    {
+        Window slice_b = slice;
+        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+        if(!_slide_matrix_b)
+        {
+            slice_b = slice_matrix_b;
+        }
+
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, _input0, slice);
+        add_2D_tensor_argument(idx, _input1, slice_b);
+        add_2D_tensor_argument(idx, _output, slice);
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_k));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
+        enqueue(queue, *this, slice, lws_hint());
+    }
+    while(window.slide_window_slice_3D(slice));
+}
\ No newline at end of file
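
In run() above, the scalar arguments follow the three 2D tensors in a fixed order (k, then the three cross-plane strides), which is why the bottom-padding argument lands at index `3 * num_arguments_per_2D_tensor() + 4`. A hedged sketch of that index arithmetic, assuming only that num_arguments_per_2D_tensor() returns the per-tensor argument count N:

    #include <cassert>

    // Index of the cross-plane-pad argument: it comes after the arguments of
    // `num_tensors` 2D tensors and after `num_scalars_before` scalar arguments
    // (here: k plus the three z-strides).
    static unsigned int cross_plane_pad_index(unsigned int args_per_2d_tensor,
                                              unsigned int num_tensors,
                                              unsigned int num_scalars_before)
    {
        return num_tensors * args_per_2d_tensor + num_scalars_before;
    }

    int main()
    {
        // Whatever N the library uses per 2D tensor, the pad argument sits at 3 * N + 4.
        const unsigned int n = 6; // placeholder value, not the real constant
        assert(cross_plane_pad_index(n, 3, 4) == 3 * n + 4);
        return 0;
    }
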
diff --git a/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp b/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp
new file mode 100644
index 0000000..72f2ca4
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.v0 == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S8,
+                                                         DataType::U16, DataType::S16, DataType::U32, DataType::S32,
+                                                         DataType::F16, DataType::F32);
+
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_lhs_reshaped_shape(*input, lhs_info, reinterpret_input_as_3d));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
+{
+    const unsigned int num_elems_processed_per_iteration_x = lhs_info.k0;
+    const unsigned int num_elems_processed_per_iteration_y = lhs_info.m0;
+    bool               window_changed                      = false;
+
+    TensorInfo tmp_info(*input);
+
+    if(reinterpret_input_as_3d)
+    {
+        // Since the input tensor has to be reinterpreted as 3D and the execute window is based on a 2D interleave,
+        // the window needs to be constructed on the 2D collapsed version of the tensor
+        TensorShape tmp_shape(input->tensor_shape());
+        tmp_shape.collapse(2U, 1U);
+        tmp_info.set_tensor_shape(tmp_shape);
+    }
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*input, lhs_info, reinterpret_input_as_3d)));
+
+    // Configure window
+    // Note: bottom paddings are calculated manually as the input can be reinterpreted as a 3D tensor
+    // The only way to set the paddings properly is to set them explicitly through the AccessWindowStatic
+    const int m          = reinterpret_input_as_3d ? input->tensor_shape()[1] * input->tensor_shape()[2] : input->tensor_shape()[1];
+    const int bottom_pad = ceil_to_multiple(m, num_elems_processed_per_iteration_y) - m;
+
+    Window win    = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+    Window win_in = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+    AccessWindowStatic input_access(input, 0, 0,
+                                    ceil_to_multiple(input->dimension(0), num_elems_processed_per_iteration_x),
+                                    input->dimension(1) + bottom_pad);
+    AccessWindowStatic output_access(output, 0, 0, output->dimension(0), output->dimension(1));
+
+    window_changed = update_window_and_padding(win_in, input_access) || // window used by the execute_window_loop
+                     update_window_and_padding(win, output_access);     // window used to update the padding requirements of output tensor
+    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+
+    // Collapse along the Z direction
+    // This collapse needs to be here in order to tune the Z dimension of LWS
+    Window collapsed = win.collapse(win, Window::DimZ);
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, collapsed);
+}
+} // namespace
+
+CLGEMMReshapeLHSMatrixKernel::CLGEMMReshapeLHSMatrixKernel()
+    : _input(nullptr), _output(nullptr), _reinterpret_input_as_3d(false)
+{
+}
+
+void CLGEMMReshapeLHSMatrixKernel::configure(const ICLTensor *input, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    // Perform validate step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), lhs_info, reinterpret_input_as_3d));
+
+    _input                   = input;
+    _output                  = output;
+    _reinterpret_input_as_3d = reinterpret_input_as_3d;
+
+    // Create build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
+    build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
+    build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0));
+    build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
+    build_opts.add_option_if(lhs_info.interleave, "-DINTERLEAVE");
+    build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
+    build_opts.add_option_if(_reinterpret_input_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(input->info()->dimension(1)));
+    build_opts.add_option_if(_reinterpret_input_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(input->info()->dimension(2)));
+
+    switch(input->info()->element_size())
+    {
+        case 1:
+            build_opts.add_option("-DDATA_TYPE=uchar");
+            break;
+        case 2:
+            build_opts.add_option("-DDATA_TYPE=ushort");
+            break;
+        case 4:
+            build_opts.add_option("-DDATA_TYPE=uint");
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Data type not supported");
+    }
+
+    std::string kernel_name("gemm_reshape_lhs_matrix_");
+    kernel_name += lhs_info.transpose ? "t" : "nt";
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), output->info(), lhs_info, reinterpret_input_as_3d);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure_internal(win_config.second);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = "gemm_reshape_lhs_matrix_";
+    _config_id += (_reinterpret_input_as_3d ? "3d_" : "");
+    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.m0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.k0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.v0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.interleave);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.transpose);
+}
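
The switch on element_size() in configure() reflects that the reshape only moves blocks of bytes, so any 1-, 2- or 4-byte data type can reuse the same kernels built for uchar, ushort or uint respectively. A standalone sketch of that mapping, illustrative only:

    #include <cstddef>
    #include <iostream>
    #include <stdexcept>
    #include <string>

    // Map an element size in bytes to the unsigned OpenCL type used to copy it.
    static std::string cl_copy_type_for_element_size(std::size_t element_size)
    {
        switch(element_size)
        {
            case 1: return "uchar";
            case 2: return "ushort";
            case 4: return "uint";
            default: throw std::runtime_error("Data type not supported");
        }
    }

    int main()
    {
        std::cout << cl_copy_type_for_element_size(sizeof(float)) << "\n"; // uint  (4-byte types such as F32/S32)
        std::cout << cl_copy_type_for_element_size(2) << "\n";             // ushort (2-byte types such as F16/S16)
        return 0;
    }
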
+
+Status CLGEMMReshapeLHSMatrixKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, lhs_info, reinterpret_input_as_3d));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), lhs_info, reinterpret_input_as_3d).first);
+
+    return Status{};
+}
+
+void CLGEMMReshapeLHSMatrixKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_3D();
+
+    if(_reinterpret_input_as_3d)
+    {
+        // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
+        const unsigned int idx0                  = 2 * num_arguments_per_3D_tensor();
+        const unsigned int total_cross_plane_pad = _input->info()->padding().top + _input->info()->padding().bottom;
+        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+    }
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, slice);
+        add_3D_tensor_argument(idx, _output, slice);
+        enqueue(queue, *this, slice, lws_hint());
+    }
+    while(window.slide_window_slice_3D(slice));
+}
\ No newline at end of file
diff --git a/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp b/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp
new file mode 100644
index 0000000..5b9e68d
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const GEMMRHSMatrixInfo &rhs_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.h0 == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && (rhs_info.k0 != 1) && (rhs_info.k0 != 3)), "Only 1,2,3,4,8,16 are supported for k0");
+    ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 > 16);
+    ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16);
+    ARM_COMPUTE_RETURN_ERROR_ON((rhs_info.k0 == 1) && (rhs_info.transpose));
+
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S8,
+                                                         DataType::U16, DataType::S16, DataType::U32, DataType::S32,
+                                                         DataType::F16, DataType::F32);
+
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_rhs_reshaped_shape(*input, rhs_info));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const GEMMRHSMatrixInfo &rhs_info)
+{
+    const unsigned int num_elems_processed_per_iteration_x = rhs_info.n0;
+    const unsigned int num_elems_processed_per_iteration_y = rhs_info.k0;
+    bool               window_changed                      = false;
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*input, rhs_info)));
+
+    // Configure window
+    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+    AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+    AccessWindowStatic    output_access(output, 0, 0, output->dimension(0), output->dimension(1));
+
+    window_changed = update_window_and_padding(win, input_access);
+    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+
+    // Collapse along the Z direction
+    // This collapse needs to be here in order to tune the Z dimension of LWS
+    Window collapsed = win.collapse(win, Window::DimZ);
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, collapsed);
+}
+} // namespace
+
+CLGEMMReshapeRHSMatrixKernel::CLGEMMReshapeRHSMatrixKernel()
+    : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLGEMMReshapeRHSMatrixKernel::configure(const ICLTensor *input, ICLTensor *output, const GEMMRHSMatrixInfo &rhs_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    // Perform validate step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), rhs_info));
+
+    _input  = input;
+    _output = output;
+
+    // Create build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
+    build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
+    build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
+    build_opts.add_option_if(rhs_info.transpose, "-DTRANSPOSE");
+    build_opts.add_option_if(rhs_info.interleave, "-DINTERLEAVE");
+    build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
+
+    switch(input->info()->element_size())
+    {
+        case 1:
+            build_opts.add_option("-DDATA_TYPE=uchar");
+            break;
+        case 2:
+            build_opts.add_option("-DDATA_TYPE=ushort");
+            break;
+        case 4:
+            build_opts.add_option("-DDATA_TYPE=uint");
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Data type not supported");
+    }
+
+    std::string kernel_name("gemm_reshape_rhs_matrix_");
+    kernel_name += rhs_info.transpose ? "t" : "nt";
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), output->info(), rhs_info);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure_internal(win_config.second);
+}
+
+Status CLGEMMReshapeRHSMatrixKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const GEMMRHSMatrixInfo &rhs_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, rhs_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), rhs_info).first);
+
+    return Status{};
+}
+
+void CLGEMMReshapeRHSMatrixKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_3D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, slice);
+        add_3D_tensor_argument(idx, _output, slice);
+        enqueue(queue, *this, slice);
+    }
+    while(window.slide_window_slice_3D(slice));
+}
\ No newline at end of file
diff --git a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
index aa1b92a..986a009 100644
--- a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -55,6 +55,7 @@
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
                                                            compute_transpose1xW_with_element_size_shape(*input, mult_transpose1xW_width));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
     }
 
     return Status{};
diff --git a/src/core/CL/kernels/CLGatherKernel.cpp b/src/core/CL/kernels/CLGatherKernel.cpp
new file mode 100644
index 0000000..412821b
--- /dev/null
+++ b/src/core/CL/kernels/CLGatherKernel.cpp
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGatherKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include <string>
+
+namespace arm_compute
+{
+namespace
+{
+inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
+{
+    const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions()));
+    ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+    ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= input->num_dimensions());
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
+                                                         DataType::U16, DataType::S16,
+                                                         DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+        TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis);
+        ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
+    }
+
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32);
+
+    return Status{};
+}
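
validate_arguments() above maps a possibly negative axis into [0, num_dimensions) through wrap_around(), so an axis of -1 on a 4-D input addresses dimension 3. A small sketch of a typical wrap-around definition; this is a local re-implementation, not the library helper:

    #include <cassert>

    // Wrap a signed axis into the range [0, rank), so negative values count
    // from the last dimension backwards.
    static unsigned int wrap_axis(int axis, int rank)
    {
        return static_cast<unsigned int>(((axis % rank) + rank) % rank);
    }

    int main()
    {
        assert(wrap_axis(-1, 4) == 3U); // last dimension of a 4-D tensor
        assert(wrap_axis(0, 4) == 0U);
        assert(wrap_axis(2, 4) == 2U);
        return 0;
    }
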
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *indices, ITensorInfo *output, int axis)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
+    const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions()));
+    // Output auto initialization if not yet initialized
+    TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis);
+    auto_init_if_empty((*output), output_shape, 1, input->data_type());
+
+    // Create window
+    Window win = calculate_max_window(*output, Steps());
+    output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+
+    return std::make_pair(Status{}, win);
+}
+
+} // namespace
+
+CLGatherKernel::CLGatherKernel()
+    : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0)
+{
+}
+
+void CLGatherKernel::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), indices->info(), output->info(), axis));
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), indices->info(), output->info(), axis);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
+    _input   = input;
+    _output  = output;
+    _indices = indices;
+    _axis    = wrap_around(axis, static_cast<int>(input->info()->num_dimensions()));
+
+    // Set build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DOUTPUT_DIM_Z=" + support::cpp11::to_string(output->info()->dimension(2)));
+    build_opts.add_option("-DINPUT_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2)));
+    build_opts.add_option("-DAXIS=" + support::cpp11::to_string(_axis));
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gather", build_opts.options()));
+    ICLKernel::configure_internal(win_config.second);
+}
+
+Status CLGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, indices, output, axis));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), indices->clone().get(), output->clone().get(), axis).first);
+    return Status{};
+}
+
+void CLGatherKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    Window       window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    unsigned int idx              = 0;
+    add_4D_tensor_argument(idx, _input, window_collapsed);
+    add_1D_tensor_argument(idx, _indices, window_collapsed);
+    add_4D_tensor_argument(idx, _output, window_collapsed);
+    enqueue(queue, *this, window_collapsed);
+}
+} // namespace arm_compute
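
For the 1-D indices handled by this kernel, the expected gather output shape is simply the input shape with the gathered axis replaced by the number of indices. A hedged sketch of that shape rule using plain std::vector shapes rather than the library's TensorShape:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Shape of gather(input, indices, axis) for 1-D indices: replace the
    // extent along `axis` with the number of indices.
    static std::vector<std::size_t> gather_shape(std::vector<std::size_t> input_shape,
                                                 std::size_t num_indices,
                                                 std::size_t axis)
    {
        input_shape[axis] = num_indices;
        return input_shape;
    }

    int main()
    {
        // Gathering 5 indices along axis 1 of an (8, 16, 3) tensor.
        const std::vector<std::size_t> out = gather_shape({8, 16, 3}, 5, 1);
        assert((out == std::vector<std::size_t>{8, 5, 3}));
        return 0;
    }
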
diff --git a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
index 5d100a4..ab95ddc 100644
--- a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
+++ b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,6 +42,7 @@
 Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(anchors, all_anchors);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(anchors);
     ARM_COMPUTE_RETURN_ERROR_ON(anchors->dimension(0) != info.values_per_roi());
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(anchors, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(anchors->num_dimensions() > 2);
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index 54ef23f..8caa927 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -74,6 +74,7 @@
         const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
     }
 
     return Status{};
@@ -192,11 +193,15 @@
         num_elems_processed_per_iteration = 2;
         is_padding_required_nchw          = false;
 
-        // Only the 3x3 case is optimized for NHWC
+        // Only the 3x3 and 9x9 cases are optimized for NHWC
         if(kernel_dims == Size2D(3U, 3U))
         {
             kernel_name = "im2col3x3_";
         }
+        else if(kernel_dims == Size2D(9U, 9U))
+        {
+            kernel_name = "im2col9x9_";
+        }
 
         build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
         build_opts.add_option("-DLAST_ACCESSED=" + support::cpp11::to_string(std::max(static_cast<int>(input_channel - num_elems_processed_per_iteration), 0)));
diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
index 97dd919..e33dab0 100644
--- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
+++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
@@ -49,8 +50,9 @@
 
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, sum, output);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 2, "Axis greater than 2 is not supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
 
     // Reduce shape on axis
diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
index 67357da..9623ec6 100644
--- a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
@@ -37,20 +37,21 @@
 
 namespace
 {
+constexpr unsigned int num_elems_processed_per_iteration = 4;
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, NormalizationLayerInfo norm_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NCHW, DataLayout::NHWC);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC && norm_info.type() == NormType::IN_MAP_2D,
-                                    "Only Cross-map and 1D In-map normalization is supported for NHWC layout");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
 
     // Checks performed when output is configured
     if(output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
     }
 
@@ -62,8 +63,6 @@
     // Output tensor auto initialization if not yet initialized
     auto_init_if_empty(*output, *input->clone());
 
-    const unsigned int num_elems_processed_per_iteration = 4;
-
     const unsigned int norm_idx              = get_normalization_dimension_index(input->data_layout(), norm_info);
     const bool         is_norm_accross_width = norm_idx == 0;
 
@@ -118,15 +117,14 @@
     _input  = input;
     _output = output;
 
-    const unsigned int num_elems_processed_per_iteration = 4;
-    const bool         is_in_map_2D                      = (norm_info.type() == NormType::IN_MAP_2D);
-
     const DataLayout   data_layout  = input->info()->data_layout();
     const unsigned int norm_idx     = get_normalization_dimension_index(data_layout, norm_info);
     _is_norm_across_width           = norm_idx == 0;
     const unsigned int border_width = _is_norm_across_width ? num_elems_processed_per_iteration - 1 : 0;
     _border_size                    = BorderSize(0, border_width);
 
+    const bool is_in_map_2D = (norm_info.type() == NormType::IN_MAP_2D);
+
     // Set build options
     CLBuildOptions build_opts;
     build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
@@ -140,8 +138,24 @@
     build_opts.add_option_if(norm_info.is_in_map() || (data_layout == DataLayout::NHWC && norm_info.is_cross_map()), "-DWIDTH_SIZE=" + support::cpp11::to_string(input->info()->dimension(0)));
 
     // Create kernel
-    std::string kernel_name = _is_norm_across_width ? "normalization_layer_in_map" : "normalization_layer_cross_map";
-    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+    std::string kernel_name;
+    if(norm_info.is_in_map())
+    {
+        kernel_name = "normalization_layer_in_map_" + lower_string(string_from_data_layout(data_layout));
+    }
+    else
+    {
+        if(data_layout == DataLayout::NCHW)
+        {
+            kernel_name = "normalization_layer_cross_map";
+        }
+        else
+        {
+            // 1D Cross-Map normalization in NHWC is the same as 1D In-Map normalization in NCHW
+            kernel_name = "normalization_layer_in_map_nchw";
+        }
+    }
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
     // Configure kernel window
     auto win_config = validate_and_configure_window(input->info(), output->info(), norm_info);
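
The selection above reduces to three kernels: in-map normalization keyed by the data layout, cross-map in NCHW, and cross-map in NHWC reusing the NCHW in-map kernel, because in NHWC the channels lie along the innermost dimension, exactly where the NCHW in-map kernel normalizes. A sketch of the same decision; the layout is modelled as a plain enum here:

    #include <iostream>
    #include <string>

    enum class Layout { NCHW, NHWC };

    static std::string normalization_kernel(bool is_in_map, Layout layout)
    {
        if(is_in_map)
        {
            return std::string("normalization_layer_in_map_") + (layout == Layout::NCHW ? "nchw" : "nhwc");
        }
        // Cross-map: NHWC reuses the NCHW in-map kernel (channels are innermost in NHWC).
        return layout == Layout::NCHW ? "normalization_layer_cross_map" : "normalization_layer_in_map_nchw";
    }

    int main()
    {
        std::cout << normalization_kernel(false, Layout::NHWC) << "\n"; // normalization_layer_in_map_nchw
        std::cout << normalization_kernel(true, Layout::NHWC) << "\n";  // normalization_layer_in_map_nhwc
        return 0;
    }
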
diff --git a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
index a44507b..9033016 100644
--- a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -57,6 +57,7 @@
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
     }
 
     return Status{};
diff --git a/src/core/CL/kernels/CLPermuteKernel.cpp b/src/core/CL/kernels/CLPermuteKernel.cpp
index a9a2c5c..a5fc1a7 100644
--- a/src/core/CL/kernels/CLPermuteKernel.cpp
+++ b/src/core/CL/kernels/CLPermuteKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -56,17 +56,22 @@
                                                          DataType::U16, DataType::S16,
                                                          DataType::U32, DataType::S32,
                                                          DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((perm != PermutationVector{ 2U, 0U, 1U })
-                                    && (perm != PermutationVector{ 1U, 2U, 0U })
-                                    && (perm != PermutationVector{ 3U, 2U, 0U, 1U }),
-                                    "Only [2, 0, 1], [1, 2, 0] and [3, 2, 0, 1] permutation is supported");
 
-    const TensorShape output_shape = misc::shape_calculator::compute_permutation_output_shape(*input, perm);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() < 1 || input->num_dimensions() > 4,
+                                    "Permutation up to 4-D input tensors is supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(perm.num_dimensions() < 1 || perm.num_dimensions() > 4,
+                                    "Permutation vector size should be less than or equal to 4");
+    for(const auto &p : perm)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(p >= perm.num_dimensions(), "Permutation vector has invalid values");
+    }
 
     // Validate configured output
     if(output->total_size() != 0)
     {
+        const TensorShape output_shape = misc::shape_calculator::compute_permutation_output_shape(*input, perm);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
     return Status{};
@@ -87,30 +92,16 @@
     auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
 
     // Create kernel
-    std::set<std::string> build_opts;
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
+    // New positions of width(W), height(H), channel(C) and batch(D) based on permutation vector
+    build_opts.add_option("-DP1=" + support::cpp11::to_string((_perm.num_dimensions() >= 1) ? perm[0] : 0));
+    build_opts.add_option("-DP2=" + support::cpp11::to_string((_perm.num_dimensions() >= 2) ? perm[1] : 1));
+    build_opts.add_option("-DP3=" + support::cpp11::to_string((_perm.num_dimensions() >= 3) ? perm[2] : 2));
+    build_opts.add_option("-DP4=" + support::cpp11::to_string((_perm.num_dimensions() >= 4) ? perm[3] : 3));
 
-    build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-    build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
-
-    // Run [2, 0, 1] permute
-    if(_perm == PermutationVector{ 2U, 0U, 1U })
-    {
-        _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("permute_201", build_opts));
-    }
-    // Run [1, 2, 0] permute
-    else if(_perm == PermutationVector{ 1U, 2U, 0U })
-    {
-        _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("permute_120", build_opts));
-    }
-    // Run [3, 2, 0, 1] permute
-    else if(_perm == PermutationVector{ 3U, 2U, 0U, 1U })
-    {
-        _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("permute_3201", build_opts));
-    }
-    else
-    {
-        ARM_COMPUTE_ERROR("Not supported.");
-    }
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("permute", build_opts.options()));
 
     // Configure kernel window
     Window win = calculate_max_window(*input->info(), Steps());
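
The generic permute kernel now receives the permutation through the -DP1..-DP4 defines, with missing trailing dimensions filled with their identity position; the previously hard-coded [2, 0, 1] case therefore becomes -DP1=2 -DP2=0 -DP3=1 -DP4=3. A small sketch of that option assembly using plain strings rather than the CLBuildOptions API:

    #include <iostream>
    #include <string>
    #include <vector>

    // Build the -DP1..-DP4 defines for a permutation vector, padding missing
    // trailing entries with the identity mapping.
    static std::string permute_defines(const std::vector<unsigned int> &perm)
    {
        std::string opts;
        for(unsigned int i = 0; i < 4; ++i)
        {
            const unsigned int p = (i < perm.size()) ? perm[i] : i;
            opts += "-DP" + std::to_string(i + 1) + "=" + std::to_string(p) + " ";
        }
        return opts;
    }

    int main()
    {
        std::cout << permute_defines({2U, 0U, 1U}) << "\n"; // -DP1=2 -DP2=0 -DP3=1 -DP4=3
        return 0;
    }
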
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
index bd21ea0..7081688 100644
--- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -78,6 +78,7 @@
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
         TensorInfo out_info(TensorInfo(compute_pool_shape(*input, pool_info), 1, output->data_type()));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
     }
diff --git a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
index 63e745e..c76d839 100644
--- a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
@@ -73,8 +73,7 @@
 
     if(output != nullptr && output->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(get_data_layout_dimension_index(input1->data_layout(), DataLayoutDimension::HEIGHT)) != 2);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input1, output);
+        ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != 2);
     }
 
     return Status{};
@@ -87,29 +86,11 @@
     TensorShape output_shape = compute_prior_box_shape(*input1, info);
     auto_init_if_empty(*output, output_shape, 1, input1->data_type());
 
-    Window win{};
-    bool   window_changed = false;
-
-    switch(input1->data_layout())
-    {
-        case DataLayout::NCHW:
-        {
-            const unsigned int num_elems_processed_per_iteration = 4 * num_priors;
-
-            win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-            AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-            window_changed = update_window_and_padding(win, output_access);
-            break;
-        }
-        case DataLayout::NHWC:
-        {
-            win = calculate_max_window(*output, Steps());
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Not implemented");
-    };
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    const unsigned int     num_elems_processed_per_iteration = 4 * num_priors;
+    Window                 win                               = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    bool                   window_changed = update_window_and_padding(win, output_access);
+    Status                 err            = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, win);
 }
 } // namespace
@@ -188,25 +169,8 @@
         }
     }
 
-    unsigned int idx = 0;
-    // Create kernel
-    switch(data_layout)
-    {
-        case DataLayout::NCHW:
-        {
-            idx     = num_arguments_per_2D_tensor();
-            _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("prior_box_layer_nchw", build_opts.options()));
-            break;
-        }
-        case DataLayout::NHWC:
-        {
-            idx     = num_arguments_per_3D_tensor();
-            _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("prior_box_layer_nhwc", build_opts.options()));
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Not implemented");
-    }
+    unsigned int idx = num_arguments_per_2D_tensor();
+    _kernel          = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("prior_box_layer_nchw", build_opts.options()));
 
     _kernel.setArg(idx++, *_min);
     _kernel.setArg(idx++, *_max);
@@ -245,31 +209,11 @@
         queue.enqueueWriteBuffer(*_max, CL_TRUE, 0, _info.max_sizes().size() * sizeof(float), _info.max_sizes().data());
     }
 
-    switch(_input1->info()->data_layout())
-    {
-        case DataLayout::NCHW:
-        {
-            Window slice = window.first_slice_window_2D();
-            slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 2));
+    Window slice = window.first_slice_window_2D();
+    slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 2));
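+    // Note: dimension(1) of the output is 2 (prior box coordinates and their variances), so stepping DimY by 2 covers both rows in one slice.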
 
-            unsigned int idx = 0;
-            add_2D_tensor_argument(idx, _output, slice);
-            enqueue(queue, *this, slice);
-            break;
-        }
-        case DataLayout::NHWC:
-        {
-            Window slice = window.first_slice_window_3D();
-            slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 4 * _num_priors));
-            slice.set(Window::DimZ, Window::Dimension(0, _output->info()->dimension(2), 2));
-
-            unsigned int idx = 0;
-            add_3D_tensor_argument(idx, _output, slice);
-            enqueue(queue, *this, slice);
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Not implemented");
-    }
+    unsigned int idx = 0;
+    add_2D_tensor_argument(idx, _output, slice);
+    enqueue(queue, *this, slice);
 }
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
index 325eeb2..66d2623 100644
--- a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
+++ b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,6 +34,9 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+using namespace arm_compute::misc::shape_calculator;
 
 namespace arm_compute
 {
@@ -47,18 +50,15 @@
     ARM_COMPUTE_RETURN_ERROR_ON(rois->num_dimensions() > 2);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NCHW);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC, DataLayout::NCHW);
     ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
 
     if(output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) || (output->dimension(1) != pool_info.pooled_height()));
-        ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != output->dimension(2));
-        ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(1) != output->dimension(3));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info), output->tensor_shape());
     }
-
     return Status{};
 }
 
@@ -67,8 +67,9 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     // Output auto initialization if not yet initialized
-    TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->dimension(2), rois->dimension(1));
+    const TensorShape output_shape = compute_roi_align_shape(*input, *rois, pool_info);
     auto_init_if_empty((*output), output_shape, 1, input->data_type());
+    output->set_data_layout(input->data_layout());
 
     // Configure kernel window
     const unsigned int num_elems_processed_per_iteration = 1;
@@ -107,12 +108,13 @@
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
     build_opts.add_option("-DDATA_SIZE=" + get_data_size_from_data_type(input->info()->data_type()));
-    build_opts.add_option("-DMAX_DIM_X=" + support::cpp11::to_string(_input->info()->dimension(Window::DimX)));
-    build_opts.add_option("-DMAX_DIM_Y=" + support::cpp11::to_string(_input->info()->dimension(Window::DimY)));
-    build_opts.add_option("-DMAX_DIM_Z=" + support::cpp11::to_string(_input->info()->dimension(Window::DimZ)));
+    build_opts.add_option("-DMAX_DIM_X=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH))));
+    build_opts.add_option("-DMAX_DIM_Y=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT))));
+    build_opts.add_option("-DMAX_DIM_Z=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL))));
     build_opts.add_option("-DPOOLED_DIM_X=" + support::cpp11::to_string(pool_info.pooled_width()));
     build_opts.add_option("-DPOOLED_DIM_Y=" + support::cpp11::to_string(pool_info.pooled_height()));
     build_opts.add_option("-DSPATIAL_SCALE=" + float_to_string_with_full_precision(pool_info.spatial_scale()));
+    build_opts.add_option_if(input->info()->data_layout() == DataLayout::NHWC, "-DNHWC");
     build_opts.add_option_if(pool_info.sampling_ratio() > 0, "-DSAMPLING_RATIO=" + support::cpp11::to_string(pool_info.sampling_ratio()));
 
     // Create kernel
@@ -137,7 +139,7 @@
     Window slice_rois = slice;
     // Parallelize spatially and across the fourth dimension of the output tensor (also across ROITensor)
     slice_rois.set_dimension_step(Window::DimX, _rois->info()->dimension(0));
-    slice.set(Window::DimZ, window[3]);
+    slice.set(get_data_layout_dimension_index(_input->info()->data_layout(), DataLayoutDimension::CHANNEL), window[3]);
 
     // Set arguments
     unsigned int idx = 0;
diff --git a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
index 2367694..df7687e 100644
--- a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -39,29 +39,61 @@
 #include <set>
 #include <string>
 
-using namespace arm_compute;
+namespace arm_compute
+{
+namespace
+{
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    // Output auto initialization if not yet initialized
+    TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->dimension(2), rois->dimension(1));
+    auto_init_if_empty((*output), output_shape, 1, input->data_type());
+
+    // Configure kernel window
+    const unsigned int num_elems_processed_per_iteration = 1;
+    Window             win                               = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal input_access(input, input->valid_region().start(0), num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win, input_access, output_access);
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
 
 CLROIPoolingLayerKernel::CLROIPoolingLayerKernel()
     : _input(nullptr), _rois(nullptr), _output(nullptr), _pool_info(0, 0, 0.f)
 {
 }
 
-void CLROIPoolingLayerKernel::configure(const ICLTensor *input, const ICLROIArray *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIPoolingLayerKernel::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, rois, output);
+
+    // Validate arguments
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input->info(), rois->info(), output->info());
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rois, 1, DataType::U16);
+    ARM_COMPUTE_ERROR_ON(rois->info()->dimension(0) != 5);
+    ARM_COMPUTE_ERROR_ON(rois->info()->num_dimensions() > 2);
     ARM_COMPUTE_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
     ARM_COMPUTE_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
-    ARM_COMPUTE_ERROR_ON(rois->num_values() == 0);
 
-    // Output auto inizialitation if not yet initialized
-    TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), rois->num_values());
-    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
+    if(output->info()->total_size() != 0)
+    {
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) || (output->info()->dimension(1) != pool_info.pooled_height()));
+        ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(2));
+        ARM_COMPUTE_ERROR_ON(rois->info()->dimension(1) != output->info()->dimension(3));
+    }
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) || (output->info()->dimension(1) != pool_info.pooled_height()));
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(2));
-    ARM_COMPUTE_ERROR_ON(rois->num_values() != output->info()->dimension(3));
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), rois->info(), output->info(), pool_info);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
 
     // Set instance variables
     _input     = input;
@@ -89,19 +121,7 @@
     add_argument<cl_uint>(idx, _input->info()->strides_in_bytes()[3]);
     add_argument<cl_uint>(idx, _output->info()->strides_in_bytes()[3]);
 
-    // Configure kernel window
-    const unsigned int num_elems_processed_per_iteration = 1;
-    Window             window                            = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowStatic input_access(input->info(),
-                                    input->info()->valid_region().start(0),
-                                    input->info()->valid_region().start(1),
-                                    input->info()->valid_region().end(0),
-                                    input->info()->valid_region().end(1));
-    AccessWindowStatic output_access(output->info(), 0, 0, pool_info.pooled_width(), pool_info.pooled_height());
-
-    update_window_and_padding(window, input_access, output_access);
-    output_access.set_valid_region(window, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-    ICLKernel::configure_internal(window);
+    ICLKernel::configure_internal(win_config.second);
 }
 
 void CLROIPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue)
@@ -109,14 +129,20 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
 
-    Window slice = window.first_slice_window_3D();
-    // Parallelize spatially and across the fourth dimension of the output tensor (also across ROIArray)
+    Window slice      = window.first_slice_window_3D();
+    Window slice_rois = slice;
+    // Parallelize spatially and across the fourth dimension of the output tensor (also across ROITensor)
+    slice_rois.set_dimension_step(Window::DimX, _rois->info()->dimension(0));
     slice.set(Window::DimZ, window[3]);
 
     // Set arguments
     unsigned int idx = 0;
     add_3D_tensor_argument(idx, _input, slice);
-    add_1D_array_argument<ROI>(idx, _rois, Strides(sizeof(ROI)), 1U, slice);
+    add_2D_tensor_argument(idx, _rois, slice_rois);
     add_3D_tensor_argument(idx, _output, slice);
+    add_argument<cl_uint>(idx, _input->info()->strides_in_bytes()[3]);
+    add_argument<cl_uint>(idx, _output->info()->strides_in_bytes()[3]);
+
     enqueue(queue, *this, slice);
 }
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/CL/kernels/CLRangeKernel.cpp b/src/core/CL/kernels/CLRangeKernel.cpp
new file mode 100644
index 0000000..eb8822b
--- /dev/null
+++ b/src/core/CL/kernels/CLRangeKernel.cpp
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLRangeKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Utils.h"
+
+using namespace arm_compute;
+
+namespace
+{
+unsigned int get_num_elems_processed_per_iteration(const DataType dt)
+{
+    unsigned int num_elems_processed_per_iteration = preferred_vector_width(CLKernelLibrary::get().get_device(), dt);
+    if(num_elems_processed_per_iteration > 8)
+    {
+        num_elems_processed_per_iteration = 8; // The kernel uses at most 8 lanes.
+    }
+    return num_elems_processed_per_iteration;
+}
+
+Status validate_arguments(const ITensorInfo &output, const float start, const float end, const float step)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output,
+                                                         1,
+                                                         DataType::U8, DataType::S8, DataType::QASYMM8,
+                                                         DataType::U16, DataType::S16,
+                                                         DataType::U32, DataType::S32,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&output);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((start < end) && (step <= 0)), "step must be greater than 0 when start < end");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((start > end) && (step >= 0)), "step must be less than 0 when start > end");
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output.data_type(), output.quantization_info()), "start value is outside the range of the data type");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output.data_type(), output.quantization_info()), "end value is outside the range of the data type");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output.data_type(), output.quantization_info()), "step value is outside the range of the data type");
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output.num_dimensions() != 1, "Output has to be a 1-D tensor");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output.tensor_shape().total_size() < num_of_elements_in_range(start, end, step), "Output tensor size is incorrect");
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo &output, const float start, const float end, const float step)
+{
+    unsigned int num_elems_processed_per_iteration = get_num_elems_processed_per_iteration(output.data_type());
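+    // num_of_elements_in_range(start, end, step) is assumed here to return the sequence length, i.e. ceil((end - start) / step).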
+    // Auto initialize output if not initialized
+    auto_init_if_empty(output, TensorShape(num_of_elements_in_range(start, end, step)), 1, output.data_type(), output.quantization_info());
+
+    // Configure kernel window
+    Window win = calculate_max_window(output, Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration);
+    bool                   window_changed = update_window_and_padding(win, output_access);
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), TensorShape(num_of_elements_in_range(start, end, step))));
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+CLRangeKernel::CLRangeKernel()
+    : _start(0), _end(1), _step(1), _output(nullptr)
+{
+}
+
+void CLRangeKernel::configure(ICLTensor *output, const float start, const float end, const float step)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*(output->info()), start, end, step));
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(*(output->info()), start, end, step);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
+    _start  = start;
+    _end    = end;
+    _step   = step;
+    _output = output;
+
+    std::string kernel_name = "range";
+
+    unsigned int num_elems_processed_per_iteration = get_num_elems_processed_per_iteration(output->info()->data_type());
+    // Set build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
+    build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DSTART=" + support::cpp11::to_string(start));
+    build_opts.add_option("-DSTEP=" + support::cpp11::to_string(step));
+    if(is_data_type_quantized_asymmetric(output->info()->data_type()))
+    {
+        build_opts.add_option("-DOFFSET_OUT=" + support::cpp11::to_string(output->info()->quantization_info().offset));
+        build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().scale));
+        kernel_name += "_quantized";
+    }
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+    ICLKernel::configure_internal(win_config.second);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_type(output->info()->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(0));
+}
+
+Status CLRangeKernel::validate(const ITensorInfo *output, const float start, const float end, const float step)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*output, start, end, step));
+    ARM_COMPUTE_RETURN_ON_ERROR((validate_and_configure_window(*(output->clone()), start, end, step)).first);
+
+    return Status{};
+}
+
+void CLRangeKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
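+    // The output is a 1-D tensor, so the whole execution window is bound and enqueued in a single call rather than sliced.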
+    unsigned int idx = 0;
+    add_1D_tensor_argument(idx, _output, window);
+
+    enqueue(queue, *this, window, lws_hint());
+}
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index ef46325..1f4cff3 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,7 @@
 #include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
@@ -45,6 +46,7 @@
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, unsigned int width)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::SUM_SQUARE && input->data_type() == DataType::QASYMM8, "Not supported reduction operation for QASYMM8");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
@@ -53,29 +55,41 @@
 
     if(output->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+        if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8, "Not supported operation for QASYMM8");
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+        }
     }
 
     return Status{};
 }
 
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int axis)
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int axis, ReductionOperation op)
 {
     // Output tensor auto initialization if not yet initialized
     TensorShape output_shape{ input->tensor_shape() };
     output_shape.set(axis, 1);
-    auto_init_if_empty(*output, output_shape, 1, input->data_type());
+    const bool is_arg_min_max   = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX);
+    DataType   output_data_type = is_arg_min_max ? DataType::U32 : input->data_type();
+    auto_init_if_empty(*output, output_shape, 1, output_data_type, input->quantization_info());
 
     const unsigned int num_elems_processed_per_iteration = (is_data_type_quantized(input->data_type()) && (axis == 0)) ? 1 : 16;
     Window             win                               = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
     bool               window_changed                    = false;
+    const bool         is_serial_op                      = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN || is_data_type_quantized(input->data_type()));
 
     switch(axis)
     {
         case 0:
         {
-            if(is_data_type_quantized(input->data_type()))
+            if(is_serial_op)
             {
                 AccessWindowHorizontal input_access(input, 0, input->dimension(0));
                 AccessWindowHorizontal output_access(output, 0, 1);
@@ -136,14 +150,17 @@
     // Set build options
     CLBuildOptions build_opts;
     std::string    data_type_promoted = get_cl_type_from_data_type(input->info()->data_type());
-    if(is_data_type_quantized(input->info()->data_type()) && axis != 0)
+    if(is_data_type_quantized(input->info()->data_type()))
     {
         data_type_promoted = "uint";
     }
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
     build_opts.add_option("-DDATA_TYPE_PROMOTED=" + data_type_promoted);
-    build_opts.add_option_if(op == ReductionOperation::SUM_SQUARE, "-DSUM_SQUARE=");
+    build_opts.add_option_if(op == ReductionOperation::SUM_SQUARE, "-DSUM_SQUARE");
     build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DMEAN");
+    build_opts.add_option_if(op == ReductionOperation::ARG_IDX_MAX, "-DARG_MAX");
+    build_opts.add_option_if(op == ReductionOperation::ARG_IDX_MIN, "-DARG_MIN");
+    build_opts.add_option_if(op == ReductionOperation::PROD, "-DPROD");
 
     switch(op)
     {
@@ -154,6 +171,12 @@
         case ReductionOperation::MEAN_SUM:
             build_opts.add_option(("-DOPERATION=sum"));
             break;
+        case ReductionOperation::ARG_IDX_MAX:
+        case ReductionOperation::ARG_IDX_MIN:
+            break;
+        case ReductionOperation::PROD:
+            build_opts.add_option(("-DOPERATION=product"));
+            break;
         default:
             ARM_COMPUTE_ERROR("Unsupported reduction operation");
     }
@@ -161,11 +184,18 @@
     // Create kernel
     cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange();
     std::string kernel_axis_name;
+    const bool  is_serial_op = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN || is_data_type_quantized(input->info()->data_type()));
     switch(axis)
     {
         case 0:
         {
-            if(!is_data_type_quantized(input->info()->data_type()))
+            if(is_serial_op)
+            {
+                build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
+                build_opts.add_option_if_else(_input->info()->data_type() == DataType::F32, "-DCOND_DATA_TYPE=int", "-DCOND_DATA_TYPE=short");
+                kernel_axis_name = "non_parallel_x";
+            }
+            else
             {
                 build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DWIDTH=" + support::cpp11::to_string(width));
                 const unsigned int width_leftover = input->info()->dimension(0) % border_val;
@@ -178,11 +208,6 @@
                 lws_hint     = cl::NDRange(std::min(8U, num_of_threads));
                 _border_size = BorderSize(0, border_width, 0, 0);
             }
-            else
-            {
-                build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
-                kernel_axis_name = "quantized_x";
-            }
         }
         break;
         case 1:
@@ -204,7 +229,7 @@
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reduction_operation_" + kernel_axis_name, build_opts.options()));
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis);
+    auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis, op);
 
     ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
 
@@ -214,7 +239,7 @@
 Status CLReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, unsigned int width)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op, width));
-    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), axis)));
+    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), axis, op)));
 
     return Status{};
 }
@@ -224,39 +249,13 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
 
+    const bool is_serial_op = (_op == ReductionOperation::ARG_IDX_MAX || _op == ReductionOperation::ARG_IDX_MIN || is_data_type_quantized(_input->info()->data_type()));
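+    // For the X axis, arg-min/arg-max and quantized reductions use the serial (non_parallel_x) path; other reductions use the parallel 2-D path below.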
     switch(_reduction_axis)
     {
         case 0:
         {
             // We use parallel reduction only in non quantized types
-            if(!is_data_type_quantized(_input->info()->data_type()))
-            {
-                // Set out window
-                Window out_window(window);
-                out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
-
-                // Get first input and output slices
-                Window in_slice  = window.first_slice_window_2D();
-                Window out_slice = out_window.first_slice_window_2D();
-
-                // Reshape window
-                const unsigned int border_width = ((in_slice.x().end() % border_val) != 0) ? border_val - in_slice.x().end() % border_val : 0;
-                in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), in_slice.x().end() + border_width, in_slice.x().step()));
-
-                // Set local sums buffer
-                unsigned int local_sum_size = lws_hint()[0] * _input->info()->element_size();
-                _kernel.setArg(num_arguments_per_2D_tensor() * 2, local_sum_size, nullptr);
-
-                do
-                {
-                    unsigned int idx = 0;
-                    add_2D_tensor_argument(idx, _input, in_slice);
-                    add_2D_tensor_argument(idx, _output, out_slice);
-                    enqueue(queue, *this, in_slice, lws_hint());
-                }
-                while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
-            }
-            else
+            if(is_serial_op)
             {
                 // Get first input and output slices
                 Window window_in{ window };
@@ -274,6 +273,33 @@
                 }
                 while(window_in.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(out_slice));
             }
+            else
+            {
+                // Set out window
+                Window out_window(window);
+                out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+                // Get first input and output slices
+                Window in_slice  = window.first_slice_window_2D();
+                Window out_slice = out_window.first_slice_window_2D();
+
+                // Reshape window
+                const unsigned int border_width = ((in_slice.x().end() % border_val) != 0) ? border_val - in_slice.x().end() % border_val : 0;
+                in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), in_slice.x().end() + border_width, in_slice.x().step()));
+
+                // Set local sums buffer
+                unsigned int local_res_size = lws_hint()[0] * _input->info()->element_size();
+                _kernel.setArg(num_arguments_per_2D_tensor() * 2, local_res_size, nullptr);
+
+                do
+                {
+                    unsigned int idx = 0;
+                    add_2D_tensor_argument(idx, _input, in_slice);
+                    add_2D_tensor_argument(idx, _output, out_slice);
+                    enqueue(queue, *this, in_slice, lws_hint());
+                }
+                while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+            }
         }
         break;
         case 1:
diff --git a/src/core/CL/kernels/CLReverseKernel.cpp b/src/core/CL/kernels/CLReverseKernel.cpp
new file mode 100644
index 0000000..84bf5bf
--- /dev/null
+++ b/src/core/CL/kernels/CLReverseKernel.cpp
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLReverseKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, axis);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
+                                                         DataType::U16, DataType::S16,
+                                                         DataType::U32, DataType::S32,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(axis, 1, DataType::U32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->num_dimensions() > 1, "Axis must be a 1D tensor");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->dimension(0) > 4, "Only up to 4 dimensions can be reversed");
+
+    // Checks performed when output is configured
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+    }
+
+    return Status{};
+}
+} // namespace
+
+CLReverseKernel::CLReverseKernel()
+    : _input(nullptr), _output(nullptr), _axis(nullptr)
+{
+}
+
+void CLReverseKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, axis);
+
+    _input  = input;
+    _output = output;
+    _axis   = axis;
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), *input->info()->clone());
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis->info()));
+
+    // Set kernel build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DNUM_REVERSE_DIMS=" + support::cpp11::to_string(axis->info()->dimension(0)));
+    switch(input->info()->element_size())
+    {
+        case 1:
+            build_opts.add_option("-DDATA_TYPE=uchar");
+            break;
+        case 2:
+            build_opts.add_option("-DDATA_TYPE=ushort");
+            break;
+        case 4:
+            build_opts.add_option("-DDATA_TYPE=uint");
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Data type not supported");
+    }
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reverse", build_opts.options()));
+
+    // Set static kernel arguments
+    unsigned int idx = 2 * num_arguments_per_4D_tensor() + num_arguments_per_1D_tensor();
+    add_argument<cl_uint>(idx, input->info()->dimension(0));
+    add_argument<cl_uint>(idx, input->info()->dimension(1));
+    add_argument<cl_uint>(idx, input->info()->dimension(2));
+    add_argument<cl_uint>(idx, input->info()->dimension(3));
+
+    // Configure kernel window
+    Window win = calculate_max_window(*output->info(), Steps());
+    ICLKernel::configure_internal(win);
+
+    // Set config_id for enabling LWS tuning
+    _config_id += "reverse_";
+    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(2));
+}
+
+Status CLReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis));
+    return Status{};
+}
+
+void CLReverseKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window collapsed  = window.collapse(ICLKernel::window(), Window::DimZ);
+    Window slice      = collapsed.first_slice_window_4D();
+    Window axis_slice = collapsed.first_slice_window_1D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_4D_tensor_argument(idx, _input, slice);
+        add_1D_tensor_argument(idx, _axis, axis_slice);
+        add_4D_tensor_argument(idx, _output, slice);
+        enqueue(queue, *this, slice, lws_hint());
+    }
+    while(collapsed.slide_window_slice_4D(slice));
+}
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp
index ce6c016..cd89d1c 100644
--- a/src/core/CL/kernels/CLScaleKernel.cpp
+++ b/src/core/CL/kernels/CLScaleKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -65,6 +65,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON(output == input);
 
     float wr = 0.f;
@@ -94,14 +95,11 @@
 
             num_elems_processed_per_iteration = 4;
             // Configure kernel window
-            win                                   = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-            const ValidRegion &input_valid_region = input->valid_region();
-
-            // Reads can occur within the valid region of the input
+            win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
             AccessWindowStatic input_access(input,
-                                            input_valid_region.anchor[0] - border.left, input_valid_region.anchor[1] - border.top,
-                                            input_valid_region.anchor[0] + input_valid_region.shape[0] + border.right,
-                                            input_valid_region.anchor[1] + input_valid_region.shape[1] + border.bottom);
+                                            -border.left, -border.top,
+                                            input->dimension(0) + border.right,
+                                            input->dimension(1) + border.bottom);
             AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
 
             output_access.set_valid_region(win, calculate_valid_region_scale(*(input),
@@ -118,7 +116,9 @@
             num_elems_processed_per_iteration = 1;
             // Configure kernel window
             win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-            AccessWindowRectangle  input_access(input, -border.left, -border.top, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
+            AccessWindowStatic input_access(input, -border.left, -border.top,
+                                            input->dimension(0) + border.right,
+                                            input->dimension(1) + border.bottom);
             AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
             window_changed = update_window_and_padding(win, input_access, output_access);
             output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
@@ -175,6 +175,7 @@
     DataLayout data_layout = input->info()->data_layout();
     const int  idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
     const int  idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const bool is_nhwc     = data_layout == DataLayout::NHWC;
 
     // Compute the ratio between source width/height and destination width/height
     const unsigned int input_width   = input->info()->dimension(idx_width);
@@ -201,6 +202,7 @@
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
     build_opts.add_option("-DBORDER_SIZE=" + support::cpp11::to_string(border.right));
     build_opts.add_option_if(border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE");
+    build_opts.add_option_if(is_nhwc, "-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
     build_opts.add_option_if_else(sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT");
     if(call_quantized_kernel)
     {
@@ -215,7 +217,7 @@
     kernel_name += lower_string(string_from_data_layout(data_layout));
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
-    unsigned int idx = data_layout == DataLayout::NHWC ? 2 * num_arguments_per_3D_tensor() : 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
+    unsigned int idx = is_nhwc ? 2 * num_arguments_per_4D_tensor() : 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
 
     // Set static kernel arguments
     const float scale_x = static_cast<float>(input_width) / output_width;
@@ -225,6 +227,20 @@
     _kernel.setArg<float>(idx++, input_height);
     _kernel.setArg<float>(idx++, scale_x);
     _kernel.setArg<float>(idx++, scale_y);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = "scale_";
+    _config_id += (border_mode == BorderMode::REPLICATE ? "Bord_rep" : "");
+    _config_id += (sampling_policy == SamplingPolicy::CENTER ? "center" : "topleft");
+    _config_id += (is_nhwc ? "nhwc" : "nchw");
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(3));
 }
 
 void CLScaleKernel::run(const Window &window, cl::CommandQueue &queue)
@@ -250,16 +266,13 @@
         }
         case DataLayout::NHWC:
         {
-            Window slice = window.first_slice_window_3D();
+            Window collapsed = window.collapse(ICLKernel::window(), Window::DimZ);
+            Window slice     = collapsed.first_slice_window_4D();
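+            // With 4-D tensor arguments and DEPTH_OUT fixed at build time, the collapsed window is enqueued in a single call.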
 
-            do
-            {
-                unsigned int idx = 0;
-                add_3D_tensor_argument(idx, _input, slice);
-                add_3D_tensor_argument(idx, _output, slice);
-                enqueue(queue, *this, slice, lws_hint());
-            }
-            while(window.slide_window_slice_3D(slice));
+            unsigned int idx = 0;
+            add_4D_tensor_argument(idx, _input, slice);
+            add_4D_tensor_argument(idx, _output, slice);
+            enqueue(queue, *this, slice, lws_hint());
             break;
         }
         default:
diff --git a/src/core/CL/kernels/CLSelectKernel.cpp b/src/core/CL/kernels/CLSelectKernel.cpp
new file mode 100644
index 0000000..c9e5da0
--- /dev/null
+++ b/src/core/CL/kernels/CLSelectKernel.cpp
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLSelectKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(x);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(x,
+                                                         1,
+                                                         DataType::U8, DataType::S8, DataType::QASYMM8,
+                                                         DataType::U16, DataType::S16,
+                                                         DataType::U32, DataType::S32,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(x, y);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(x, y);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(c, 1, DataType::U8);
+
+    const bool is_same_rank = (c->tensor_shape().num_dimensions() == x->tensor_shape().num_dimensions());
+    ARM_COMPUTE_RETURN_ERROR_ON(is_same_rank && (x->tensor_shape() != c->tensor_shape()));
+    ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank && ((c->tensor_shape().num_dimensions() > 1) || (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1])));
+
+    if(output != nullptr && output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(x, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(x, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *c, ITensorInfo *x, ITensorInfo *y, ITensorInfo *output)
+{
+    if(output != nullptr)
+    {
+        // Output tensor auto initialization if not yet initialized
+        auto_init_if_empty(*output, *x->clone());
+    }
+
+    const bool is_same_rank = (c->tensor_shape().num_dimensions() == x->tensor_shape().num_dimensions());
+
+    const unsigned int num_elems_processed_per_iteration = 16 / x->element_size();
+
+    // Configure kernel window
+    Window                 win = calculate_max_window(*x, Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal x_access(x, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal y_access(y, 0, num_elems_processed_per_iteration);
+    bool                   window_changed = update_window_and_padding(win, x_access, y_access);
+
+    // Update window for condition
+    if(is_same_rank)
+    {
+        AccessWindowHorizontal c_access(c, 0, num_elems_processed_per_iteration);
+        window_changed = window_changed || update_window_and_padding(win, c_access);
+    }
+
+    // Update window for output
+    if(output != nullptr)
+    {
+        AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+        window_changed = window_changed || update_window_and_padding(win, output_access);
+        output_access.set_valid_region(win, x->valid_region());
+    }
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+CLSelectKernel::CLSelectKernel()
+    : _c(nullptr), _x(nullptr), _y(nullptr), _output(nullptr), _has_same_rank(false)
+{
+}
+void CLSelectKernel::configure(const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(c, x, y, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(c->info(), x->info(), y->info(), output->info()));
+
+    _c             = c;
+    _x             = x;
+    _y             = y;
+    _output        = output;
+    _has_same_rank = (c->info()->tensor_shape().num_dimensions() == x->info()->tensor_shape().num_dimensions());
+
+    const unsigned int num_elems_processed_per_iteration = 16 / x->info()->element_size();
+
+    // Set build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(x->info()->data_type()));
+    build_opts.add_option("-DSELECT_DATA_TYPE=" + get_cl_select_type_from_data_type(x->info()->data_type()));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+
+    // Create kernel
+    std::string kernel_name = "select";
+    if(_has_same_rank)
+    {
+        kernel_name += "_same_rank";
+    }
+    else
+    {
+        const bool is_input_rank_greater_than_two = x->info()->tensor_shape().num_dimensions() > 2;
+        if(is_input_rank_greater_than_two)
+        {
+            const size_t width      = x->info()->tensor_shape().x();
+            const size_t height     = x->info()->tensor_shape().y();
+            const size_t outer_size = x->info()->tensor_shape()[x->info()->tensor_shape().num_dimensions() - 1];
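+            // depth_size collapses all dimensions between the WxH plane and the outermost dimension.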
+            const size_t depth_size = x->info()->tensor_shape().total_size() / (width * height * outer_size);
+            build_opts.add_option("-DDEPTH_SIZE=" + support::cpp11::to_string(depth_size));
+        }
+        kernel_name += "_different_rank";
+        kernel_name += is_input_rank_greater_than_two ? "_n" : "_2";
+    }
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(c->info(), x->info(), y->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure_internal(win_config.second);
+
+    _config_id = "select_";
+    _config_id += string_from_data_type(x->info()->data_type());
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(x->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(x->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(x->info()->dimension(2));
+}
+
+Status CLSelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(c, x, y, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(c->clone().get(), x->clone().get(), y->clone().get(), output->clone().get()).first);
+    return Status{};
+}
+
+void CLSelectKernel::run(const arm_compute::Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
+
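+    // When the condition tensor's rank differs from x/y, it is a 1-D vector and is bound once outside the slice loop.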
+    if(!_has_same_rank)
+    {
+        Window vector_slice = window.first_slice_window_1D();
+        vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
+        unsigned int idx = 0;
+        add_1D_tensor_argument(idx, _c, vector_slice);
+    }
+
+    do
+    {
+        unsigned int idx = _has_same_rank ? 0 : num_arguments_per_1D_tensor();
+        if(_has_same_rank)
+        {
+            add_3D_tensor_argument(idx, _c, slice);
+        }
+        add_3D_tensor_argument(idx, _x, slice);
+        add_3D_tensor_argument(idx, _y, slice);
+        add_3D_tensor_argument(idx, _output, slice);
+
+        enqueue(queue, *this, slice, lws_hint());
+    }
+    while(collapsed.slide_window_slice_3D(slice));
+}
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
index d488631..f039198 100644
--- a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
@@ -39,10 +39,16 @@
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_info, padddings, output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_info, 1, DataType::S32);
     ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+    ARM_COMPUTE_RETURN_ERROR_ON(block_info->num_dimensions() > 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(padddings->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(padddings->tensor_shape()[1] != block_info->tensor_shape()[0]);
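+    // One (before, after) padding pair is expected per block dimension, so the paddings tensor's second dimension must match the block shape length.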
 
     // Validate output if initialized
     if(output->total_size() != 0)
     {
+        const DataLayout data_layout = input->data_layout();
+        const int        idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+        ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] != output->tensor_shape()[idx_channel]);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
 
@@ -64,8 +70,8 @@
         const int        idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
         const int        idx_batch   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
         ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] < padding_left.x() + padding_right.y());
-        ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_width] / block_shape_x != (output->tensor_shape()[idx_width] - padding_left.x() - padding_right.y()));
-        ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_height] / block_shape_y != (output->tensor_shape()[idx_height] - padding_left.x() - padding_right.y()));
+        ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_width] + padding_left.x() + padding_right.x()) % block_shape_x != 0);
+        ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] + padding_left.y() + padding_right.y()) % block_shape_y != 0);
         ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] != output->tensor_shape()[idx_channel]);
         ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_batch] % (block_shape_x * block_shape_y) != 0);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -101,6 +107,9 @@
     build_opts.add_option("-DWIDTH_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_width)));
     build_opts.add_option("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_height)));
     build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_batch)));
+    build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
+    build_opts.add_option("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(idx_height)));
+    build_opts.add_option("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_batch)));
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("space_to_batch_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()));
 
     // Configure kernel window
@@ -132,6 +141,9 @@
     build_opts.add_option("-DWIDTH_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_width)));
     build_opts.add_option("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_height)));
     build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_batch)));
+    build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
+    build_opts.add_option("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(idx_height)));
+    build_opts.add_option("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_batch)));
     build_opts.add_option("-DBLOCK_SHAPE_X=" + support::cpp11::to_string(block_shape_x));
     build_opts.add_option("-DBLOCK_SHAPE_Y=" + support::cpp11::to_string(block_shape_y));
     build_opts.add_option("-DPAD_LEFT_X=" + support::cpp11::to_string(padding_left.x()));
diff --git a/src/core/CL/kernels/CLStackLayerKernel.cpp b/src/core/CL/kernels/CLStackLayerKernel.cpp
new file mode 100644
index 0000000..ccbe1fc
--- /dev/null
+++ b/src/core/CL/kernels/CLStackLayerKernel.cpp
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLStackLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S8,
+                                                         DataType::U16, DataType::S16, DataType::U32, DataType::S32,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(idx_input >= num_tensors);
+    ARM_COMPUTE_RETURN_ERROR_ON(axis > input->num_dimensions());
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_stack_shape(*input, axis, num_tensors));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output)
+{
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_stack_shape(*input, axis, num_tensors)));
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input);
+
+    return std::make_pair(Status{}, win);
+}
+} // namespace
+
+CLStackLayerKernel::CLStackLayerKernel()
+    : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLStackLayerKernel::configure(const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, idx_input, num_tensors, output->info()));
+
+    _input  = input;
+    _output = output;
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), axis, num_tensors, output->info());
+
+    // Add build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DAXIS=" + support::cpp11::to_string(axis));
+    build_opts.add_option("-DSRC_DIM2=" + support::cpp11::to_string(input->info()->dimension(2)));
+    build_opts.add_option("-DDST_DIM3=" + support::cpp11::to_string(output->info()->dimension(3)));
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("stack_layer", build_opts.options()));
+
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure_internal(win_config.second);
+
+    const unsigned int idx = 2 * num_arguments_per_4D_tensor();
+    _kernel.setArg<cl_uint>(idx, idx_input);
+}
+
+Status CLStackLayerKernel::validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, idx_input, num_tensors, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first);
+    return Status{};
+}
+
+void CLStackLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window window_out;
+    window_out.use_tensor_dimensions(_output->info()->tensor_shape());
+
+    Window collapsed = window.collapse(ICLKernel::window(), Window::DimZ);
+
+    Window slice_in  = collapsed.first_slice_window_4D();
+    Window slice_out = window_out.first_slice_window_4D();
+
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, slice_in);
+    add_4D_tensor_argument(idx, _output, slice_out);
+    enqueue(queue, *this, slice_in);
+}
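The output of the new stack kernel is auto-initialised from compute_stack_shape(*input, axis, num_tensors). Assuming the usual stack semantics (a new dimension of size num_tensors inserted at position axis), a plain-C++ sketch of that shape calculation would look like the following; stack_shape is a hypothetical stand-in, not the library helper:

    #include <cstddef>
    #include <vector>

    // Hypothetical stand-in for misc::shape_calculator::compute_stack_shape():
    // insert a new dimension of size num_tensors at position axis.
    std::vector<std::size_t> stack_shape(const std::vector<std::size_t> &in_shape,
                                         unsigned int axis, unsigned int num_tensors)
    {
        std::vector<std::size_t> out = in_shape;
        out.insert(out.begin() + axis, num_tensors);
        return out;
    }

    // Example: stacking 3 tensors of shape {4, 5} along axis 0 yields shape {3, 4, 5}.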
diff --git a/src/core/CL/kernels/CLStridedSliceKernel.cpp b/src/core/CL/kernels/CLStridedSliceKernel.cpp
index 2d2ba10..c40f3c9 100644
--- a/src/core/CL/kernels/CLStridedSliceKernel.cpp
+++ b/src/core/CL/kernels/CLStridedSliceKernel.cpp
@@ -32,6 +32,7 @@
 #include "arm_compute/core/Window.h"
 
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/helpers/bit_ops.h"
 #include "arm_compute/core/utils/helpers/tensor_transform.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
@@ -69,7 +70,8 @@
     // Checks output if configured
     if(output->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape() != exp_output_shape);
+        const TensorInfo exp_output_info = output->clone()->set_tensor_shape(exp_output_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &exp_output_info);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
 
@@ -113,9 +115,11 @@
 
     const TensorShape &input_shape = input->info()->tensor_shape();
 
-    const Coordinates final_strides = arm_compute::helpers::tensor_transform::strided_slice_strides(input_shape, strides);
-    const Coordinates starts_abs    = arm_compute::helpers::tensor_transform::strided_slice_absolute_start_coords(input_shape, starts, final_strides, begin_mask);
-    const Coordinates ends_abs      = arm_compute::helpers::tensor_transform::strided_slice_absolute_end_coords(input_shape, starts_abs, ends, final_strides, end_mask, shrink_axis_mask);
+    Coordinates starts_abs, ends_abs, final_strides;
+    std::tie(starts_abs, ends_abs, final_strides) = arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(
+                                                        input_shape,
+                                                        starts, ends, strides,
+                                                        begin_mask, end_mask, shrink_axis_mask);
 
     // Configure kernel window
     auto win_config = validate_and_configure_window(input->info(), output->info(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
@@ -124,7 +128,8 @@
     // Enable multiple elements processing along x if stride_x is 1 and output width greater than the access vector size
     const int  vec_size_x     = 16 / input->info()->element_size();
     const int  output_width_x = output->info()->tensor_shape().x();
-    const bool multi_access_x = (final_strides.x() == 1) && (output_width_x / vec_size_x > 0);
+    const bool is_shrink_on_x = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, 0);
+    const bool multi_access_x = !is_shrink_on_x && (final_strides.x() == 1) && (output_width_x / vec_size_x > 0);
 
     // Update window if needed
     if(multi_access_x)
@@ -140,8 +145,10 @@
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
     for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
     {
+        const bool is_shrink = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, i);
         build_opts.add_option("-DSTART_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(starts_abs[i]));
         build_opts.add_option("-DSTRIDE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(final_strides[i]));
+        build_opts.add_option_if(is_shrink, "-DSHRINK_" + support::cpp11::to_string(i));
     }
     build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
     build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
diff --git a/src/core/CL/kernels/CLTileKernel.cpp b/src/core/CL/kernels/CLTileKernel.cpp
new file mode 100644
index 0000000..7559e7a
--- /dev/null
+++ b/src/core/CL/kernels/CLTileKernel.cpp
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLTileKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Multiples &multiples)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(multiples.size() > 4);
+    ARM_COMPUTE_RETURN_ERROR_ON(multiples.empty());
+    ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e)
+    {
+        return e == 0;
+    }));
+
+    // Validate output if initialized
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+} // namespace
+
+CLTileKernel::CLTileKernel()
+    : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLTileKernel::configure(const ICLTensor *input, ICLTensor *output, const Multiples &multiples)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    // Auto initialize output
+    TensorShape tiled_shape = misc::shape_calculator::compute_tiled_shape(input->info()->tensor_shape(), multiples);
+    auto_init_if_empty(*output->info(), tiled_shape, 1, input->info()->data_type());
+
+    // Validate
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), multiples));
+
+    _input  = input;
+    _output = output;
+
+    const DataType     data_type      = input->info()->data_type();
+    const int          vec_size_x     = 16 / input->info()->element_size();
+    const int          input_width_x  = input->info()->tensor_shape().x();
+    const unsigned int offset         = ceil_to_multiple(input_width_x, vec_size_x) - input_width_x;
+    const bool         multi_access_x = (input_width_x / vec_size_x > 0);
+
+    // Create kernel
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+    build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width_x));
+    build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
+    build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
+    build_opts.add_option("-DSRC_BATCHES=" + support::cpp11::to_string(input->info()->dimension(3)));
+    build_opts.add_option("-DDST_DEPTH=" + support::cpp11::to_string(output->info()->dimension(2)));
+    build_opts.add_option_if(multi_access_x, "-DOFFSET=" + support::cpp11::to_string(offset));
+    build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("tile", build_opts.options()));
+
+    // Configure window without padding
+    Window win = calculate_max_window(*output->info());
+
+    if(multi_access_x)
+    {
+        // If multi-access is enabled, no thread should cross a tile boundary. This means we need
+        // enough threads to cover a single tile, multiplied by multiples[0]. Note that if threads
+        // do not cross the tile boundaries, they cannot cross the boundary of the last tile either,
+        // so the output does not need to be padded
+        const unsigned int size_win_x = ceil_to_multiple(input->info()->dimension(0), vec_size_x) * multiples[0];
+        win.set(Window::DimX,
+                Window::Dimension(win.x().start(), size_win_x, vec_size_x));
+    }
+
+    ICLKernel::configure_internal(win);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = "tile";
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+    for(unsigned int i = 0; i < multiples.size(); ++i)
+    {
+        _config_id += "_";
+        _config_id += support::cpp11::to_string(input->info()->dimension(i));
+        _config_id += "_";
+        _config_id += support::cpp11::to_string(multiples[i]);
+    }
+}
+
+Status CLTileKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Multiples &multiples)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, multiples));
+    return Status{};
+}
+
+void CLTileKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    Window slice     = collapsed.first_slice_window_4D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_4D_tensor_argument(idx, _input, slice);
+        add_4D_tensor_argument(idx, _output, slice);
+        enqueue(queue, *this, slice);
+    }
+    while(collapsed.slide_window_slice_4D(slice));
+}
+} // namespace arm_compute
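The window sizing in CLTileKernel::configure() guarantees that, with multi-access enabled, no work-item crosses a tile boundary: the x dimension is padded up to a multiple of the vector size and then repeated multiples[0] times. A worked example of that arithmetic, assuming F32 data so that vec_size_x = 16 / 4 = 4 (values chosen purely for illustration):

    // Worked example of the multi-access window sizing above (illustrative values).
    const unsigned int input_width_x = 10;  // input->info()->dimension(0)
    const unsigned int vec_size_x    = 4;   // 16 / element_size for F32
    const unsigned int multiple_x    = 3;   // multiples[0]

    // ceil_to_multiple(10, 4) == 12
    const unsigned int padded_width = ((input_width_x + vec_size_x - 1) / vec_size_x) * vec_size_x;
    const unsigned int offset       = padded_width - input_width_x;  // 2, passed as -DOFFSET
    const unsigned int size_win_x   = padded_width * multiple_x;     // 36, window extent along x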
diff --git a/src/core/CL/kernels/CLTransposeKernel.cpp b/src/core/CL/kernels/CLTransposeKernel.cpp
index ccf22ea..6c237a8 100644
--- a/src/core/CL/kernels/CLTransposeKernel.cpp
+++ b/src/core/CL/kernels/CLTransposeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -69,6 +69,7 @@
 
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
     }
 
     return Status{};
diff --git a/src/core/CL/kernels/CLUpsampleLayerKernel.cpp b/src/core/CL/kernels/CLUpsampleLayerKernel.cpp
index ee3fa11..ce5ed86 100644
--- a/src/core/CL/kernels/CLUpsampleLayerKernel.cpp
+++ b/src/core/CL/kernels/CLUpsampleLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,7 @@
 #include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
@@ -49,12 +50,15 @@
     const int  idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
     const int  idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
 
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_width) != info.x() * input->dimension(idx_width));
     ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_height) != info.y() * input->dimension(idx_height));
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.x() != 2 || info.y() != 2, "Only stride 2 is supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(upsampling_policy != InterpolationPolicy::NEAREST_NEIGHBOR, "Only nearest neighbor policy supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
 
     return Status{};
 }
diff --git a/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp
index b0d27cb..d58cef5 100644
--- a/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp
+++ b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -49,9 +49,11 @@
 {
     // The window needs to be based on the output
     Window             win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-    AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration) + num_elems_processed_per_iteration, input1->dimension(1));
-    AccessWindowStatic input2_access(input2, -num_elems_processed_per_iteration, 0, ceil_to_multiple(input2->dimension(0), num_elems_processed_per_iteration) + num_elems_processed_per_iteration,
-                                     input2->dimension(1));
+    AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration), input1->dimension(1));
+    const unsigned int input2_right_padding = (output->dimension(0) / num_elems_processed_per_iteration) * num_elems_processed_per_iteration - input1->dimension(0)
+                                              + num_elems_processed_per_iteration - input2->dimension(0);
+    AccessWindowStatic input2_access(input2, -(input1->dimension(0) % num_elems_processed_per_iteration),
+                                     0, input2->dimension(0) + input2_right_padding, input2->dimension(1));
     AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
     bool                   window_changed = update_window_and_padding(win, input1_access, input2_access, output_access);
 
@@ -109,6 +111,16 @@
     build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(input1->info()->dimension(0)));
     build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input1->info()->element_size()));
 
+    if(is_data_type_quantized_asymmetric(input1->info()->data_type()) && input1->info()->quantization_info() != output->info()->quantization_info())
+    {
+        build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(input1->info()->quantization_info().offset));
+        build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().offset));
+        build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(input1->info()->quantization_info().scale));
+        build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().scale));
+        build_opts.add_option("-DOFFSET_IN2=" + float_to_string_with_full_precision(input2->info()->quantization_info().offset));
+        build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(input2->info()->quantization_info().scale));
+    }
+
     // Create kernel
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_width_x2", build_opts.options()));
 
@@ -118,6 +130,14 @@
 
     ICLKernel::configure_internal(std::get<1>(win_config));
 
+    // Pass paddings as arguments to the kernel
+    const unsigned int input1_width         = input1->info()->dimension(0);
+    const unsigned int input1_right_padding = ceil_to_multiple(input1_width, num_elems_processed_per_iteration) - input1_width;
+    const unsigned int input2_left_padding  = input1_width % num_elems_processed_per_iteration;
+    unsigned int       idx0                 = 3 * num_arguments_per_4D_tensor();
+    _kernel.setArg<cl_uint>(idx0++, input1_right_padding);
+    _kernel.setArg<cl_uint>(idx0++, input2_left_padding);
+
     // Set config_id for enabling LWS tuning
     _config_id = "concatenate_width_x2_";
     _config_id += lower_string(string_from_data_type(input1->info()->data_type()));
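The two kernel arguments added above tell the concatenate_width_x2 kernel how far input1 extends past its logical width once rounded up to the processing step, and how far input2's data is offset from a step boundary. A worked example, assuming a step (num_elems_processed_per_iteration) of 8 elements; the actual constant is defined earlier in the file:

    // Illustration of the padding arguments set in configure(); step value assumed.
    const unsigned int step         = 8;   // assumed num_elems_processed_per_iteration
    const unsigned int input1_width = 10;

    // ceil_to_multiple(10, 8) - 10 == 6
    const unsigned int input1_right_padding = ((input1_width + step - 1) / step) * step - input1_width;
    // 10 % 8 == 2
    const unsigned int input2_left_padding  = input1_width % step;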
diff --git a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp
index 75aef9c..9cbb713 100644
--- a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp
+++ b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -47,15 +47,29 @@
 
 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *input3, ITensorInfo *input4, ITensorInfo *output)
 {
+    const unsigned int input1_width = input1->dimension(0);
+    const unsigned int input2_width = input2->dimension(0);
+    const unsigned int input3_width = input3->dimension(0);
+    const unsigned int input4_width = input4->dimension(0);
+
     // The window needs to be based on the output
     Window             win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-    AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration) + num_elems_processed_per_iteration, input1->dimension(1));
-    AccessWindowStatic input2_access(input2, -num_elems_processed_per_iteration, 0, ceil_to_multiple(input2->dimension(0), num_elems_processed_per_iteration) + num_elems_processed_per_iteration,
-                                     input2->dimension(1));
-    AccessWindowStatic input3_access(input3, -num_elems_processed_per_iteration, 0, ceil_to_multiple(input3->dimension(0), num_elems_processed_per_iteration) + num_elems_processed_per_iteration,
-                                     input3->dimension(1));
-    AccessWindowStatic input4_access(input4, -num_elems_processed_per_iteration, 0, ceil_to_multiple(input4->dimension(0), num_elems_processed_per_iteration) + num_elems_processed_per_iteration,
-                                     input4->dimension(1));
+    AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1_width, num_elems_processed_per_iteration), input1->dimension(1));
+
+    const unsigned int input2_left_padding  = input1_width % num_elems_processed_per_iteration;
+    const unsigned int input2_right_padding = ((input1_width + input2_width) / num_elems_processed_per_iteration) * num_elems_processed_per_iteration - input1_width + num_elems_processed_per_iteration -
+                                              input2_width;
+    AccessWindowStatic input2_access(input2, -input2_left_padding, 0, input2_width + input2_right_padding, input2->dimension(1));
+
+    const unsigned int input3_left_padding  = (input1_width + input2_width) % num_elems_processed_per_iteration;
+    const unsigned int input3_right_padding = ((input1_width + input2_width + input3_width) / num_elems_processed_per_iteration) * num_elems_processed_per_iteration - input1_width - input2_width +
+                                              num_elems_processed_per_iteration - input3_width;
+    AccessWindowStatic input3_access(input3, -input3_left_padding, 0, input3_width + input3_right_padding, input3->dimension(1));
+
+    const unsigned int input4_left_padding  = (input1_width + input2_width + input3_width) % num_elems_processed_per_iteration;
+    const unsigned int input4_right_padding = (output->dimension(0) / num_elems_processed_per_iteration) * num_elems_processed_per_iteration + num_elems_processed_per_iteration - output->dimension(0);
+    AccessWindowStatic input4_access(input4, -input4_left_padding, 0, input4_width + input4_right_padding, input4->dimension(1));
+
     AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
     bool                   window_changed = update_window_and_padding(win, input1_access, input2_access, input3_access, input4_access, output_access);
 
@@ -119,6 +133,20 @@
     build_opts.add_option("-DINPUT3_WIDTH=" + support::cpp11::to_string(input3->info()->dimension(0)));
     build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input1->info()->element_size()));
 
+    if(is_data_type_quantized_asymmetric(input1->info()->data_type()) && input1->info()->quantization_info() != output->info()->quantization_info())
+    {
+        build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(input1->info()->quantization_info().offset));
+        build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().offset));
+        build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(input1->info()->quantization_info().scale));
+        build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().scale));
+        build_opts.add_option("-DOFFSET_IN2=" + float_to_string_with_full_precision(input2->info()->quantization_info().offset));
+        build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(input2->info()->quantization_info().scale));
+        build_opts.add_option("-DOFFSET_IN3=" + float_to_string_with_full_precision(input3->info()->quantization_info().offset));
+        build_opts.add_option("-DSCALE_IN3=" + float_to_string_with_full_precision(input3->info()->quantization_info().scale));
+        build_opts.add_option("-DOFFSET_IN4=" + float_to_string_with_full_precision(input4->info()->quantization_info().offset));
+        build_opts.add_option("-DSCALE_IN4=" + float_to_string_with_full_precision(input4->info()->quantization_info().scale));
+    }
+
     // Create kernel
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_width_x4", build_opts.options()));
 
@@ -128,6 +156,27 @@
 
     ICLKernel::configure_internal(std::get<1>(win_config));
 
+    // Pass paddings as arguments to the kernel
+    const unsigned int input1_width = input1->info()->dimension(0);
+    const unsigned int input2_width = input2->info()->dimension(0);
+    const unsigned int input3_width = input3->info()->dimension(0);
+
+    const unsigned int input1_right_padding = ceil_to_multiple(input1_width, num_elems_processed_per_iteration) - input1_width;
+    const unsigned int input2_left_padding  = input1_width % num_elems_processed_per_iteration;
+    const unsigned int input2_right_padding = ((input1_width + input2_width) / num_elems_processed_per_iteration) * num_elems_processed_per_iteration - input1_width + num_elems_processed_per_iteration -
+                                              input2_width;
+    const unsigned int input3_left_padding  = (input1_width + input2_width) % num_elems_processed_per_iteration;
+    const unsigned int input3_right_padding = ((input1_width + input2_width + input3_width) / num_elems_processed_per_iteration) * num_elems_processed_per_iteration - input1_width - input2_width +
+                                              num_elems_processed_per_iteration - input3_width;
+    const unsigned int input4_left_padding  = (input1_width + input2_width + input3_width) % num_elems_processed_per_iteration;
+    unsigned int       idx0                 = 5 * num_arguments_per_4D_tensor();
+    _kernel.setArg<cl_uint>(idx0++, input1_right_padding);
+    _kernel.setArg<cl_uint>(idx0++, input2_left_padding);
+    _kernel.setArg<cl_uint>(idx0++, input2_right_padding);
+    _kernel.setArg<cl_uint>(idx0++, input3_left_padding);
+    _kernel.setArg<cl_uint>(idx0++, input3_right_padding);
+    _kernel.setArg<cl_uint>(idx0++, input4_left_padding);
+
     // Set config_id for enabling LWS tuning
     _config_id = "concatenate_width_x4_";
     _config_id += lower_string(string_from_data_type(input1->info()->data_type()));
diff --git a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp
index c51c579..6c32cd2 100644
--- a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp
+++ b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -107,9 +107,16 @@
     build_opts.add_option("-DWIDTH_OFFSET=" + support::cpp11::to_string(_width_offset));
     build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
 
+    if(is_data_type_quantized_asymmetric(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info())
+    {
+        build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(input->info()->quantization_info().offset));
+        build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().offset));
+        build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(input->info()->quantization_info().scale));
+        build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().scale));
+    }
+
     // Create kernel
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_width", build_opts.options()));
-
     // Configure kernel window
     auto win_config = validate_and_configure_window(input->info(), width_offset, output->info());
     ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
diff --git a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
index 7f1afe0..84b5ea2 100644
--- a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
@@ -46,8 +46,18 @@
 
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
 {
+    if(act_info.enabled())
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, DataType::F16, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->data_type() == DataType::QASYMM8) && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+                                        && (act_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+                                        && (act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU)
+                                        && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC),
+                                        "For QASYMM8 only logistic, relu, lower bounded relu and lower-upper bounded relu are supported");
+    }
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
 
@@ -133,14 +143,14 @@
 {
 }
 
-void CLWinogradOutputTransformKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const WinogradInfo &winograd_info)
+void CLWinogradOutputTransformKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     // Output tensor auto initialization if not yet initialized
     auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input->info(), winograd_info)));
 
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), winograd_info));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), winograd_info, act_info));
 
     _input  = input;
     _bias   = bias;
@@ -161,6 +171,21 @@
 
     // Set build options
     CLBuildOptions build_opts;
+    build_opts.add_option_if(act_info.enabled(), "-DFUSED_ACTIVATION=" + lower_string(string_from_activation_func(act_info.activation())));
+    build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
+    build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
+
+    if((output_tile_size.x() == 2) || (output_tile_size.x() == 1 && output_tile_size.y() == 2))
+    {
+        build_opts.add_option("-DVEC_SIZE=2");
+    }
+    else if((output_tile_size.x() == 4) || (output_tile_size.x() == 1 && output_tile_size.y() == 4))
+    {
+        build_opts.add_option("-DVEC_SIZE=4");
+    }
+
+    build_opts.add_option_if(act_info.enabled(), "-DSELECT_DATA_TYPE=" + get_cl_select_type_from_data_type(input->info()->data_type()));
+
     build_opts.add_option_if(_bias != nullptr, std::string("-DHAS_BIAS"));
     build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(num_tiles.width));
     build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width));
@@ -195,9 +220,9 @@
     _config_id += lower_string(string_from_data_layout(winograd_info.output_data_layout));
 }
 
-Status CLWinogradOutputTransformKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info)
+Status CLWinogradOutputTransformKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, (bias != nullptr ? bias->clone().get() : nullptr), output, winograd_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, (bias != nullptr ? bias->clone().get() : nullptr), output, winograd_info, act_info));
     ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (bias != nullptr ? bias->clone().get() : nullptr), output->clone().get(), winograd_info.output_tile_size).first);
 
     return Status{};
diff --git a/src/core/CL/kernels/CLYOLOLayerKernel.cpp b/src/core/CL/kernels/CLYOLOLayerKernel.cpp
index 7d9dbd4..ee9bdec 100644
--- a/src/core/CL/kernels/CLYOLOLayerKernel.cpp
+++ b/src/core/CL/kernels/CLYOLOLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,10 +43,10 @@
 {
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info, int32_t num_classes)
 {
-    ARM_COMPUTE_UNUSED(act_info);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
+    ARM_COMPUTE_RETURN_ERROR_ON(act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC);
 
     const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
     ARM_COMPUTE_RETURN_ERROR_ON(num_classes <= 0);
diff --git a/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp b/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp
index 06a0551..02150ff 100644
--- a/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp
+++ b/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CPP/kernels/CPPPermuteKernel.cpp b/src/core/CPP/kernels/CPPPermuteKernel.cpp
index 17eaec2..d9fe5b0 100644
--- a/src/core/CPP/kernels/CPPPermuteKernel.cpp
+++ b/src/core/CPP/kernels/CPPPermuteKernel.cpp
@@ -58,17 +58,6 @@
     return Status{};
 }
 
-template <typename T>
-inline void permute_strides(Dimensions<T> &dimensions, const PermutationVector &perm)
-{
-    const auto old_dim = utility::make_array<Dimensions<T>::num_max_dimensions>(dimensions.begin(), dimensions.end());
-    for(unsigned int i = 0; i < perm.num_dimensions(); ++i)
-    {
-        T dimension_val = old_dim[i];
-        dimensions.set(perm[i], dimension_val);
-    }
-}
-
 } // namespace
 
 template <typename T>
diff --git a/src/core/CPP/kernels/CPPTopKVKernel.cpp b/src/core/CPP/kernels/CPPTopKVKernel.cpp
new file mode 100644
index 0000000..533543a
--- /dev/null
+++ b/src/core/CPP/kernels/CPPTopKVKernel.cpp
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CPP/kernels/CPPTopKVKernel.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/Traits.h"
+
+namespace arm_compute
+{
+namespace
+{
+template <typename T,
+          typename std::enable_if<utils::traits::is_floating_point<T>::value, int>::type = 0>
+inline bool greater_than(T a, T b)
+{
+    const T epsilon = std::numeric_limits<T>::epsilon();
+    return (a - b > epsilon);
+}
+
+template < typename T,
+           typename std::enable_if < !utils::traits::is_floating_point<T>::value, int >::type = 0 >
+inline bool greater_than(T a, T b)
+{
+    return (a > b);
+}
+
+Status validate_arguments(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(predictions, 1, DataType::QASYMM8, DataType::S32, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(targets, 1, DataType::U32);
+
+    ARM_COMPUTE_RETURN_ERROR_ON(predictions->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(targets->num_dimensions() > 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(targets->dimension(0) != predictions->dimension(1));
+    // Validate configured output
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), targets->tensor_shape());
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+    }
+
+    return Status{};
+}
+} // namespace
+
+template <typename T>
+void CPPTopKVKernel::run_topkv()
+{
+    for(unsigned int i = 0; i < _batch_size; ++i)
+    {
+        const auto target_class_id = *reinterpret_cast<uint32_t *>(_targets->ptr_to_element(Coordinates{ i }));
+        const auto predicted_value = *reinterpret_cast<T *>(_predictions->ptr_to_element(Coordinates{ target_class_id, i }));
+
+        // The variable rank counts how many predictions score higher than the one at target_class_id
+        unsigned int rank = 0;
+        for(unsigned int j = 0; (j < _num_classes) && (rank < _k); ++j)
+        {
+            const auto current_prediction = *reinterpret_cast<T *>(_predictions->ptr_to_element(Coordinates{ j, i }));
+            if(greater_than(current_prediction, predicted_value))
+            {
+                rank++;
+            }
+        }
+        *(_output->ptr_to_element(Coordinates{ i })) = static_cast<uint8_t>(rank < _k);
+    }
+}
+
+CPPTopKVKernel::CPPTopKVKernel()
+    : _predictions(nullptr), _targets(nullptr), _output(nullptr), _k(), _batch_size(), _num_classes()
+{
+}
+
+void CPPTopKVKernel::configure(const ITensor *predictions, const ITensor *targets, ITensor *output, const unsigned int k)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(predictions, targets, output);
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(predictions->info(), targets->info(), output->info(), k));
+    auto_init_if_empty(*output->info(), targets->info()->tensor_shape(), 1, DataType::U8);
+
+    _predictions = predictions;
+    _targets     = targets;
+    _output      = output;
+
+    _k           = k;
+    _batch_size  = predictions->info()->dimension(1);
+    _num_classes = predictions->info()->dimension(0);
+
+    ICPPKernel::configure(Window()); // Default 1 iteration window
+}
+
+Status CPPTopKVKernel::validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(predictions, targets, output, k));
+    return Status{};
+}
+
+bool CPPTopKVKernel::is_parallelisable() const
+{
+    return false;
+}
+
+void CPPTopKVKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(window, info);
+    switch(_predictions->info()->data_type())
+    {
+        case DataType::F32:
+            run_topkv<float>();
+            break;
+        case DataType::F16:
+            run_topkv<half>();
+            break;
+        case DataType::S32:
+            run_topkv<int>();
+            break;
+        case DataType::QASYMM8:
+            run_topkv<uint8_t>();
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Not supported");
+    }
+}
+} // namespace arm_compute
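Per sample, the kernel writes 1 when the target class is among the k highest predictions, i.e. when fewer than k classes score higher than the target (ignoring the epsilon tolerance applied to floating-point comparisons). A minimal standalone restatement of that per-row test, not the kernel itself:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Plain-C++ restatement of the check performed by CPPTopKVKernel::run_topkv()
    // for a single row of predictions.
    uint8_t in_top_k(const std::vector<float> &predictions, uint32_t target_class, unsigned int k)
    {
        const float  target_value = predictions[target_class];
        unsigned int rank         = 0;
        for(std::size_t j = 0; j < predictions.size() && rank < k; ++j)
        {
            if(predictions[j] > target_value)
            {
                ++rank;
            }
        }
        return static_cast<uint8_t>(rank < k);
    }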
diff --git a/src/core/Error.cpp b/src/core/Error.cpp
index 2f6a94b..e7b4365 100644
--- a/src/core/Error.cpp
+++ b/src/core/Error.cpp
@@ -54,9 +54,9 @@
     va_start(args, msg);
     auto err = create_error_va_list(ErrorCode::RUNTIME_ERROR, function, file, line, msg, args);
     va_end(args);
-    throw std::runtime_error(err.error_description());
+    ARM_COMPUTE_THROW(std::runtime_error(err.error_description()));
 }
 void Status::internal_throw_on_error() const
 {
-    throw std::runtime_error(_error_description);
+    ARM_COMPUTE_THROW(std::runtime_error(_error_description));
 }
diff --git a/src/core/ITensor.cpp b/src/core/ITensor.cpp
index 3dffcd0..e6c80e8 100644
--- a/src/core/ITensor.cpp
+++ b/src/core/ITensor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -71,6 +71,7 @@
     src_it, dst_it);
 }
 
+#ifdef ARM_COMPUTE_ASSERTS_ENABLED
 void ITensor::print(std::ostream &s, IOFormatInfo io_fmt) const
 {
     ARM_COMPUTE_ERROR_ON(this->buffer() == nullptr);
@@ -151,6 +152,7 @@
         }
     }
 }
+#endif /* ARM_COMPUTE_ASSERTS_ENABLED */
 
 bool ITensor::is_used() const
 {
diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
index 5ce79f1..b67396c 100644
--- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,6 +29,7 @@
 #include "arm_compute/core/NEON/NEAsymm.h"
 #include "arm_compute/core/NEON/NEFixedPoint.h"
 #include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/QAsymm8.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
@@ -60,29 +61,21 @@
 
 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
 {
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-    Window                 win                               = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-    bool                   window_changed                    = false;
+    // Configure kernel window
+    Window win = calculate_max_window(*input, Steps());
 
-    if(output != nullptr && (output->total_size() != 0))
+    if(output != nullptr)
     {
-        AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+        // Output auto initialization if not yet initialized
+        auto_init_if_empty(*output, *input->clone());
 
-        window_changed = update_window_and_padding(win,
-                                                   AccessWindowHorizontal(input, 0, num_elems_processed_per_iteration),
-                                                   output_access);
-
-        output_access.set_valid_region(win, input->valid_region());
-    }
-    else
-    {
-        // In-place computation
-        window_changed = update_window_and_padding(win,
-                                                   AccessWindowHorizontal(input, 0, num_elems_processed_per_iteration));
+        // NEActivationLayerKernel doesn't need padding so update_window_and_padding() can be skipped
+        Coordinates coord;
+        coord.set_num_dimensions(output->num_dimensions());
+        output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
     }
 
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
+    return std::make_pair(Status{}, win);
 }
 } // namespace
 
@@ -101,16 +94,15 @@
 
     if(output != nullptr)
     {
-        // Output auto inizialitation if not yet initialized
-        auto_init_if_empty(*output->info(), *input->info()->clone());
         _output = output;
     }
 
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr));
 
-    ARM_COMPUTE_ERROR_ON_MSG((input->info()->data_type() == DataType::QASYMM8) && (activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
-                             && (activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU),
-                             "For QASYMM8 only relu and lower/upper bounded relu are supported");
+    ARM_COMPUTE_ERROR_ON_MSG((input->info()->data_type() == DataType::QASYMM8) && (activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU)
+                             && (activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) && (activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+                             && (activation_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC),
+                             "For QASYMM8 only logistic, relu and lower/upper bounded relu are supported");
 
     // Activation functions : FP32
     static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_f32 =
@@ -149,6 +141,8 @@
     // Activation functions : QASYMM8
     static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_qasymm8 =
     {
+        { ActivationFunction::LOGISTIC, &NEActivationLayerKernel::activation<ActivationFunction::LOGISTIC, qasymm8_t> },
+        { ActivationFunction::BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::BOUNDED_RELU, qasymm8_t> },
         { ActivationFunction::LU_BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::LU_BOUNDED_RELU, qasymm8_t> },
         { ActivationFunction::RELU, &NEActivationLayerKernel::activation<ActivationFunction::RELU, qasymm8_t> },
     };
@@ -176,337 +170,129 @@
     ICPPKernel::configure(win_config.second);
 }
 
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 template <ActivationLayerInfo::ActivationFunction F, typename T>
-typename std::enable_if<std::is_same<T, float16_t>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
+typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type
+NEActivationLayerKernel::activation(const Window &window)
 {
-    Iterator input(_input, window);
-    Iterator output(_output, window);
+    /** NEON vector tag type. */
+    using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
 
-    static const float16x8_t CONST_0   = vdupq_n_f16(0.f);
-    static const float16x8_t CONST_1_H = vdupq_n_f16(1.f);
+    const int                window_step_x  = 16 / sizeof(T);
+    const auto               window_start_x = static_cast<int>(window.x().start());
+    const auto               window_end_x   = static_cast<int>(window.x().end());
+    const ActivationFunction act            = F;
 
-    static const float32x4_t CONST_1_F32 = vdupq_n_f32(1.f);
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
 
-    const float16x8_t a   = vdupq_n_f16(_act_info.a());
-    const float16x4_t a_h = vdup_n_f16(_act_info.a());
-    const float16x8_t b   = vdupq_n_f16(_act_info.b());
+    Iterator input(_input, win_collapsed);
+    Iterator output(_output, win_collapsed);
 
-    execute_window_loop(window, [&](const Coordinates &)
+    const auto const_1 = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{});
+    const auto const_0 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+    const auto va      = wrapper::vdup_n(static_cast<T>(_act_info.a()), ExactTagType{});
+    const auto vb      = wrapper::vdup_n(static_cast<T>(_act_info.b()), ExactTagType{});
+    const auto a       = static_cast<T>(_act_info.a());
+    const auto b       = static_cast<T>(_act_info.b());
+
+    execute_window_loop(win_collapsed, [&](const Coordinates & id)
     {
-        const auto input_ptr  = reinterpret_cast<const float16_t *>(input.ptr());
-        const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
+        const auto input_ptr  = reinterpret_cast<const T *>(input.ptr());
+        const auto output_ptr = reinterpret_cast<T *>(output.ptr());
 
-        const float16x8x2_t in  = vld2q_f16(input_ptr);
-        float16x8x2_t       tmp = { {} };
+        wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128> tmp;
 
-        switch(F)
+        // Compute S elements per iteration
+        int x = window_start_x;
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
         {
-            case ActivationFunction::ABS:
-                tmp =
-                {
-                    {
-                        vabsq_f16(in.val[0]),
-                        vabsq_f16(in.val[1]),
-                    }
-                };
-                break;
-            case ActivationFunction::BOUNDED_RELU:
-                tmp =
-                {
-                    {
-                        vminq_f16(a, vmaxq_f16(CONST_0, in.val[0])),
-                        vminq_f16(a, vmaxq_f16(CONST_0, in.val[1]))
-                    }
-                };
-                break;
-            case ActivationFunction::LU_BOUNDED_RELU:
-                tmp =
-                {
-                    {
-                        vminq_f16(a, vmaxq_f16(b, in.val[0])),
-                        vminq_f16(a, vmaxq_f16(b, in.val[1]))
-                    }
-                };
-                break;
-            case ActivationFunction::LINEAR:
-                tmp =
-                {
-                    {
-                        vaddq_f16(b, vmulq_f16(a, in.val[0])),
-                        vaddq_f16(b, vmulq_f16(a, in.val[1]))
-                    }
-                };
-                break;
-            case ActivationFunction::LOGISTIC:
+            const auto vin = wrapper::vloadq(input_ptr + x);
+            switch(act)
             {
-                tmp =
-                {
-                    {
-                        vinvq_f16(vaddq_f16(CONST_1_H, vexpq_f16(vnegq_f16(in.val[0])))),
-                        vinvq_f16(vaddq_f16(CONST_1_H, vexpq_f16(vnegq_f16(in.val[1]))))
-                    }
-                };
+                case ActivationFunction::ABS:
+                    tmp = wrapper::vabs(vin);
+                    break;
+                case ActivationFunction::LINEAR:
+                    tmp = wrapper::vmla(vb, va, vin);
+                    break;
+                case ActivationFunction::LOGISTIC:
+                    tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin))));
+                    break;
+                case ActivationFunction::RELU:
+                    tmp = wrapper::vmax(const_0, vin);
+                    break;
+                case ActivationFunction::BOUNDED_RELU:
+                    tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin));
+                    break;
+                case ActivationFunction::LU_BOUNDED_RELU:
+                    tmp = wrapper::vmin(va, wrapper::vmax(vb, vin));
+                    break;
+                case ActivationFunction::LEAKY_RELU:
+                    tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin));
+                    break;
+                case ActivationFunction::SOFT_RELU:
+                    tmp = wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin)));
+                    break;
+                case ActivationFunction::SQRT:
+                    tmp = wrapper::vinv(wrapper::vinvsqrt(vin));
+                    break;
+                case ActivationFunction::SQUARE:
+                    tmp = wrapper::vmul(vin, vin);
+                    break;
+                case ActivationFunction::TANH:
+                    tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin)));
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Unsupported activation function");
             }
-            break;
-            case ActivationFunction::RELU:
-                tmp =
-                {
-                    {
-                        vmaxq_f16(CONST_0, in.val[0]),
-                        vmaxq_f16(CONST_0, in.val[1])
-                    }
-                };
-                break;
-            case ActivationFunction::LEAKY_RELU:
-                tmp =
-                {
-                    {
-                        vbslq_f16(vcgtq_f16(in.val[0], CONST_0), in.val[0], vmulq_f16(a, in.val[0])),
-                        vbslq_f16(vcgtq_f16(in.val[1], CONST_0), in.val[1], vmulq_f16(a, in.val[1]))
-                    }
-                };
-                break;
-            case ActivationFunction::SOFT_RELU:
-            {
-                // TODO (COMPMID-1535) : Revisit FP16 approximations
-                const float16x4x2_t in0 =
-                {
-                    vcvt_f16_f32(vlogq_f32(vaddq_f32(CONST_1_F32, vexpq_f32(vcvt_f32_f16(vget_low_f16(in.val[0])))))),
-                    vcvt_f16_f32(vlogq_f32(vaddq_f32(CONST_1_F32, vexpq_f32(vcvt_f32_f16(vget_high_f16(in.val[0])))))),
-                };
-
-                const float16x4x2_t in1 =
-                {
-                    vcvt_f16_f32(vlogq_f32(vaddq_f32(CONST_1_F32, vexpq_f32(vcvt_f32_f16(vget_low_f16(in.val[1])))))),
-                    vcvt_f16_f32(vlogq_f32(vaddq_f32(CONST_1_F32, vexpq_f32(vcvt_f32_f16(vget_high_f16(in.val[1])))))),
-                };
-
-                tmp =
-                {
-                    {
-                        vcombine_f16(in0.val[0], in0.val[1]),
-                        vcombine_f16(in1.val[0], in1.val[1]),
-                    }
-                };
-            }
-            break;
-            case ActivationFunction::SQRT:
-                tmp =
-                {
-                    {
-                        vinvq_f16(vinvsqrtq_f16(in.val[0])),
-                        vinvq_f16(vinvsqrtq_f16(in.val[1])),
-                    }
-                };
-                break;
-            case ActivationFunction::SQUARE:
-                tmp =
-                {
-                    {
-                        vmulq_f16(in.val[0], in.val[0]),
-                        vmulq_f16(in.val[1], in.val[1])
-                    }
-                };
-                break;
-            case ActivationFunction::TANH:
-            {
-                // TODO (COMPMID-1535) : Revisit FP16 approximations
-                const float16x8x2_t mul =
-                {
-                    vmulq_f16(b, in.val[0]),
-                    vmulq_f16(b, in.val[1])
-                };
-                const float16x4x2_t in0 =
-                {
-                    vmul_f16(a_h, vcvt_f16_f32(vtanhq_f32(vcvt_f32_f16(vget_low_f16(mul.val[0]))))),
-                    vmul_f16(a_h, vcvt_f16_f32(vtanhq_f32(vcvt_f32_f16(vget_high_f16(mul.val[0]))))),
-                };
-
-                const float16x4x2_t in1 =
-                {
-                    vmul_f16(a_h, vcvt_f16_f32(vtanhq_f32(vcvt_f32_f16(vget_low_f16(mul.val[1]))))),
-                    vmul_f16(a_h, vcvt_f16_f32(vtanhq_f32(vcvt_f32_f16(vget_high_f16(mul.val[1]))))),
-                };
-
-                tmp =
-                {
-                    {
-                        vcombine_f16(in0.val[0], in0.val[1]),
-                        vcombine_f16(in1.val[0], in1.val[1]),
-                    }
-                };
-            }
-            break;
-            default:
-                ARM_COMPUTE_ERROR("Not implemented");
-                break;
+            wrapper::vstore(output_ptr + x, tmp);
         }
 
-        vst2q_f16(output_ptr, tmp);
-    },
-    input, output);
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-template <ActivationLayerInfo::ActivationFunction F, typename T>
-typename std::enable_if<std::is_same<T, float>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
-{
-    Iterator input(_input, window);
-    Iterator output(_output, window);
-
-    static const float32x4_t CONST_1 = vdupq_n_f32(1.f);
-    static const float32x4_t CONST_0 = vdupq_n_f32(0.f);
-    const float32x4_t        a       = vdupq_n_f32(_act_info.a());
-    const float32x4_t        b       = vdupq_n_f32(_act_info.b());
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const auto input_ptr  = reinterpret_cast<const float *>(input.ptr());
-        const auto output_ptr = reinterpret_cast<float *>(output.ptr());
-
-        const float32x4x4_t in =
+        // Compute left-over elements
+        for(; x < window_end_x; ++x)
         {
+            const T in = *(reinterpret_cast<const T *>(input_ptr + x));
+            T       tmp;
+            switch(act)
             {
-                vld1q_f32(input_ptr),
-                vld1q_f32(input_ptr + 4),
-                vld1q_f32(input_ptr + 8),
-                vld1q_f32(input_ptr + 12)
+                case ActivationFunction::ABS:
+                    tmp = std::abs(in);
+                    break;
+                case ActivationFunction::LINEAR:
+                    tmp = a * in + b;
+                    break;
+                case ActivationFunction::LOGISTIC:
+                    tmp = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-in));
+                    break;
+                case ActivationFunction::RELU:
+                    tmp = std::max<T>(static_cast<T>(0), in);
+                    break;
+                case ActivationFunction::BOUNDED_RELU:
+                    tmp = std::min<T>(a, std::max(static_cast<T>(0), in));
+                    break;
+                case ActivationFunction::LU_BOUNDED_RELU:
+                    tmp = std::min<T>(a, std::max<T>(b, in));
+                    break;
+                case ActivationFunction::LEAKY_RELU:
+                    tmp = (in > 0) ? in : a * in;
+                    break;
+                case ActivationFunction::SOFT_RELU:
+                    tmp = std::log(static_cast<T>(1) + std::exp(in));
+                    break;
+                case ActivationFunction::SQRT:
+                    tmp = std::sqrt(in);
+                    break;
+                case ActivationFunction::SQUARE:
+                    tmp = in * in;
+                    break;
+                case ActivationFunction::TANH:
+                    tmp = a * std::tanh(b * in);
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Unsupported activation function");
             }
-        };
-        float32x4x4_t tmp = { {} };
-
-        switch(F)
-        {
-            case ActivationFunction::ABS:
-                tmp =
-                {
-                    {
-                        vabsq_f32(in.val[0]),
-                        vabsq_f32(in.val[1]),
-                        vabsq_f32(in.val[2]),
-                        vabsq_f32(in.val[3]),
-                    }
-                };
-                break;
-            case ActivationFunction::LINEAR:
-                tmp =
-                {
-                    {
-                        vmlaq_f32(b, a, in.val[0]),
-                        vmlaq_f32(b, a, in.val[1]),
-                        vmlaq_f32(b, a, in.val[2]),
-                        vmlaq_f32(b, a, in.val[3]),
-                    }
-                };
-                break;
-            case ActivationFunction::LOGISTIC:
-                tmp =
-                {
-                    {
-                        vinvq_f32(vaddq_f32(CONST_1, vexpq_f32(vnegq_f32(in.val[0])))),
-                        vinvq_f32(vaddq_f32(CONST_1, vexpq_f32(vnegq_f32(in.val[1])))),
-                        vinvq_f32(vaddq_f32(CONST_1, vexpq_f32(vnegq_f32(in.val[2])))),
-                        vinvq_f32(vaddq_f32(CONST_1, vexpq_f32(vnegq_f32(in.val[3])))),
-                    }
-                };
-                break;
-            case ActivationFunction::RELU:
-                tmp =
-                {
-                    {
-                        vmaxq_f32(CONST_0, in.val[0]),
-                        vmaxq_f32(CONST_0, in.val[1]),
-                        vmaxq_f32(CONST_0, in.val[2]),
-                        vmaxq_f32(CONST_0, in.val[3]),
-                    }
-                };
-                break;
-            case ActivationFunction::BOUNDED_RELU:
-                tmp =
-                {
-                    {
-                        vminq_f32(a, vmaxq_f32(CONST_0, in.val[0])),
-                        vminq_f32(a, vmaxq_f32(CONST_0, in.val[1])),
-                        vminq_f32(a, vmaxq_f32(CONST_0, in.val[2])),
-                        vminq_f32(a, vmaxq_f32(CONST_0, in.val[3])),
-                    }
-                };
-                break;
-            case ActivationFunction::LU_BOUNDED_RELU:
-                tmp =
-                {
-                    {
-                        vminq_f32(a, vmaxq_f32(b, in.val[0])),
-                        vminq_f32(a, vmaxq_f32(b, in.val[1])),
-                        vminq_f32(a, vmaxq_f32(b, in.val[2])),
-                        vminq_f32(a, vmaxq_f32(b, in.val[3])),
-                    }
-                };
-                break;
-            case ActivationFunction::LEAKY_RELU:
-                tmp =
-                {
-                    {
-                        vbslq_f32(vcgtq_f32(in.val[0], CONST_0), in.val[0], vmulq_f32(a, in.val[0])),
-                        vbslq_f32(vcgtq_f32(in.val[1], CONST_0), in.val[1], vmulq_f32(a, in.val[1])),
-                        vbslq_f32(vcgtq_f32(in.val[2], CONST_0), in.val[2], vmulq_f32(a, in.val[2])),
-                        vbslq_f32(vcgtq_f32(in.val[3], CONST_0), in.val[3], vmulq_f32(a, in.val[3])),
-                    }
-                };
-                break;
-            case ActivationFunction::SOFT_RELU:
-                tmp =
-                {
-                    {
-                        vlogq_f32(vaddq_f32(CONST_1, vexpq_f32(in.val[0]))),
-                        vlogq_f32(vaddq_f32(CONST_1, vexpq_f32(in.val[1]))),
-                        vlogq_f32(vaddq_f32(CONST_1, vexpq_f32(in.val[2]))),
-                        vlogq_f32(vaddq_f32(CONST_1, vexpq_f32(in.val[3]))),
-                    }
-                };
-                break;
-            case ActivationFunction::SQRT:
-                tmp =
-                {
-                    {
-                        vinvq_f32(vinvsqrtq_f32(in.val[0])),
-                        vinvq_f32(vinvsqrtq_f32(in.val[1])),
-                        vinvq_f32(vinvsqrtq_f32(in.val[2])),
-                        vinvq_f32(vinvsqrtq_f32(in.val[3])),
-                    }
-                };
-                break;
-            case ActivationFunction::SQUARE:
-                tmp =
-                {
-                    {
-                        vmulq_f32(in.val[0], in.val[0]),
-                        vmulq_f32(in.val[1], in.val[1]),
-                        vmulq_f32(in.val[2], in.val[2]),
-                        vmulq_f32(in.val[3], in.val[3]),
-                    }
-                };
-                break;
-            case ActivationFunction::TANH:
-                tmp =
-                {
-                    {
-                        vmulq_f32(a, vtanhq_f32(vmulq_f32(b, in.val[0]))),
-                        vmulq_f32(a, vtanhq_f32(vmulq_f32(b, in.val[1]))),
-                        vmulq_f32(a, vtanhq_f32(vmulq_f32(b, in.val[2]))),
-                        vmulq_f32(a, vtanhq_f32(vmulq_f32(b, in.val[3]))),
-                    }
-                };
-                break;
-            default:
-                break;
+            *(output_ptr + x) = tmp;
         }
-
-        vst1q_f32(output_ptr, tmp.val[0]);
-        vst1q_f32(output_ptr + 4, tmp.val[1]);
-        vst1q_f32(output_ptr + 8, tmp.val[2]);
-        vst1q_f32(output_ptr + 12, tmp.val[3]);
     },
     input, output);
 }
@@ -514,13 +300,25 @@
 template <ActivationLayerInfo::ActivationFunction F, typename T>
 typename std::enable_if<std::is_same<T, qasymm8_t>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
 {
-    Iterator               input(_input, window);
-    Iterator               output(_output, window);
-    const QuantizationInfo qi_in   = _input->info()->quantization_info();
-    const QuantizationInfo qi_out  = _output->info()->quantization_info();
-    const qasymm8x16_t     a       = vdupq_n_u8(sqcvt_qasymm8_f32(_act_info.a(), qi_in.scale, qi_in.offset));
-    const qasymm8x16_t     b       = vdupq_n_u8(sqcvt_qasymm8_f32(_act_info.b(), qi_in.scale, qi_in.offset));
-    const qasymm8x16_t     CONST_0 = vdupq_n_u8(sqcvt_qasymm8_f32(0.f, qi_in.scale, qi_in.offset));
+    const int                window_step_x  = 16 / sizeof(T);
+    const auto               window_start_x = static_cast<int>(window.x().start());
+    const auto               window_end_x   = static_cast<int>(window.x().end());
+    const ActivationFunction act            = F;
+
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator input(_input, win_collapsed);
+    Iterator output(_output, win_collapsed);
+
+    const QuantizationInfo qi_in    = _input->info()->quantization_info();
+    const QuantizationInfo qi_out   = _output->info()->quantization_info();
+    const qasymm8x16_t     va       = vdupq_n_u8(sqcvt_qasymm8_f32(_act_info.a(), qi_in.scale, qi_in.offset));
+    const qasymm8x16_t     vb       = vdupq_n_u8(sqcvt_qasymm8_f32(_act_info.b(), qi_in.scale, qi_in.offset));
+    const qasymm8_t        a        = sqcvt_qasymm8_f32(_act_info.a(), qi_in.scale, qi_in.offset);
+    const qasymm8_t        b        = sqcvt_qasymm8_f32(_act_info.b(), qi_in.scale, qi_in.offset);
+    const qasymm8_t        const_0  = sqcvt_qasymm8_f32(0.f, qi_in.scale, qi_in.offset);
+    const qasymm8x16_t     vconst_0 = vdupq_n_u8(const_0);
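+    // a, b and 0 are pre-quantized with the input scale/offset so that the RELU-family
+    // activations can be applied directly to the uint8 values; vmlaq_qasymm8 then
+    // re-maps the result into the output quantization space.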
 
     // Initialise scale/offset for re-quantization
     float       s  = qi_in.scale / qi_out.scale;
@@ -528,34 +326,116 @@
     float32x4_t vs = vdupq_n_f32(s);
     float32x4_t vo = vdupq_n_f32(o);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(win_collapsed, [&](const Coordinates & id)
     {
-        const auto input_ptr  = reinterpret_cast<const qasymm8_t *>(input.ptr());
-        const auto output_ptr = reinterpret_cast<qasymm8_t *>(output.ptr());
+        const auto input_ptr  = reinterpret_cast<const T *>(input.ptr());
+        const auto output_ptr = reinterpret_cast<T *>(output.ptr());
 
-        const qasymm8x16_t in  = vld1q_u8(input_ptr);
-        qasymm8x16_t       tmp = {};
+        wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128> tmp;
 
-        switch(F)
+        // Compute S elements per iteration
+        int x = window_start_x;
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
         {
-            case ActivationFunction::LU_BOUNDED_RELU:
+            const auto vin = wrapper::vloadq(input_ptr + x);
+            if(act == ActivationFunction::RELU)
+            {
                 // Perform activation
-                tmp = vminq_u8(a, vmaxq_u8(b, in));
+                tmp = vmaxq_u8(vconst_0, vin);
                 // Re-quantize to new output space
                 tmp = vmlaq_qasymm8(tmp, vs, vo);
-                break;
-            case ActivationFunction::RELU:
+            }
+            else if(act == ActivationFunction::BOUNDED_RELU)
+            {
                 // Perform activation
-                tmp = vmaxq_u8(CONST_0, in);
+                tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin));
                 // Re-quantize to new output space
                 tmp = vmlaq_qasymm8(tmp, vs, vo);
-                break;
-            default:
-                ARM_COMPUTE_ERROR("Function not implemented");
-                break;
+            }
+            else if(act == ActivationFunction::LU_BOUNDED_RELU)
+            {
+                // Perform activation
+                tmp = vminq_u8(va, vmaxq_u8(vb, vin));
+                // Re-quantize to new output space
+                tmp = vmlaq_qasymm8(tmp, vs, vo);
+            }
+            else if(act == ActivationFunction::LOGISTIC)
+            {
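+                // LOGISTIC cannot be evaluated in the 8-bit domain: widen the 16 input bytes
+                // to four float32x4 vectors, dequantize, apply 1 / (1 + exp(-x)), then
+                // requantize and narrow back to uint8 with saturation.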
+                const auto scale_in  = vdupq_n_f32(qi_in.scale);
+                const auto off_in    = vdupq_n_f32(qi_in.offset);
+                const auto scale_out = vdupq_n_f32(qi_out.scale);
+                const auto off_out   = vdupq_n_f32(qi_out.offset);
+                const auto vconst_1  = vdupq_n_f32(1.f);
+
+                const auto vin_low        = wrapper::vgetlow(vin);
+                const auto vin_high       = wrapper::vgethigh(vin);
+                uint16x8_t vin_low_u16x8  = wrapper::vmovl(vin_low);
+                uint16x8_t vin_high_u16x8 = wrapper::vmovl(vin_high);
+                // Convert uint16 vectors to uint32 vectors
+                uint32x4_t A_u32x4 = wrapper::vmovl(wrapper::vgetlow(vin_low_u16x8));
+                uint32x4_t B_u32x4 = wrapper::vmovl(wrapper::vgethigh(vin_low_u16x8));
+                uint32x4_t C_u32x4 = wrapper::vmovl(wrapper::vgetlow(vin_high_u16x8));
+                uint32x4_t D_u32x4 = wrapper::vmovl(wrapper::vgethigh(vin_high_u16x8));
+                // Convert uint32 vectors to float32 vectors
+                float32x4_t A_f32x4 = wrapper::vmul(wrapper::vsub(vcvtq_f32_u32(A_u32x4), off_in), scale_in);
+                float32x4_t B_f32x4 = wrapper::vmul(wrapper::vsub(vcvtq_f32_u32(B_u32x4), off_in), scale_in);
+                float32x4_t C_f32x4 = wrapper::vmul(wrapper::vsub(vcvtq_f32_u32(C_u32x4), off_in), scale_in);
+                float32x4_t D_f32x4 = wrapper::vmul(wrapper::vsub(vcvtq_f32_u32(D_u32x4), off_in), scale_in);
+                // Perform activation
+                A_f32x4 = wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(A_f32x4))));
+                B_f32x4 = wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(B_f32x4))));
+                C_f32x4 = wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(C_f32x4))));
+                D_f32x4 = wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(D_f32x4))));
+                // Convert float32 vectors to uint32 vectors
+                A_u32x4 = vcvtq_u32_f32(wrapper::vadd(wrapper::vdiv(A_f32x4, scale_out), off_out));
+                B_u32x4 = vcvtq_u32_f32(wrapper::vadd(wrapper::vdiv(B_f32x4, scale_out), off_out));
+                C_u32x4 = vcvtq_u32_f32(wrapper::vadd(wrapper::vdiv(C_f32x4, scale_out), off_out));
+                D_u32x4 = vcvtq_u32_f32(wrapper::vadd(wrapper::vdiv(D_f32x4, scale_out), off_out));
+                // Convert uint32 vectors to uint16 vectors (with saturation)
+                vin_low_u16x8  = wrapper::vcombine(wrapper::vqmovn(A_u32x4), wrapper::vqmovn(B_u32x4));
+                vin_high_u16x8 = wrapper::vcombine(wrapper::vqmovn(C_u32x4), wrapper::vqmovn(D_u32x4));
+                // Convert uint16 vectors to uint8 vectors (with saturation)
+                tmp = wrapper::vcombine(wrapper::vqmovn(vin_low_u16x8), wrapper::vqmovn(vin_high_u16x8));
+            }
+            else
+            {
+                ARM_COMPUTE_ERROR("Unsupported activation function");
+            }
+            wrapper::vstore(output_ptr + x, tmp);
         }
 
-        vst1q_u8(output_ptr, tmp);
+        // Compute left-over elements
+        for(; x < window_end_x; ++x)
+        {
+            T in = *(reinterpret_cast<const T *>(input_ptr + x));
+            T tmp;
+            if(act == ActivationFunction::RELU)
+            {
+                tmp = std::max(const_0, in);
+                tmp = std::max<int32_t>(0, std::min<int32_t>(tmp * s + o, 255));
+            }
+            else if(act == ActivationFunction::BOUNDED_RELU)
+            {
+                tmp = std::min(a, std::max(const_0, in));
+                tmp = std::max<int32_t>(0, std::min<int32_t>(tmp * s + o, 255));
+            }
+            else if(act == ActivationFunction::LU_BOUNDED_RELU)
+            {
+                tmp = std::min(a, std::max(b, in));
+                tmp = std::max<int32_t>(0, std::min<int32_t>(tmp * s + o, 255));
+            }
+            else if(act == ActivationFunction::LOGISTIC)
+            {
+                float tmp_f = scvt_f32_qasymm8(in, qi_in.scale, qi_in.offset);
+                tmp_f       = 1.f / (1.f + std::exp(-tmp_f));
+                tmp         = sqcvt_qasymm8_f32(tmp_f, qi_out.scale, qi_out.offset);
+            }
+            else
+            {
+                ARM_COMPUTE_ERROR("Unsupported activation function");
+            }
+            *(output_ptr + x) = tmp;
+        }
     },
     input, output);
 }
diff --git a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
index 169554f..ffa578f 100644
--- a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,6 +29,7 @@
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 
@@ -47,282 +48,426 @@
 
 namespace
 {
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-void add_wrap_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+template <typename T, bool is_sat>
+void add_same(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window)
 {
-    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
-    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
-    Iterator output(out, window);
+    ARM_COMPUTE_UNUSED(policy);
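+    // The wrap/saturate behaviour is fixed at compile time through is_sat (vadd vs vqadd),
+    // so the runtime ConvertPolicy is not needed here.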
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    /** NEON vector tag type. */
+    using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+
+    // Create input windows
+    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+
+    // Clear X Dimension on execution window as we handle manually
+    Window win = window;
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    constexpr int window_step_x         = 16 / sizeof(T);
+    const auto    window_start_x        = static_cast<int>(window.x().start());
+    const auto    window_end_x          = static_cast<int>(window.x().end());
+    const bool    is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);
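+    // An input window whose X step is 0 is being broadcast along the row: its single
+    // value is read once per outer iteration and combined with every element of the
+    // other input.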
+
+    if(is_broadcast_across_x)
     {
-        vst1q_u8(output.ptr(), vaddq_u8(vld1q_u8(input1.ptr()), vld1q_u8(input2.ptr())));
-    },
-    input1, input2, output);
-}
+        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
+        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
+        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
+        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
+        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
 
-void add_saturate_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
-    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
-    Iterator output(out, window);
+        // Clear X Dimension on execution window as we handle manually
+        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
 
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        vst1q_u8(output.ptr(), vqaddq_u8(vld1q_u8(input1.ptr()), vld1q_u8(input2.ptr())));
-    },
-    input1, input2, output);
-}
+        Iterator broadcast_input(broadcast_tensor, broadcast_win);
+        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+        Iterator output(out, win);
 
-inline int16x8x2_t vadd2q_s16(const int16x8x2_t &a, const int16x8x2_t &b)
-{
-    const int16x8x2_t res =
-    {
+        execute_window_loop(win, [&](const Coordinates & id)
         {
-            vaddq_s16(a.val[0], b.val[0]),
-            vaddq_s16(a.val[1], b.val[1])
+            const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
+            const auto output_ptr              = reinterpret_cast<T *>(output.ptr());
+
+            const T    broadcast_value     = *reinterpret_cast<const T *>(broadcast_input.ptr());
+            const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
+
+            // Compute S elements per iteration
+            int x = window_start_x;
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
+                const auto res             = is_sat ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v) : wrapper::vadd(broadcast_value_vec, non_broadcast_v);
+                wrapper::vstore(output_ptr + x, res);
+            }
+
+            // Compute left-over elements
+            for(; x < window_end_x; ++x)
+            {
+                const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
+                *(output_ptr + x)          = is_sat ? wrapper::add_sat(broadcast_value, non_broadcast_v) : broadcast_value + non_broadcast_v;
+            }
+        },
+        broadcast_input, non_broadcast_input, output);
+    }
+    else
+    {
+        // Clear X Dimension on execution window as we handle manually
+        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+        Iterator input1(in1, input1_win);
+        Iterator input2(in2, input2_win);
+        Iterator output(out, win);
+
+        execute_window_loop(win, [&](const Coordinates & id)
+        {
+            const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
+            const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
+            const auto output_ptr = reinterpret_cast<T *>(output.ptr());
+
+            // Compute S elements per iteration
+            int x = window_start_x;
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                const auto val1 = wrapper::vloadq(input1_ptr + x);
+                const auto val2 = wrapper::vloadq(input2_ptr + x);
+                const auto res  = is_sat ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2);
+                wrapper::vstore(output_ptr + x, res);
+            }
+
+            // Compute left-over elements
+            for(; x < window_end_x; ++x)
+            {
+                const auto val1   = *(input1_ptr + x);
+                const auto val2   = *(input2_ptr + x);
+                *(output_ptr + x) = is_sat ? wrapper::add_sat(val1, val2) : val1 + val2;
+            }
+        },
+        input1, input2, output);
+    }
+}
+
+void add_QASYMM8_QASYMM8_QASYMM8(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window)
+{
+    ARM_COMPUTE_UNUSED(policy);
+
+    // Create input windows
+    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+
+    // Clear X Dimension on execution window as we handle manually
+    Window win = window;
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    const int  window_step_x         = 16;
+    const auto window_start_x        = static_cast<int>(window.x().start());
+    const auto window_end_x          = static_cast<int>(window.x().end());
+    const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);
+
+    const float output_scale    = out->info()->quantization_info().scale;
+    const int   output_offset   = out->info()->quantization_info().offset;
+
+    const float32x4_t vscale1    = vdupq_n_f32(in1->info()->quantization_info().scale);
+    const float32x4_t vscale2    = vdupq_n_f32(in2->info()->quantization_info().scale);
+    const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_scale);
+    const int32x4_t   voffset1   = vdupq_n_s32(in1->info()->quantization_info().offset);
+    const int32x4_t   voffset2   = vdupq_n_s32(in2->info()->quantization_info().offset);
+    const float32x4_t voffseto   = vdupq_n_f32(output_offset);
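+    // Each input is dequantized as real = scale * (q - offset), the real values are added,
+    // and the sum is requantized as q_out = real_sum / output_scale + output_offset (hence
+    // invvscaleo and voffseto). For example, with output_scale = 0.1 and output_offset = 128,
+    // a real sum of 1.0 maps to 1.0 / 0.1 + 128 = 138.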
+
+    if(is_broadcast_across_x)
+    {
+        const bool             is_broadcast_input_2 = input2_win.x().step() == 0;
+        Window                 broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
+        Window                 non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
+        const ITensor         *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
+        const ITensor         *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
+        const QuantizationInfo broadcast_qinfo      = broadcast_tensor->info()->quantization_info();
+        const QuantizationInfo non_broadcast_qinfo  = non_broadcast_tensor->info()->quantization_info();
+
+        // Clear X Dimension on execution window as we handle manually
+        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+        Iterator broadcast_input(broadcast_tensor, broadcast_win);
+        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+        Iterator output(out, win);
+
+        execute_window_loop(win, [&](const Coordinates & id)
+        {
+            const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
+            const auto output_ptr              = reinterpret_cast<uint8_t *>(output.ptr());
+
+            const uint8_t    broadcast_value     = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
+            const uint8x16_t broadcast_value_vec = vdupq_n_u8(broadcast_value);
+
+            const float32x4x4_t bf =
+            {
+                {
+                    vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2),
+                    vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2),
+                    vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2),
+                    vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2),
+                }
+            };
+            const float bfs = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale;
+
+            // Compute S elements per iteration
+            int x = window_start_x;
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                const uint8x16_t    a = vld1q_u8(non_broadcast_input_ptr + x);
+                const float32x4x4_t af =
+                {
+                    {
+                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1),
+                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1),
+                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1),
+                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1),
+                    }
+                };
+
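+                // vcvtnq_s32_f32 (round to nearest) is only compiled for the AArch64 build;
+                // the fallback vcvtq_s32_f32 truncates toward zero.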
+                const int32x4x4_t rf =
+                {
+                    {
+#ifdef __aarch64__
+                        vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
+                        vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
+                        vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
+                        vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
+#else //__aarch64__
+                        vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
+                        vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
+                        vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
+                        vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
+#endif //__aarch64__
+                    }
+                };
+
+                const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
+                const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
+                vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
+            }
+
+            // Compute left-over elements
+            for(; x < window_end_x; ++x)
+            {
+                const float afs   = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale;
+                *(output_ptr + x) = out->info()->quantization_info().quantize((afs + bfs), RoundingPolicy::TO_NEAREST_UP);
+            }
+        },
+        broadcast_input, non_broadcast_input, output);
+    }
+    else
+    {
+        // Clear X Dimension on execution window as we handle manually
+        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+        const QuantizationInfo input1_qinfo = in1->info()->quantization_info();
+        const QuantizationInfo input2_qinfo = in2->info()->quantization_info();
+
+        Iterator input1(in1, input1_win);
+        Iterator input2(in2, input2_win);
+        Iterator output(out, win);
+
+        execute_window_loop(win, [&](const Coordinates & id)
+        {
+            const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
+            const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+            const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+            // Compute S elements per iteration
+            int x = window_start_x;
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                const uint8x16_t a = vld1q_u8(input1_ptr + x);
+                const uint8x16_t b = vld1q_u8(input2_ptr + x);
+
+                const float32x4x4_t af =
+                {
+                    {
+                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1),
+                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1),
+                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1),
+                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1),
+                    }
+                };
+
+                const float32x4x4_t bf =
+                {
+                    {
+                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2),
+                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2),
+                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2),
+                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2),
+                    }
+                };
+
+                const int32x4x4_t rf =
+                {
+                    {
+#ifdef __aarch64__
+                        vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
+                        vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
+                        vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
+                        vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
+#else //__aarch64__
+                        vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
+                        vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
+                        vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
+                        vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
+#endif //__aarch64__
+                    }
+                };
+
+                const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
+                const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
+                vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
+            }
+
+            // Compute left-over elements
+            for(; x < window_end_x; ++x)
+            {
+                const float afs   = static_cast<int32_t>((*(input1_ptr + x)) - input1_qinfo.offset) * input1_qinfo.scale;
+                const float bfs   = static_cast<int32_t>((*(input2_ptr + x)) - input2_qinfo.offset) * input2_qinfo.scale;
+                *(output_ptr + x) = out->info()->quantization_info().quantize((afs + bfs), RoundingPolicy::TO_NEAREST_UP);
+            }
+        },
+        input1, input2, output);
+    }
+}
+
+void add_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window)
+{
+    // Create input windows
+    Window win        = window;
+    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+
+    // Clear X Dimension on execution window as we handle manually
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator input1(in1, input1_win);
+    Iterator input2(in2, input2_win);
+    Iterator output(out, win);
+
+    const int  window_step_x  = 8;
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
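+    // The U8 input is widened to 16 bits with vmovl and reinterpreted as signed before the
+    // add; the ConvertPolicy selects wrapping (vadd) or saturating (vqadd) arithmetic.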
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
+        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+
+        if(policy == ConvertPolicy::WRAP)
+        {
+            // Compute S elements per iteration
+            int x = window_start_x;
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                const auto vin1 = wrapper::vloadq(input1_ptr + x);
+                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
+                wrapper::vstore(output_ptr + x, wrapper::vadd(vin1, vin2));
+            }
+
+            // Compute left-over elements
+            for(; x < window_end_x; ++x)
+            {
+                *(output_ptr + x) = *(input1_ptr + x) + static_cast<int16_t>(*(input2_ptr + x));
+            }
         }
-    };
-
-    return res;
-}
-
-inline float32x4x4_t vadd4q_f32(const float32x4x4_t &a, const float32x4x4_t &b)
-{
-    const float32x4x4_t res =
-    {
+        else
         {
-            vaddq_f32(a.val[0], b.val[0]),
-            vaddq_f32(a.val[1], b.val[1]),
-            vaddq_f32(a.val[2], b.val[2]),
-            vaddq_f32(a.val[3], b.val[3])
+            // Compute S elements per iteration
+            int x = window_start_x;
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                const auto vin1 = wrapper::vloadq(input1_ptr + x);
+                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
+                wrapper::vstore(output_ptr + x, wrapper::vqadd(vin1, vin2));
+            }
+
+            // Compute left-over elements
+            for(; x < window_end_x; ++x)
+            {
+                *(output_ptr + x) = wrapper::add_sat(*(input1_ptr + x), static_cast<int16_t>(*(input2_ptr + x)));
+            }
         }
-    };
-
-    return res;
+    },
+    input1, input2, output);
 }
 
-inline int16x8x2_t vqadd2q_s16(const int16x8x2_t &a, const int16x8x2_t &b)
+inline void add_U8_S16_S16(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const Window &window)
 {
-    const int16x8x2_t res =
+    // Simply swap the two input buffers:
+    add_S16_U8_S16(input2, input1, output, policy, window);
+}
+
+void add_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window)
+{
+    // Create input windows
+    Window win        = window;
+    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+
+    // Clear X Dimension on execution window as we handle manually
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator input1(in1, input1_win);
+    Iterator input2(in2, input2_win);
+    Iterator output(out, win);
+
+    const int  window_step_x  = 8;
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
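+    // Both U8 inputs are widened to signed 16-bit before the add, so even the WRAP path
+    // cannot overflow: 255 + 255 = 510 fits comfortably in int16.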
+    execute_window_loop(win, [&](const Coordinates & id)
     {
+        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
+        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+
+        if(policy == ConvertPolicy::WRAP)
         {
-            vqaddq_s16(a.val[0], b.val[0]),
-            vqaddq_s16(a.val[1], b.val[1])
+            // Compute S elements per iteration
+            int x = window_start_x;
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
+                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
+                wrapper::vstore(output_ptr + x, wrapper::vadd(vin1, vin2));
+            }
+
+            // Compute left-over elements
+            for(; x < window_end_x; ++x)
+            {
+                *(output_ptr + x) = static_cast<int16_t>(*(input1_ptr + x)) + static_cast<int16_t>(*(input2_ptr + x));
+            }
         }
-    };
-
-    return res;
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-inline float16x8x2_t vadd2q_f16(const float16x8x2_t &a, const float16x8x2_t &b)
-{
-    const float16x8x2_t res =
-    {
+        else
         {
-            vaddq_f16(a.val[0], b.val[0]),
-            vaddq_f16(a.val[1], b.val[1])
+            // Compute S elements per iteration
+            int x = window_start_x;
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
+                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
+                wrapper::vstore(output_ptr + x, wrapper::vqadd(vin1, vin2));
+            }
+
+            // Compute left-over elements
+            for(; x < window_end_x; ++x)
+            {
+                *(output_ptr + x) = wrapper::add_sat(static_cast<int16_t>(*(input1_ptr + x)),
+                                                     static_cast<int16_t>(*(input2_ptr + x)));
+            }
         }
-    };
-
-    return res;
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-void add_F16_F16_F16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
-    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
-    Iterator output(out, window);
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const float16x8x2_t a = vld2q_f16(reinterpret_cast<const float16_t *>(input1.ptr()));
-        const float16x8x2_t b = vld2q_f16(reinterpret_cast<const float16_t *>(input2.ptr()));
-
-        vst2q_f16(reinterpret_cast<float16_t *>(output.ptr()), vadd2q_f16(a, b));
-    },
-    input1, input2, output);
-#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-    ARM_COMPUTE_UNUSED(in1);
-    ARM_COMPUTE_UNUSED(in2);
-    ARM_COMPUTE_UNUSED(out);
-    ARM_COMPUTE_UNUSED(window);
-    ARM_COMPUTE_ERROR("Not supported, recompile the library with arch=arm64-v8.2-a");
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-}
-
-void add_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
-    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
-    Iterator output(out, window);
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const float32x4x4_t a = vld4q_f32(reinterpret_cast<const float *>(input1.ptr()));
-        const float32x4x4_t b = vld4q_f32(reinterpret_cast<const float *>(input2.ptr()));
-
-        vst4q_f32(reinterpret_cast<float *>(output.ptr()), vadd4q_f32(a, b));
-    },
-    input1, input2, output);
-}
-
-void add_wrap_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
-    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
-    Iterator output(out, window);
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const int16x8x2_t a = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
-        const int16x8x2_t b = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
-
-        vst2q_s16(reinterpret_cast<int16_t *>(output.ptr()), vadd2q_s16(a, b));
-    },
-    input1, input2, output);
-}
-
-void add_saturate_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
-    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
-    Iterator output(out, window);
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const int16x8x2_t a = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
-        const int16x8x2_t b = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
-
-        vst2q_s16(reinterpret_cast<int16_t *>(output.ptr()), vqadd2q_s16(a, b));
-    },
-    input1, input2, output);
-}
-
-void add_wrap_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
-    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
-    Iterator output(out, window);
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const int16x8x2_t a =
-        {
-            {
-                vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr())),
-                vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8)
-            }
-        };
-        const uint8x16_t b = vld1q_u8(input2.ptr());
-
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vaddq_s16(a.val[0], vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b)))));
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vaddq_s16(a.val[1], vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)))));
-    },
-    input1, input2, output);
-}
-
-void add_saturate_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
-    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
-    Iterator output(out, window);
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const int16x8x2_t a =
-        {
-            {
-                vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr())),
-                vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8)
-            }
-        };
-        const uint8x16_t b = vld1q_u8(input2.ptr());
-
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vqaddq_s16(a.val[0], vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b)))));
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vqaddq_s16(a.val[1], vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)))));
-    },
-    input1, input2, output);
-}
-
-inline void add_wrap_U8_S16_S16(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window)
-{
-    //Simply swap the two input buffers:
-    add_wrap_S16_U8_S16(input2, input1, output, window);
-}
-
-inline void add_saturate_U8_S16_S16(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window)
-{
-    //Simply swap the two input buffers:
-    add_saturate_S16_U8_S16(input2, input1, output, window);
-}
-
-void add_wrap_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
-    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
-    Iterator output(out, window);
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const uint8x16_t a = vld1q_u8(input1.ptr());
-        const uint8x16_t b = vld1q_u8(input2.ptr());
-
-        const int16x8x2_t a_s16 =
-        {
-            {
-                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
-                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a)))
-            }
-        };
-
-        const int16x8x2_t b_s16 =
-        {
-            {
-                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b))),
-                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)))
-            }
-        };
-
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vaddq_s16(a_s16.val[0], b_s16.val[0]));
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vaddq_s16(a_s16.val[1], b_s16.val[1]));
-    },
-    input1, input2, output);
-}
-
-void add_saturate_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
-    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
-    Iterator output(out, window);
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const uint8x16_t a = vld1q_u8(input1.ptr());
-        const uint8x16_t b = vld1q_u8(input2.ptr());
-
-        const int16x8x2_t a_s16 =
-        {
-            {
-                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
-                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a)))
-            }
-        };
-
-        const int16x8x2_t b_s16 =
-        {
-            {
-                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b))),
-                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)))
-            }
-        };
-
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vqaddq_s16(a_s16.val[0], b_s16.val[0]));
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vqaddq_s16(a_s16.val[1], b_s16.val[1]));
     },
     input1, input2, output);
 }
@@ -332,12 +477,15 @@
     ARM_COMPUTE_UNUSED(policy);
 
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input1);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
 
     const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((input1.tensor_shape().x() != input2.tensor_shape().x()) && ((input1.data_type() != input2.data_type()) || (input1.data_type() != output.data_type())
+                                                                                                 || (input2.data_type() != output.data_type())),
+                                    "Broadcasting across width is supported on configurations where all tensors have the same data type");
 
     // Validate in case of configured output
     if(output.total_size() > 0)
@@ -349,7 +497,8 @@
             && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
             && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
             && !(input1.data_type() == DataType::F32 && input2.data_type() == DataType::F32 && output.data_type() == DataType::F32)
-            && !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16 && output.data_type() == DataType::F16),
+            && !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16 && output.data_type() == DataType::F16)
+            && !(input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8 && output.data_type() == DataType::QASYMM8),
             "You called addition with the wrong image formats");
 
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
@@ -381,29 +530,26 @@
         {
             set_format_if_unknown(output, Format::F32);
         }
+        else if(input1.data_type() == DataType::QASYMM8 || input2.data_type() == DataType::QASYMM8)
+        {
+            set_data_type_if_unknown(output, DataType::QASYMM8);
+        }
     }
 
-    Window win        = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
-    Window win_input1 = win.broadcast_if_dimension_le_one(input1);
-    Window win_input2 = win.broadcast_if_dimension_le_one(input2);
+    Window win = calculate_max_window(valid_region, Steps());
 
-    AccessWindowHorizontal input1_access(&input1, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal input2_access(&input2, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration);
+    // NEArithmeticAdditionKernel doesn't need padding so update_window_and_padding() can be skipped
+    Coordinates coord;
+    coord.set_num_dimensions(output.num_dimensions());
+    output.set_valid_region(valid_region);
 
-    bool window_changed = update_window_and_padding(win_input1, input1_access)
-                          || update_window_and_padding(win_input2, input2_access)
-                          || update_window_and_padding(win, output_access);
-
-    output_access.set_valid_region(win, valid_region);
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
+    return std::make_pair(Status{}, win);
 }
 } // namespace
 
 NEArithmeticAdditionKernel::NEArithmeticAdditionKernel()
-    : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr)
+    : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _policy()
 {
 }
 
@@ -418,25 +564,30 @@
 
     static std::map<std::string, AddFunction *> map_function =
     {
-        { "add_wrap_U8_U8_U8", &add_wrap_U8_U8_U8 },
-        { "add_saturate_U8_U8_U8", &add_saturate_U8_U8_U8 },
-        { "add_wrap_S16_U8_S16", &add_wrap_S16_U8_S16 },
-        { "add_saturate_S16_U8_S16", &add_saturate_S16_U8_S16 },
-        { "add_wrap_U8_S16_S16", &add_wrap_U8_S16_S16 },
-        { "add_saturate_U8_S16_S16", &add_saturate_U8_S16_S16 },
-        { "add_wrap_U8_U8_S16", &add_wrap_U8_U8_S16 },
-        { "add_saturate_U8_U8_S16", &add_saturate_U8_U8_S16 },
-        { "add_wrap_S16_S16_S16", &add_wrap_S16_S16_S16 },
-        { "add_saturate_S16_S16_S16", &add_saturate_S16_S16_S16 },
-        { "add_wrap_F32_F32_F32", &add_F32_F32_F32 },
-        { "add_saturate_F32_F32_F32", &add_F32_F32_F32 },
-        { "add_wrap_F16_F16_F16", &add_F16_F16_F16 },
-        { "add_saturate_F16_F16_F16", &add_F16_F16_F16 },
+        { "add_wrap_QASYMM8_QASYMM8_QASYMM8", &add_QASYMM8_QASYMM8_QASYMM8 },
+        { "add_saturate_QASYMM8_QASYMM8_QASYMM8", &add_QASYMM8_QASYMM8_QASYMM8 },
+        { "add_wrap_U8_U8_U8", &add_same<uint8_t, false> },
+        { "add_saturate_U8_U8_U8", &add_same<uint8_t, true> },
+        { "add_wrap_S16_U8_S16", &add_S16_U8_S16 },
+        { "add_saturate_S16_U8_S16", &add_S16_U8_S16 },
+        { "add_wrap_U8_S16_S16", &add_U8_S16_S16 },
+        { "add_saturate_U8_S16_S16", &add_U8_S16_S16 },
+        { "add_wrap_U8_U8_S16", &add_U8_U8_S16 },
+        { "add_saturate_U8_U8_S16", &add_U8_U8_S16 },
+        { "add_wrap_S16_S16_S16", &add_same<int16_t, false> },
+        { "add_saturate_S16_S16_S16", &add_same<int16_t, true> },
+        { "add_wrap_F32_F32_F32", &add_same<float, false> },
+        { "add_saturate_F32_F32_F32", &add_same<float, false> },
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        { "add_wrap_F16_F16_F16", &add_same<float16_t, false> },
+        { "add_saturate_F16_F16_F16", &add_same<float16_t, false> },
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
     };
 
     _input1 = input1;
     _input2 = input2;
     _output = output;
+    _policy = policy;
 
     std::string function_to_call("add_");
     function_to_call += policy == ConvertPolicy::WRAP ? "wrap_" : "saturate_";
@@ -471,12 +622,5 @@
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
     ARM_COMPUTE_ERROR_ON(_func == nullptr);
 
-    (*_func)(_input1, _input2, _output, window);
-}
-
-BorderSize NEArithmeticAdditionKernel::border_size() const
-{
-    const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
-    const unsigned int border        = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
-    return BorderSize(0, border, 0, 0);
+    (*_func)(_input1, _input2, _output, _policy, window);
 }
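
The new add_QASYMM8_QASYMM8_QASYMM8 path presumably follows the usual dequantize-accumulate-requantize scheme for asymmetric quantization. A minimal scalar sketch of that arithmetic, with hypothetical parameter names (the kernel itself operates on 16-lane NEON vectors):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Scalar model of a QASYMM8 addition: dequantize both inputs, add in float,
    // then requantize with the output tensor's scale/offset and saturate to U8.
    uint8_t qasymm8_add(uint8_t a, uint8_t b,
                        float scale1, int offset1,
                        float scale2, int offset2,
                        float scale_out, int offset_out)
    {
        const float fa = scale1 * (static_cast<int>(a) - offset1);
        const float fb = scale2 * (static_cast<int>(b) - offset2);
        const int   q  = static_cast<int>(std::lround((fa + fb) / scale_out)) + offset_out;
        return static_cast<uint8_t>(std::min(255, std::max(0, q)));
    }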
diff --git a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
index c1e3e1f..ed83286 100644
--- a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
@@ -42,10 +42,10 @@
 
 namespace
 {
-template <typename T, int S>
+template <typename T>
 inline void bitwise_and(const T *__restrict input1, const T *__restrict input2, T *__restrict output)
 {
-    using type      = typename wrapper::traits::neon_vector<T, S>::type;
+    using type      = typename wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>::type;
     const type val1 = vloadq(static_cast<const T *>(input1));
     const type val2 = vloadq(static_cast<const T *>(input2));
 
@@ -108,7 +108,7 @@
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
-        bitwise_and<uint8_t, 16>(input1.ptr(), input2.ptr(), output.ptr());
+        bitwise_and<uint8_t>(input1.ptr(), input2.ptr(), output.ptr());
     },
     input1, input2, output);
 }
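
The bitwise_and template now derives its vector type from the neon_bitvector trait with an explicit 128-bit width tag instead of an explicit lane count. For T = uint8_t that resolves to a 16-byte vector, so the operation is equivalent to the plain intrinsic form sketched below (illustrative only, not library code):

    #include <arm_neon.h>
    #include <cstdint>

    // One 128-bit AND: load 16 bytes from each input, AND them, store 16 bytes.
    inline void bitwise_and_u8x16(const uint8_t *in1, const uint8_t *in2, uint8_t *out)
    {
        vst1q_u8(out, vandq_u8(vld1q_u8(in1), vld1q_u8(in2)));
    }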
diff --git a/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp
index f8217d3..b2b0dbd 100644
--- a/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -38,7 +38,7 @@
 {
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+    // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input,
                                                          1,
                                                          DataType::U8, DataType::S8, DataType::QASYMM8,
diff --git a/src/core/NEON/kernels/NECol2ImKernel.cpp b/src/core/NEON/kernels/NECol2ImKernel.cpp
index d6517ac..e3661ee 100644
--- a/src/core/NEON/kernels/NECol2ImKernel.cpp
+++ b/src/core/NEON/kernels/NECol2ImKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -53,6 +53,7 @@
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_col2im_shape(*input, convolved_dims, false));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
     }
 
     return Status{};
diff --git a/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
index 8c875cd..8352c94 100644
--- a/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,6 +27,7 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
 #include "arm_compute/core/NEON/NEFixedPoint.h"
 #include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
@@ -57,14 +58,30 @@
     Iterator input(in, window);
     Iterator output(out, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    const DataType          dt           = in->info()->data_type();
+    const QuantizationInfo &input_qinfo  = in->info()->quantization_info();
+    const QuantizationInfo &output_qinfo = out->info()->quantization_info();
+    if(dt == DataType::QASYMM8 && input_qinfo != output_qinfo)
     {
-        const auto in_ptr  = reinterpret_cast<const T *>(input_ptr + input.offset());
-        const auto out_ptr = reinterpret_cast<T *>(output_ptr + output.offset());
+        execute_window_loop(window, [&](const Coordinates &)
+        {
+            const auto in_ptr  = reinterpret_cast<const uint8_t *>(input_ptr + input.offset());
+            const auto out_ptr = reinterpret_cast<uint8_t *>(output_ptr + output.offset());
+            vst1q_u8(out_ptr, vquantize(vdequantize(vld1q_u8(in_ptr), input_qinfo), output_qinfo));
+        },
+        input, output);
+    }
+    else
+    {
+        execute_window_loop(window, [&](const Coordinates &)
+        {
+            const auto in_ptr  = reinterpret_cast<const T *>(input_ptr + input.offset());
+            const auto out_ptr = reinterpret_cast<T *>(output_ptr + output.offset());
 
-        wrapper::vstore(out_ptr, wrapper::vloadq(in_ptr));
-    },
-    input, output);
+            wrapper::vstore(out_ptr, wrapper::vloadq(in_ptr));
+        },
+        input, output);
+    }
 }
 
 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsigned int depth_offset, ITensorInfo *output)
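
When the input and output QASYMM8 tensors carry different QuantizationInfo, the concatenation path above requantizes while copying via vquantize(vdequantize(...)). As a worked example of that mapping (assumed values): with an input of scale 0.5 and offset 10 and an output of scale 0.25 and offset 0, a stored byte of 30 dequantizes to 0.5 * (30 - 10) = 10.0f and requantizes to round(10.0 / 0.25) + 0 = 40. When the quantization parameters match, the else branch falls back to a raw vector copy.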
diff --git a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
index 158f401..5433755 100644
--- a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/NEON/NEMath.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 
@@ -43,10 +44,13 @@
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output);
     ARM_COMPUTE_UNUSED(policy);
     ARM_COMPUTE_RETURN_ERROR_ON(input == output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::U16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S16, DataType::U16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(shift >= 8);
 
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8 && (output->data_type() != DataType::F16 && output->data_type() != DataType::F32),
+                                    "Only data_types supported [in] QASYMM8 -> [out] F16, F32");
+
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::U8 && (output->data_type() != DataType::S16 && output->data_type() != DataType::U16
                                                                            && output->data_type() != DataType::S32),
                                     "Only data_types supported [in] U8 -> [out] U16, S16, S32");
@@ -57,11 +61,11 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::S16 && (output->data_type() != DataType::U8 && output->data_type() != DataType::S32),
                                     "Only data_types supported [in] S16 ->  [out] U8, S32");
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F16 && output->data_type() != DataType::F32,
-                                    "Only data_types supported [in] F16 ->  [out] F32");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F16 && (output->data_type() != DataType::QASYMM8 && output->data_type() != DataType::F32),
+                                    "Only data_types supported [in] F16 ->  [out] QASYMM8, F32");
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F32 && output->data_type() != DataType::F16,
-                                    "Only data_types supported [in] F32 ->  [out] F16");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F32 && (output->data_type() != DataType::QASYMM8 && output->data_type() != DataType::F16),
+                                    "Only data_types supported [in] F32 ->  [out] QASYMM8, F16");
 
     // Validate in case of configured output
     if(output->total_size() > 0)
@@ -134,6 +138,75 @@
 
     switch(_input->info()->data_type())
     {
+        case DataType::QASYMM8:
+        {
+            switch(_output->info()->data_type())
+            {
+                /* Up-conversion QASYMM8 -> F32 */
+                case DataType::F32:
+                {
+                    const float32x4_t scale  = vdupq_n_f32(_input->info()->quantization_info().scale);
+                    const int32x4_t   offset = vdupq_n_s32(_input->info()->quantization_info().offset);
+
+                    execute_window_loop(window, [&](const Coordinates & id)
+                    {
+                        const uint8x16_t   texels_u8 = vld1q_u8(input.ptr());
+                        const uint16x8x2_t texels_u16 =
+                        {
+                            {
+                                vmovl_u8(vget_low_u8(texels_u8)),
+                                vmovl_u8(vget_high_u8(texels_u8))
+                            }
+                        };
+
+                        const int32x4x4_t texels_s32 =
+                        {
+                            {
+                                vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(texels_u16.val[0]))),
+                                vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(texels_u16.val[0]))),
+                                vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(texels_u16.val[1]))),
+                                vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(texels_u16.val[1])))
+                            }
+                        };
+
+                        vst1q_f32(reinterpret_cast<float *>(output.ptr()), vmulq_f32(vcvtq_f32_s32(vsubq_s32(texels_s32.val[0], offset)), scale));
+                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, vmulq_f32(vcvtq_f32_s32(vsubq_s32(texels_s32.val[1], offset)), scale));
+                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, vmulq_f32(vcvtq_f32_s32(vsubq_s32(texels_s32.val[2], offset)), scale));
+                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, vmulq_f32(vcvtq_f32_s32(vsubq_s32(texels_s32.val[3], offset)), scale));
+                    },
+                    input, output);
+                    break;
+                }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                /* Up-conversion QASYMM8 -> F16 */
+                case DataType::F16:
+                {
+                    const float16x8_t scale  = vdupq_n_f16(static_cast<float16_t>(_input->info()->quantization_info().scale));
+                    const int16x8_t   offset = vdupq_n_s16(static_cast<int16_t>(_input->info()->quantization_info().offset));
+
+                    execute_window_loop(window, [&](const Coordinates & id)
+                    {
+                        const uint8x16_t  texels_u8 = vld1q_u8(input.ptr());
+                        const int16x8x2_t texels_s16 =
+                        {
+                            {
+                                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
+                                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
+                            }
+                        };
+
+                        vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vmulq_f16(vcvtq_f16_s16(vsubq_s16(texels_s16.val[0], offset)), scale));
+                        vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()) + 8, vmulq_f16(vcvtq_f16_s16(vsubq_s16(texels_s16.val[1], offset)), scale));
+                    },
+                    input, output);
+                    break;
+                }
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+                default:
+                    ARM_COMPUTE_ERROR("Output data type not supported");
+            }
+            break;
+        }
         case DataType::U8:
         {
             const int16x8_t b = vdupq_n_s16(_shift);
@@ -367,6 +440,31 @@
         case DataType::F16:
             switch(_output->info()->data_type())
             {
+                case DataType::QASYMM8:
+                {
+                    const float16x8_t scale        = vinvq_f16(vdupq_n_f16(static_cast<float16_t>(_output->info()->quantization_info().scale)));
+                    const int16x8_t   offset       = vdupq_n_s16(static_cast<int16_t>(_output->info()->quantization_info().offset));
+                    const int16x8_t   max_val_vec  = vdupq_n_s16(255);
+                    const int16x8_t   zero_val_vec = vdupq_n_s16(0);
+
+                    /* Down-conversion F16 -> QASYMM8 */
+                    execute_window_loop(window, [&](const Coordinates & id)
+                    {
+                        const float16x8x2_t texels =
+                        {
+                            {
+                                vmulq_f16(vld1q_f16(reinterpret_cast<float16_t *>(input.ptr())), scale),
+                                vmulq_f16(vld1q_f16(reinterpret_cast<float16_t *>(input.ptr()) + 8), scale),
+                            }
+                        };
+
+                        const auto texel_quantized_0 = vmaxq_s16(vminq_s16(vaddq_s16(vcvtq_s16_f16(texels.val[0]), offset), max_val_vec), zero_val_vec);
+                        const auto texel_quantized_1 = vmaxq_s16(vminq_s16(vaddq_s16(vcvtq_s16_f16(texels.val[1]), offset), max_val_vec), zero_val_vec);
+                        vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), vcombine_u8(vqmovun_s16(texel_quantized_0), vqmovun_s16(texel_quantized_1)));
+                    },
+                    input, output);
+                    break;
+                }
                 case DataType::F32:
                 {
                     const float32x4_t scale = vdupq_n_f32(1 << _shift);
@@ -394,9 +492,44 @@
                     ARM_COMPUTE_ERROR("Output data type not supported");
             }
             break;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
         case DataType::F32:
             switch(_output->info()->data_type())
             {
+                case DataType::QASYMM8:
+                {
+                    const float32x4_t scale        = vinvq_f32(vdupq_n_f32(_output->info()->quantization_info().scale));
+                    const int32x4_t   offset       = vdupq_n_s32(_output->info()->quantization_info().offset);
+                    const int32x4_t   max_val_vec  = vdupq_n_s32(255);
+                    const int32x4_t   zero_val_vec = vdupq_n_s32(0);
+
+                    /* Down-conversion F32 -> QASYMM8 */
+                    execute_window_loop(window, [&](const Coordinates & id)
+                    {
+                        const float32x4x4_t texels =
+                        {
+                            {
+                                vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr())), scale),
+                                vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 4), scale),
+                                vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 8), scale),
+                                vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 12), scale)
+                            }
+                        };
+
+                        const auto texel_quantized_0 = vmaxq_s32(vminq_s32(vaddq_s32(vcvtq_s32_f32(texels.val[0]), offset), max_val_vec), zero_val_vec);
+                        const auto texel_quantized_1 = vmaxq_s32(vminq_s32(vaddq_s32(vcvtq_s32_f32(texels.val[1]), offset), max_val_vec), zero_val_vec);
+                        const auto texel_quantized_2 = vmaxq_s32(vminq_s32(vaddq_s32(vcvtq_s32_f32(texels.val[2]), offset), max_val_vec), zero_val_vec);
+                        const auto texel_quantized_3 = vmaxq_s32(vminq_s32(vaddq_s32(vcvtq_s32_f32(texels.val[3]), offset), max_val_vec), zero_val_vec);
+
+                        const auto converted_0 = vqmovn_u16(vcombine_u16(vqmovun_s32(texel_quantized_0), vqmovun_s32(texel_quantized_1)));
+                        const auto converted_1 = vqmovn_u16(vcombine_u16(vqmovun_s32(texel_quantized_2), vqmovun_s32(texel_quantized_3)));
+
+                        vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), vcombine_u8(converted_0, converted_1));
+                    },
+                    input, output);
+                    break;
+                }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F16:
                 {
                     const float32x4_t scale = vdupq_n_f32(1.f / (1 << _shift));
@@ -420,11 +553,11 @@
                     input, output);
                     break;
                 }
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
                 default:
                     ARM_COMPUTE_ERROR("Output data type not supported");
             }
             break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
         default:
             ARM_COMPUTE_ERROR("Not supported");
     }
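
The new QASYMM8 branches above apply the standard affine mapping in both directions: up-conversion computes scale * (q - offset) per element, while down-conversion scales by 1/scale, adds the offset, and clamps to [0, 255] before narrowing to U8. As a worked example of the F32 -> QASYMM8 path (assumed values): with scale 0.1 and offset 5, an input of 30.0f becomes 30.0 * 10 + 5 = 305, which the vmin/vmax pair clamps to 255. Note that vcvtq_s32_f32 truncates toward zero, so this path does not round to nearest.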
diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
index 99bdb7a..6071153 100644
--- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,7 @@
 #include "arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
 
 #include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
@@ -146,6 +147,7 @@
 
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, bool is_optimized)
 {
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
 
@@ -347,6 +349,7 @@
 
 void NEDepthwiseConvolutionLayer3x3Kernel::generate_convolver()
 {
+    ARM_COMPUTE_ERROR_ON_CPU_F16_UNSUPPORTED(_input);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(_input, _weights);
     ARM_COMPUTE_ERROR_ON(_weights->info()->dimension(1) != 3 || _weights->info()->dimension(2) != 3);
diff --git a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
index e8fb8cd..62373e3 100644
--- a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -47,6 +47,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input->data_type()) && has_bias);
     ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(2) * depth_multiplier) != output->dimension(2));
     ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != (kernel_dims.width * kernel_dims.height + ((has_bias) ? 1 : 0)));
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
 
     return Status{};
 }
diff --git a/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp b/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp
index 921582a..37269ca 100644
--- a/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -51,6 +51,7 @@
         TensorShape output_shape = compute_vector_to_tensor_output_shape(input->tensor_shape(), conv_w, conv_h, output->data_layout());
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
     }
 
     return Status{};
diff --git a/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp
index 77ab5ad..b0e1fcb 100644
--- a/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -91,6 +91,7 @@
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
         ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != input->dimension(2));
         ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
     }
 
     return Status{};
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
index a571d54..09836f1 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
@@ -431,21 +431,13 @@
     uint8x16_t      min                           = vdupq_n_u8(0);
     uint8x16_t      max                           = vdupq_n_u8(255);
 
-    Window window_bias = window;
-    window_bias.set(Window::DimY, Window::Dimension(0, 0, 0));
-    window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0));
-    window_bias.set(3, Window::Dimension(0, 0, 0));
-
     Iterator in(input, window);
-    Iterator bi(bias, window_bias);
-
     Iterator out(output, window);
     execute_window_loop(window, [&](const Coordinates & id)
     {
-        // Get bias and pointer to input
+        // Get pointer to input
         const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr());
 
-        // Accumulate bias
         int32x4x4_t v_in =
         {
             {
@@ -459,7 +451,7 @@
         const auto out_ptr = out.ptr();
         vst1q_u8(out_ptr, finalize_quantization<false>(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max));
     },
-    in, bi, out);
+    in, out);
 }
 } // namespace
 
@@ -498,6 +490,8 @@
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     INEKernel::configure(win_config.second);
 
+    const bool has_bias = bias != nullptr;
+
     // Set appropriate function
     if(input->info()->data_layout() == DataLayout::NCHW)
     {
@@ -511,13 +505,27 @@
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
             case DataType::F16:
             {
-                _func = (output == nullptr) ? &output_stage_nchw<float16_t, float16_t, true, true> : &output_stage_nchw<float16_t, float16_t, false, true>;
+                if(has_bias)
+                {
+                    _func = (output == nullptr) ? &output_stage_nchw<float16_t, float16_t, true, true> : &output_stage_nchw<float16_t, float16_t, false, true>;
+                }
+                else
+                {
+                    _func = (output == nullptr) ? &output_stage_nchw<float16_t, float16_t, true, false> : &output_stage_nchw<float16_t, float16_t, false, false>;
+                }
                 break;
             }
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
             case DataType::F32:
             {
-                _func = (output == nullptr) ? &output_stage_nchw<float, float, true, true> : &output_stage_nchw<float, float, false, true>;
+                if(has_bias)
+                {
+                    _func = (output == nullptr) ? &output_stage_nchw<float, float, true, true> : &output_stage_nchw<float, float, false, true>;
+                }
+                else
+                {
+                    _func = (output == nullptr) ? &output_stage_nchw<float, float, true, false> : &output_stage_nchw<float, float, false, false>;
+                }
                 break;
             }
             default:
@@ -532,19 +540,33 @@
         {
             case DataType::S32:
             {
-                _func = (output == nullptr) ? &output_stage_nhwc<int32_t, uint8_t, false, false> : &output_stage_nhwc<int32_t, uint8_t, false, true>;
+                _func = (bias == nullptr) ? &output_stage_nhwc<int32_t, uint8_t, false, false> : &output_stage_nhwc<int32_t, uint8_t, false, true>;
                 break;
             }
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
             case DataType::F16:
             {
-                _func = (output == nullptr) ? &output_stage_nhwc<float16_t, float16_t, true, true> : &output_stage_nhwc<float16_t, float16_t, false, true>;
+                if(has_bias)
+                {
+                    _func = (output == nullptr) ? &output_stage_nhwc<float16_t, float16_t, true, true> : &output_stage_nhwc<float16_t, float16_t, false, true>;
+                }
+                else
+                {
+                    _func = (output == nullptr) ? &output_stage_nhwc<float16_t, float16_t, true, false> : &output_stage_nhwc<float16_t, float16_t, false, false>;
+                }
                 break;
             }
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
             case DataType::F32:
             {
-                _func = (output == nullptr) ? &output_stage_nhwc<float, float, true, true> : &output_stage_nhwc<float, float, false, true>;
+                if(has_bias)
+                {
+                    _func = (output == nullptr) ? &output_stage_nhwc<float, float, true, true> : &output_stage_nhwc<float, float, false, true>;
+                }
+                else
+                {
+                    _func = (output == nullptr) ? &output_stage_nhwc<float, float, true, false> : &output_stage_nhwc<float, float, false, false>;
+                }
                 break;
             }
             default:
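
A hedged reading of the reworked dispatch above: for the float paths, the third template parameter of output_stage_nchw/output_stage_nhwc acts as the in-place flag (true when output is nullptr and the result is written back to the input) and the fourth as the has-bias flag, giving four instantiations per data type now that the bias accumulation is optional.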
diff --git a/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp b/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp
new file mode 100644
index 0000000..aa458c2
--- /dev/null
+++ b/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp
@@ -0,0 +1,930 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <algorithm>
+#include <arm_neon.h>
+#include <cstdint>
+#include <map>
+#include <string>
+
+namespace arm_compute
+{
+class Coordinates;
+
+namespace
+{
+float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
+{
+    qasymm8x16_t        x = vld1q_u8(input1_ptr);
+    const float32x4x4_t out =
+    {
+        {
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(x))))), offset)), scale),
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(x))))), offset)), scale),
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(x))))), offset)), scale),
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(x))))), offset)), scale),
+        }
+    };
+    return out;
+}
+
+void store_quantized(uint8_t *output_ptr, const uint32x4x4_t &out)
+{
+    const uint8x8_t pa = vqmovn_u16(vcombine_u16(vqmovn_u32(out.val[0]), vqmovn_u32(out.val[1])));
+    const uint8x8_t pb = vqmovn_u16(vcombine_u16(vqmovn_u32(out.val[2]), vqmovn_u32(out.val[3])));
+    vst1q_u8(output_ptr, vcombine_u8(pa, pb));
+}
+
+void store_quantized(uint8_t *output_ptr, const int32x4x4_t &out)
+{
+    const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1])));
+    const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3])));
+    vst1q_u8(output_ptr, vcombine_u8(pa, pb));
+}
+
+void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)
+{
+    int32x4x4_t out =
+    {
+        {
+            vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)),
+            vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)),
+            vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)),
+            vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)),
+        }
+    };
+    store_quantized(output_ptr, out);
+}
+
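// Editorial note (not part of the patch): the offset vector passed to the overload above is
// expected to already carry a +0.5f rounding bias (see elementwise_op_quantized further down),
// so the truncating vcvtq_s32_f32 conversion effectively rounds positive results to nearest.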
+float32x4x4_t dup_quantized(qasymm8_t broadcast_value, int offset, float scale)
+{
+    const qasymm8x16_t broadcast_value_vec = vdupq_n_u8(broadcast_value);
+    const int32x4_t    voffset             = vdupq_n_s32(offset);
+    const float32x4_t  vscale              = vdupq_n_f32(scale);
+
+    const float32x4x4_t broadcast_vector =
+    {
+        {
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset)), vscale),
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset)), vscale),
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset)), vscale),
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset)), vscale),
+        }
+    };
+    return broadcast_vector;
+}
+
+template <ArithmeticOperation op, typename ScalarType>
+inline ScalarType elementwise_arithm_op_scalar(const ScalarType &a, const ScalarType &b)
+{
+    auto res = ScalarType(0);
+
+    switch(op)
+    {
+        case ArithmeticOperation::MAX:
+            res = std::max(a, b);
+            break;
+        case ArithmeticOperation::MIN:
+            res = std::min(a, b);
+            break;
+        case ArithmeticOperation::SQUARED_DIFF:
+        {
+            res = (a - b) * (a - b);
+            break;
+        }
+        case ArithmeticOperation::DIV:
+        {
+            res = a / b;
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+    }
+    return res;
+}
+
+template <ArithmeticOperation op>
+inline uint8_t elementwise_arithm_op_quantized_scalar(const float &a, const float &b, QuantizationInfo qinfo)
+{
+    return qinfo.quantize(elementwise_arithm_op_scalar<op>(a, b), RoundingPolicy::TO_NEAREST_UP);
+}
+
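// Editorial note (assumed values, not library code): as an illustration of the scalar quantized
// fallback above, with op = ArithmeticOperation::SQUARED_DIFF, a = 1.5f, b = 0.5f and an output
// QuantizationInfo of (scale = 0.1f, offset = 0), the result is
// qinfo.quantize((1.5 - 0.5) * (1.5 - 0.5)) = round(1.0f / 0.1f) + 0 = 10.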
+template <ArithmeticOperation op, typename VectorType>
+inline VectorType elementwise_arithm_op(const VectorType &a, const VectorType &b)
+{
+    VectorType res = { 0, 0, 0, 0 };
+
+    switch(op)
+    {
+        case ArithmeticOperation::MAX:
+            res = wrapper::vmax(a, b);
+            break;
+        case ArithmeticOperation::MIN:
+            res = wrapper::vmin(a, b);
+            break;
+        case ArithmeticOperation::SQUARED_DIFF:
+        {
+            const VectorType tmp = wrapper::vsub(a, b);
+            res                  = wrapper::vmul(tmp, tmp);
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+    }
+
+    return res;
+}
+
+template <>
+inline float32x4_t elementwise_arithm_op<ArithmeticOperation::DIV, float32x4_t>(const float32x4_t &a, const float32x4_t &b)
+{
+    return wrapper::vdiv(a, b);
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <>
+inline float16x8_t elementwise_arithm_op<ArithmeticOperation::DIV, float16x8_t>(const float16x8_t &a, const float16x8_t &b)
+{
+    return wrapper::vdiv(a, b);
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+template <ArithmeticOperation op>
+inline float32x4x4_t elementwise_arithm_op(const float32x4x4_t &a, const float32x4x4_t &b)
+{
+    float32x4x4_t out =
+    {
+        {
+            elementwise_arithm_op<op>(a.val[0], b.val[0]),
+            elementwise_arithm_op<op>(a.val[1], b.val[1]),
+            elementwise_arithm_op<op>(a.val[2], b.val[2]),
+            elementwise_arithm_op<op>(a.val[3], b.val[3]),
+        }
+    };
+    return out;
+}
+
+template <ArithmeticOperation op, typename ScalarType, typename VectorType>
+inline VectorType elementwise_arithm_op_broadcast(const VectorType &a, const ScalarType &broadcast_value, const bool reorder)
+{
+    VectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag());
+    return elementwise_arithm_op<op>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector);
+}
+
+template <ComparisonOperation op, typename InputScalarType>
+inline uint8_t elementwise_comp_op_scalar(const InputScalarType &a, const InputScalarType &b)
+{
+    bool res = false;
+
+    switch(op)
+    {
+        case ComparisonOperation::Equal:
+            res = (a == b);
+            break;
+        case ComparisonOperation::NotEqual:
+            res = (a != b);
+            break;
+        case ComparisonOperation::Greater:
+            res = (a > b);
+            break;
+        case ComparisonOperation::GreaterEqual:
+            res = (a >= b);
+            break;
+        case ComparisonOperation::Less:
+            res = (a < b);
+            break;
+        case ComparisonOperation::LessEqual:
+            res = (a <= b);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+    }
+    return res ? ~static_cast<uint8_t>(0) : static_cast<uint8_t>(0);
+}
+
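// Editorial note (not part of the patch): all comparison kernels in this file encode a true
// result as 0xFF and a false result as 0x00 in the U8 output, mirroring the all-ones/all-zeros
// lanes produced by the NEON vceq/vcgt/vcge intrinsics, so the output can be used as a byte mask.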
+template <ComparisonOperation op>
+inline uint8_t elementwise_comp_op_quantized_scalar(const float &a, const float &b, QuantizationInfo qinfo)
+{
+    ARM_COMPUTE_UNUSED(qinfo);
+    return elementwise_comp_op_scalar<op>(a, b);
+}
+
+template <ComparisonOperation op, typename InputVectorType, typename OutputVectorType>
+inline OutputVectorType elementwise_comp_op(const InputVectorType &a, const InputVectorType &b)
+{
+    OutputVectorType res = { 0, 0, 0, 0 };
+
+    switch(op)
+    {
+        case ComparisonOperation::Equal:
+            res = wrapper::vceq(a, b);
+            break;
+        case ComparisonOperation::NotEqual:
+            res = wrapper::vnot(wrapper::vceq(a, b));
+            break;
+        case ComparisonOperation::Greater:
+            res = wrapper::vcgt(a, b);
+            break;
+        case ComparisonOperation::GreaterEqual:
+            res = wrapper::vcge(a, b);
+            break;
+        case ComparisonOperation::Less:
+            res = wrapper::vcgt(b, a);
+            break;
+        case ComparisonOperation::LessEqual:
+            res = wrapper::vcge(b, a);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+    }
+
+    return res;
+}
+
+template <ComparisonOperation op>
+inline uint32x4x4_t elementwise_comp_op(const float32x4x4_t &a, const float32x4x4_t &b)
+{
+    uint32x4x4_t out =
+    {
+        {
+            elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[0], b.val[0]),
+            elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[1], b.val[1]),
+            elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[2], b.val[2]),
+            elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[3], b.val[3])
+        }
+    };
+    return out;
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType, typename OutputVectorType>
+inline OutputVectorType elementwise_comp_op_broadcast(const InputVectorType &a, const InputScalarType &broadcast_value, const bool reorder)
+{
+    InputVectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag());
+    return elementwise_comp_op<op, InputVectorType, OutputVectorType>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector);
+}
+
+template <ArithmeticOperation op, typename ScalarType, typename VectorType>
+inline int elementwise_arithm_op_loop(int window_start_x, int window_end_x, int window_step_x,
+                                      const ScalarType *input1_ptr, const ScalarType *input2_ptr, ScalarType *output_ptr)
+{
+    int x = window_start_x;
+    for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    {
+        const auto a = wrapper::vloadq(input1_ptr + x);
+        const auto b = wrapper::vloadq(input2_ptr + x);
+        wrapper::vstore(output_ptr + x, elementwise_arithm_op<op>(a, b));
+    }
+    return x;
+}
+
+template <ArithmeticOperation op>
+inline int elementwise_arithm_op_quantized_loop(int window_start_x, int window_end_x, int window_step_x,
+                                                const uint8_t *input1_ptr, const uint8_t *input2_ptr, uint8_t *output_ptr,
+                                                int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2,
+                                                float32x4_t voffseto, float32x4_t invvscaleo)
+{
+    int x = window_start_x;
+    for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    {
+        // Get inputs and compute output
+        const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1);
+        const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2);
+        const float32x4x4_t rf = elementwise_arithm_op<op>(af, bf);
+        store_quantized(output_ptr + x, rf, voffseto, invvscaleo);
+    }
+    return x;
+}
+
+template <ArithmeticOperation op, typename ScalarType, typename VectorType>
+inline int elementwise_arithm_op_broadcast_loop(int window_start_x, int window_end_x, int window_step_x,
+                                                const ScalarType *non_broadcast_input_ptr, const ScalarType &broadcast_value, ScalarType *output_ptr, const bool reorder)
+{
+    int x = window_start_x;
+    for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    {
+        const auto a = wrapper::vloadq((non_broadcast_input_ptr + x));
+        wrapper::vstore(output_ptr + x, elementwise_arithm_op_broadcast<op>(a, broadcast_value, reorder));
+    }
+    return x;
+}
+
+template <ArithmeticOperation op>
+inline int elementwise_arithm_op_quantized_broadcast_loop(int window_start_x, int window_end_x, int window_step_x,
+                                                          const uint8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr,
+                                                          int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast,
+                                                          float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
+{
+    int x = window_start_x;
+    for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    {
+        const float32x4x4_t af = load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
+        const float32x4x4_t rf = elementwise_arithm_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
+        store_quantized(output_ptr + x, rf, voffseto, invvscaleo);
+    }
+    return x;
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+inline int elementwise_comp_op_16_loop(int window_start_x, int window_end_x, int window_step_x,
+                                       const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
+{
+    int x = window_start_x;
+    for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    {
+        const auto a   = wrapper::vloadq(input1_ptr + x);
+        const auto b   = wrapper::vloadq(input2_ptr + x);
+        const auto res = elementwise_comp_op<op, InputVectorType, uint16x8_t>(a, b);
+        wrapper::vstore(output_ptr + x, wrapper::vmovn(res));
+    }
+    return x;
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+inline int elementwise_comp_op_32_loop(int window_start_x, int window_end_x, int window_step_x,
+                                       const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
+{
+    int x = window_start_x;
+    for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    {
+        auto       a    = wrapper::vloadq(input1_ptr + x);
+        auto       b    = wrapper::vloadq(input2_ptr + x);
+        const auto res  = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
+        a               = wrapper::vloadq(input1_ptr + x + 4);
+        b               = wrapper::vloadq(input2_ptr + x + 4);
+        const auto res2 = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
+        wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(res), wrapper::vmovn(res2))));
+    }
+    if(x <= window_end_x - 4)
+    {
+        const auto a   = wrapper::vloadq(input1_ptr + x);
+        const auto b   = wrapper::vloadq(input2_ptr + x);
+        const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
+        for(int i = 0; i < 4; i++)
+        {
+            *(output_ptr + x + i) = wrapper::vgetlane(res, i);
+        }
+        x += 4;
+    }
+    return x;
+}
+
+template <ComparisonOperation op>
+inline int elementwise_comp_op_quantized_loop(int window_start_x, int window_end_x, int window_step_x,
+                                              const uint8_t *input1_ptr, const uint8_t *input2_ptr, uint8_t *output_ptr,
+                                              int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2,
+                                              float32x4_t voffseto, float32x4_t invvscaleo)
+{
+    ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
+    int x = window_start_x;
+    for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    {
+        const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1);
+        const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2);
+        const uint32x4x4_t  rf = elementwise_comp_op<op>(af, bf);
+        store_quantized(output_ptr + x, rf);
+    }
+    return x;
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+inline int elementwise_comp_op_broadcast_16_loop(int window_start_x, int window_end_x, int window_step_x,
+                                                 const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
+{
+    int x = window_start_x;
+    for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    {
+        const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint16x8_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
+        wrapper::vstore(output_ptr + x, wrapper::vmovn(a));
+    }
+    return x;
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+inline int elementwise_comp_op_broadcast_32_loop(int window_start_x, int window_end_x, int window_step_x,
+                                                 const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
+{
+    int x = window_start_x;
+    for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    {
+        const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq(non_broadcast_input_ptr + x), broadcast_value, reorder);
+        const auto b = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq(non_broadcast_input_ptr + x + 4), broadcast_value, reorder);
+        wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(a), wrapper::vmovn(b))));
+    }
+    if(x <= window_end_x - 4)
+    {
+        const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
+        for(int i = 0; i < 4; i++)
+        {
+            *(output_ptr + x + i) = wrapper::vgetlane(a, i);
+        }
+        x += 4;
+    }
+    return x;
+}
+
+template <ComparisonOperation op>
+inline int elementwise_comp_op_quantized_broadcast_loop(int window_start_x, int window_end_x, int window_step_x,
+                                                        const uint8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr,
+                                                        int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast,
+                                                        float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
+{
+    ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
+    int x = window_start_x;
+    for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    {
+        const float32x4x4_t af = load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
+        const uint32x4x4_t  rf = elementwise_comp_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
+        store_quantized(output_ptr + x, rf);
+    }
+    return x;
+}
+
+template <typename InputScalarType, typename OutputScalarType, typename InputVectorType>
+void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
+                    OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &),
+                    int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &, OutputScalarType *, const bool),
+                    int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, OutputScalarType *))
+{
+    // Create input windows
+    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+
+    // Clear X Dimension on execution window as we handle manually
+    Window win = window;
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    const int  window_step_x         = std::min(16 / static_cast<int>(sizeof(OutputScalarType)), 8);
+    const auto window_start_x        = static_cast<int>(window.x().start());
+    const auto window_end_x          = static_cast<int>(window.x().end());
+    const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);
+
+    if(is_broadcast_across_x)
+    {
+        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
+        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
+        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
+        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
+        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
+
+        // Clear X Dimension on execution window as we handle manually
+        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+        Iterator broadcast_input(broadcast_tensor, broadcast_win);
+        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+        Iterator output(out, win);
+
+        execute_window_loop(win, [&](const Coordinates & id)
+        {
+            auto                  output_ptr              = reinterpret_cast<OutputScalarType *>(output.ptr());
+            const auto            non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
+            const InputScalarType broadcast_value         = *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
+
+            int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_value, output_ptr, !is_broadcast_input_2);
+            for(; x < window_end_x; ++x)
+            {
+                const auto a      = *(non_broadcast_input_ptr + x);
+                *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, !is_broadcast_input_2 ? a : broadcast_value);
+            }
+        },
+        broadcast_input, non_broadcast_input, output);
+    }
+    else
+    {
+        // Clear X Dimension on execution window as we handle manually
+        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+        Iterator input1(in1, input1_win);
+        Iterator input2(in2, input2_win);
+        Iterator output(out, win);
+
+        execute_window_loop(win, [&](const Coordinates & id)
+        {
+            auto       output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+            const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
+            const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
+
+            int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr);
+            for(; x < window_end_x; ++x)
+            {
+                const auto a      = *(input1_ptr + x);
+                const auto b      = *(input2_ptr + x);
+                *(output_ptr + x) = (*scalar_func)(a, b);
+            }
+        },
+        input1, input2, output);
+    }
+}
+
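+// QASYMM8 variant of the driver above: inputs are dequantized to float32 using
+// their own offset/scale, the operation runs in float, and the result is
+// requantized with the output tensor's quantization info.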
+void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
+                              uint8_t (*scalar_func)(const float &, const float &, QuantizationInfo),
+                              int (*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t,
+                                                    float32x4_t, float32x4_t, const bool),
+                              int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *,
+                                               int32x4_t, int32x4_t, float32x4_t, float32x4_t,
+                                               float32x4_t, float32x4_t))
+{
+    // Create input windows
+    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+
+    // Clear X Dimension on execution window as we handle manually
+    Window win = window;
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    const int  window_step_x         = 16;
+    const auto window_start_x        = static_cast<int>(window.x().start());
+    const auto window_end_x          = static_cast<int>(window.x().end());
+    const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);
+
+    const float output_scale  = out->info()->quantization_info().scale;
+    const int   output_offset = out->info()->quantization_info().offset;
+
+    // Output quantization info (add 0.5 so the scaled result rounds to the nearest integer instead of truncating; halves round away from zero)
+    const float32x4_t voffseto   = vdupq_n_f32(output_offset + 0.5f);
+    const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_scale);
+
+    if(is_broadcast_across_x)
+    {
+        // Select the broadcast input on the X axis
+        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
+        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
+        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
+        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
+        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
+
+        const QuantizationInfo broadcast_qinfo     = broadcast_tensor->info()->quantization_info();
+        const QuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info();
+
+        const int32x4_t   voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset);
+        const float32x4_t vscale_non_broadcast  = vdupq_n_f32(non_broadcast_qinfo.scale);
+
+        // Clear X Dimension on execution window as we handle manually
+        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+        Iterator broadcast_input(broadcast_tensor, broadcast_win);
+        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+        Iterator output(out, win);
+
+        execute_window_loop(win, [&](const Coordinates & id)
+        {
+            const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
+            const auto output_ptr              = reinterpret_cast<uint8_t *>(output.ptr());
+
+            const uint8_t       broadcast_value  = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
+            const float32x4x4_t broadcast_vector = dup_quantized(broadcast_value, broadcast_qinfo.offset, broadcast_qinfo.scale);
+
+            int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr,
+                                      voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2);
+            for(; x < window_end_x; ++x)
+            {
+                const float afs   = scvt_f32_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo.scale, non_broadcast_qinfo.offset);
+                const float bfs   = scvt_f32_qasymm8(broadcast_value, broadcast_qinfo.scale, broadcast_qinfo.offset);
+                *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs,
+                                                   out->info()->quantization_info());
+            }
+        },
+        broadcast_input, non_broadcast_input, output);
+    }
+    else
+    {
+        // Input1 quantization info
+        const int32x4_t   voffset1 = vdupq_n_s32(in1->info()->quantization_info().offset);
+        const float32x4_t vscale1  = vdupq_n_f32(in1->info()->quantization_info().scale);
+
+        // Input2 quantization info
+        const int32x4_t   voffset2 = vdupq_n_s32(in2->info()->quantization_info().offset);
+        const float32x4_t vscale2  = vdupq_n_f32(in2->info()->quantization_info().scale);
+
+        // Clear X Dimension on execution window as we handle manually
+        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+        const QuantizationInfo input1_qinfo = in1->info()->quantization_info();
+        const QuantizationInfo input2_qinfo = in2->info()->quantization_info();
+
+        Iterator input1(in1, input1_win);
+        Iterator input2(in2, input2_win);
+        Iterator output(out, win);
+
+        execute_window_loop(win, [&](const Coordinates & id)
+        {
+            const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
+            const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+            const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+            int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2,
+                                 vscale1, vscale2, voffseto, invvscaleo);
+            for(; x < window_end_x; ++x)
+            {
+                const float afs   = scvt_f32_qasymm8(*(input1_ptr + x), input1_qinfo.scale, input1_qinfo.offset);
+                const float bfs   = scvt_f32_qasymm8(*(input2_ptr + x), input2_qinfo.scale, input2_qinfo.offset);
+                *(output_ptr + x) = (*scalar_func)(afs, bfs, out->info()->quantization_info());
+            }
+        },
+        input1, input2, output);
+    }
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+void elementwise_comp_op_16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+    elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
+                                                              &elementwise_comp_op_scalar<op, InputScalarType>,
+                                                              &elementwise_comp_op_broadcast_16_loop<op, InputScalarType, InputVectorType>,
+                                                              &elementwise_comp_op_16_loop<op, InputScalarType, InputVectorType>);
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+void elementwise_comp_op_32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+    elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
+                                                              &elementwise_comp_op_scalar<op, InputScalarType>,
+                                                              &elementwise_comp_op_broadcast_32_loop<op, InputScalarType, InputVectorType>,
+                                                              &elementwise_comp_op_32_loop<op, InputScalarType, InputVectorType>);
+}
+
+template <ArithmeticOperation op, typename ScalarType, typename VectorType>
+void elementwise_arithm_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+    elementwise_op<ScalarType, ScalarType, VectorType>(in1, in2, out, window,
+                                                       &elementwise_arithm_op_scalar<op, ScalarType>,
+                                                       &elementwise_arithm_op_broadcast_loop<op, ScalarType, VectorType>,
+                                                       &elementwise_arithm_op_loop<op, ScalarType, VectorType>);
+}
+
+template <ArithmeticOperation op>
+void elementwise_arithm_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+    elementwise_op_quantized(in1, in2, out, window, &elementwise_arithm_op_quantized_scalar<op>,
+                             &elementwise_arithm_op_quantized_broadcast_loop<op>,
+                             &elementwise_arithm_op_quantized_loop<op>);
+}
+
+template <ComparisonOperation op>
+void elementwise_comp_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+    elementwise_op_quantized(in1, in2, out, window, &elementwise_comp_op_quantized_scalar<op>,
+                             &elementwise_comp_op_quantized_broadcast_loop<op>,
+                             &elementwise_comp_op_quantized_loop<op>);
+}
+
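+// Looks up the kernel specialisation in a map keyed by the data types of the two
+// inputs and the output ("op_<in1>_<in2>_<out>") and wraps it in a std::function;
+// returns nullptr if the combination is not supported.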
+std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)>
+configure_func(const ITensor *input1, const ITensor *input2, ITensor *output,
+               std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function)
+{
+    std::string function_to_call("op_");
+    function_to_call += string_from_data_type(input1->info()->data_type()) + "_";
+    function_to_call += string_from_data_type(input2->info()->data_type()) + "_";
+    function_to_call += string_from_data_type(output->info()->data_type());
+
+    auto it = map_function.find(function_to_call);
+
+    if(it != map_function.end())
+    {
+        auto func = it->second;
+        return [func](const ITensor * input1, const ITensor * input2, ITensor * output, const Window & window)
+        {
+            func(input1, input2, output, window);
+        };
+    }
+    return nullptr;
+}
+
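+// The FP16 specialisations below are only registered when the build has
+// __ARM_FEATURE_FP16_VECTOR_ARITHMETIC support.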
+template <ArithmeticOperation op>
+std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)>
+configure_arithm_func(const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+    static std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function =
+    {
+        { "op_F32_F32_F32", &elementwise_arithm_op<op, float, float32x4_t> },
+        { "op_S16_S16_S16", &elementwise_arithm_op<op, int16_t, int16x8_t> },
+        { "op_S32_S32_S32", &elementwise_arithm_op<op, int32_t, int32x4_t> },
+        { "op_QASYMM8_QASYMM8_QASYMM8", &elementwise_arithm_op_quantized<op> }
+    };
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    map_function["op_F16_F16_F16"] = &elementwise_arithm_op<op, float16_t, float16x8_t>;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+    return configure_func(input1, input2, output, map_function);
+}
+
+template <ComparisonOperation op>
+std::function<void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window)>
+configure_comp_func(const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+    static std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function =
+    {
+        { "op_F32_F32_U8", &elementwise_comp_op_32<op, float, float32x4_t> },
+        { "op_S16_S16_U8", &elementwise_comp_op_16<op, int16_t, int16x8_t> },
+        { "op_S32_S32_U8", &elementwise_comp_op_32<op, int32_t, int32x4_t> },
+        { "op_QASYMM8_QASYMM8_U8", &elementwise_comp_op_quantized<op> }
+    };
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    map_function["op_F16_F16_U8"] = &elementwise_comp_op_16<op, float16_t, float16x8_t>;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+    return configure_func(input1, input2, output, map_function);
+}
+} // namespace
+
+NEElementwiseOperationKernel::NEElementwiseOperationKernel()
+    : _function(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+Status NEElementwiseOperationKernel::validate_arguments_common(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::S32, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::S32, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input1);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2);
+
+    const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+
+    // Validate in case of configured output
+    if(output.total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
+                                        "Wrong shape for output");
+    }
+
+    return Status{};
+}
+
+void NEElementwiseOperationKernel::configure_common(const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+    // Configure kernel window
+    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
+    const TensorShape &out_shape    = broadcast_pair.first;
+    const ValidRegion &valid_region = broadcast_pair.second;
+
+    // Auto initialize output if not initialized
+    auto_init_if_empty(*output->info(), out_shape, 1, input1->info()->data_type());
+
+    Window win = calculate_max_window(valid_region);
+
+    _input1 = input1;
+    _input2 = input2;
+    _output = output;
+
+    INEKernel::configure(win);
+}
+
+void NEElementwiseOperationKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON(_function == nullptr);
+    _function(_input1, _input2, _output, window);
+}
+
+/** Arithmetic operators (min, max, squared_diff) */
+
+void NEArithmeticOperationKernel::configure(ArithmeticOperation op, const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info()));
+    configure_common(input1, input2, output);
+    switch(op)
+    {
+        case ArithmeticOperation::MAX:
+            _function = configure_arithm_func<ArithmeticOperation::MAX>(input1, input2, output);
+            break;
+        case ArithmeticOperation::MIN:
+            _function = configure_arithm_func<ArithmeticOperation::MIN>(input1, input2, output);
+            break;
+        case ArithmeticOperation::SQUARED_DIFF:
+            _function = configure_arithm_func<ArithmeticOperation::SQUARED_DIFF>(input1, input2, output);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+    }
+}
+
+Status NEArithmeticOperationKernel::validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
+{
+    // Validate in case of configured output
+    if(output.total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &output);
+    }
+    return validate_arguments_common(input1, input2, output);
+}
+
+Status NEArithmeticOperationKernel::validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    ARM_COMPUTE_UNUSED(op);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output));
+    return Status{};
+}
+
+/** The division operator */
+
+void NEDivisionOperationKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info()));
+    configure_common(input1, input2, output);
+    _function = configure_arithm_func<ArithmeticOperation::DIV>(input1, input2, output);
+}
+
+Status NEDivisionOperationKernel::validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::F16, DataType::F32);
+    return NEArithmeticOperationKernel::validate_arguments(input1, input2, output);
+}
+
+Status NEDivisionOperationKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output));
+    return Status{};
+}
+
+/** Comparison operators (equal, not equal, less than, greater than, less than or equal, greater than or equal) */
+
+void NEComparisonOperationKernel::configure(ComparisonOperation op, const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info()));
+    configure_common(input1, input2, output);
+    switch(op)
+    {
+        case ComparisonOperation::Equal:
+            _function = configure_comp_func<ComparisonOperation::Equal>(input1, input2, output);
+            break;
+        case ComparisonOperation::NotEqual:
+            _function = configure_comp_func<ComparisonOperation::NotEqual>(input1, input2, output);
+            break;
+        case ComparisonOperation::Greater:
+            _function = configure_comp_func<ComparisonOperation::Greater>(input1, input2, output);
+            break;
+        case ComparisonOperation::GreaterEqual:
+            _function = configure_comp_func<ComparisonOperation::GreaterEqual>(input1, input2, output);
+            break;
+        case ComparisonOperation::Less:
+            _function = configure_comp_func<ComparisonOperation::Less>(input1, input2, output);
+            break;
+        case ComparisonOperation::LessEqual:
+            _function = configure_comp_func<ComparisonOperation::LessEqual>(input1, input2, output);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+    }
+}
+
+Status NEComparisonOperationKernel::validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
+{
+    // Validate in case of configured output
+    if(output.total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8);
+    }
+    return validate_arguments_common(input1, input2, output);
+}
+
+Status NEComparisonOperationKernel::validate(ComparisonOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    ARM_COMPUTE_UNUSED(op);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output));
+    return Status{};
+}
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp b/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp
new file mode 100644
index 0000000..7ecc4d1
--- /dev/null
+++ b/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp
@@ -0,0 +1,206 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernel.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <algorithm>
+#include <arm_neon.h>
+#include <cstdint>
+#include <map>
+#include <string>
+
+namespace arm_compute
+{
+class Coordinates;
+
+namespace
+{
+template <ElementWiseUnary op, typename ScalarType>
+inline ScalarType elementwise_op_scalar(const ScalarType &a)
+{
+    switch(op)
+    {
+        case ElementWiseUnary::RSQRT:
+            return 1 / sqrt(a);
+        case ElementWiseUnary::EXP:
+            return std::exp(a);
+        default:
+            ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+    }
+}
+
+template <ElementWiseUnary op, typename VectorType>
+inline VectorType elementwise_op(const VectorType &a)
+{
+    switch(op)
+    {
+        case ElementWiseUnary::RSQRT:
+            return wrapper::vinvsqrt(a);
+        case ElementWiseUnary::EXP:
+            return wrapper::vexpq(a);
+        default:
+            ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+    }
+}
+
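+// Runs the unary operation over the window: full vectors are processed with
+// wrapper::vloadq/vstore, remaining elements with elementwise_op_scalar.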
+template <ElementWiseUnary op, typename ScalarType>
+void elementwise_op(const ITensor *in, ITensor *out, const Window &window)
+{
+    const int  window_step_x  = 16 / sizeof(ScalarType);
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    Window win = window;
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator input(in, win);
+    Iterator output(out, win);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        auto       output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+        const auto input_ptr  = reinterpret_cast<const ScalarType *>(input.ptr());
+
+        int x = window_start_x;
+        for(; x <= window_end_x - window_step_x; x += window_step_x)
+        {
+            wrapper::vstore(output_ptr + x, elementwise_op<op>(wrapper::vloadq(input_ptr + x)));
+        }
+        for(; x < window_end_x; ++x)
+        {
+            *(output_ptr + x) = elementwise_op_scalar<op>(*(input_ptr + x));
+        }
+    },
+    input, output);
+}
+
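+// Dispatches to the type-specialised kernel through a map keyed by
+// "op_<input>_<output>"; the F16 entry exists only on FP16-capable builds.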
+template <ElementWiseUnary op>
+std::function<void(const ITensor *input, ITensor *output, const Window &window)>
+configure_func(const ITensor *input, ITensor *output)
+{
+    std::string function_to_call("op_");
+    function_to_call += string_from_data_type(input->info()->data_type()) + "_";
+    function_to_call += string_from_data_type(output->info()->data_type());
+
+    static std::map<std::string, NEElementwiseUnaryKernel::ElementwiseUnaryFunction *> map_function =
+    {
+        { "op_F32_F32", &elementwise_op<op, float> }
+    };
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    map_function["op_F16_F16"] = &elementwise_op<op, float16_t>;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+    auto it = map_function.find(function_to_call);
+
+    if(it != map_function.end())
+    {
+        auto func = it->second;
+        return [func](const ITensor * input, ITensor * output, const Window & window)
+        {
+            func(input, output, window);
+        };
+    }
+    return nullptr;
+}
+} // namespace
+
+NEElementwiseUnaryKernel::NEElementwiseUnaryKernel()
+    : _function(nullptr), _input(nullptr), _output(nullptr)
+{
+}
+
+void NEElementwiseUnaryKernel::configure(ElementWiseUnary op, const ITensor *input, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input->info(), *output->info()));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    // Configure kernel window
+    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input->info());
+    const TensorShape &out_shape    = broadcast_pair.first;
+    const ValidRegion &valid_region = broadcast_pair.second;
+
+    // Auto initialize output if not initialized
+    auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type());
+
+    Window win = calculate_max_window(valid_region);
+
+    _input  = input;
+    _output = output;
+
+    INEKernel::configure(win);
+
+    switch(op)
+    {
+        case ElementWiseUnary::RSQRT:
+            _function = configure_func<ElementWiseUnary::RSQRT>(input, output);
+            break;
+        case ElementWiseUnary::EXP:
+            _function = configure_func<ElementWiseUnary::EXP>(input, output);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+    }
+}
+
+Status NEElementwiseUnaryKernel::validate_arguments(const ITensorInfo &input, const ITensorInfo &output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::F16, DataType::F32);
+
+    // Validate in case of configured output
+    if(output.total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output);
+    }
+
+    return Status{};
+}
+
+Status NEElementwiseUnaryKernel::validate(ElementWiseUnary op, const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_UNUSED(op);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *output));
+    return Status{};
+}
+
+void NEElementwiseUnaryKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON(_function == nullptr);
+    _function(_input, _output, window);
+}
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp
index aef4d48..f4046e0 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,15 +34,12 @@
 #include <algorithm>
 #include <cstdint>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
+class Coordinates;
 namespace
 {
-template <typename T, unsigned int leftx, unsigned int rightx>
-void fill_constant_value_single_channel_special(ITensor *tensor, const Window &window, unsigned int right, unsigned int bottom, const PixelValue &constant_border_value);
-
-template <>
-inline void fill_constant_value_single_channel_special<float, 1u, 1u>(ITensor *tensor, const Window &window, unsigned int right, unsigned int bottom, const PixelValue &constant_border_value)
+inline void fill_constant_value_single_channel_special(ITensor *tensor, const Window &window, unsigned int right, unsigned int bottom, const PixelValue &constant_border_value)
 {
     float border_value;
     constant_border_value.get(border_value);
@@ -93,11 +90,6 @@
 }
 } // namespace
 
-namespace arm_compute
-{
-class Coordinates;
-} // namespace arm_compute
-
 NEFillBorderKernel::NEFillBorderKernel()
     : _tensor(nullptr), _border_size(0), _mode(BorderMode::UNDEFINED), _constant_border_value(static_cast<float>(0.f))
 {
@@ -142,81 +134,19 @@
     {
         case BorderMode::CONSTANT:
         {
-            switch(_tensor->info()->data_type())
+            if(_border_size.left == 1 && _border_size.top == 1 && _tensor->info()->data_type() == DataType::F32)
             {
-                case DataType::QASYMM8:
-                case DataType::U8:
-                    fill_constant_value_single_channel<uint8_t>(window);
-                    break;
-                case DataType::S8:
-                    fill_constant_value_single_channel<int8_t>(window);
-                    break;
-                case DataType::U16:
-                    fill_constant_value_single_channel<uint16_t>(window);
-                    break;
-                case DataType::S16:
-                    fill_constant_value_single_channel<int16_t>(window);
-                    break;
-                case DataType::U32:
-                    fill_constant_value_single_channel<uint32_t>(window);
-                    break;
-                case DataType::S32:
-                    fill_constant_value_single_channel<int32_t>(window);
-                    break;
-                case DataType::F16:
-                    static_assert(sizeof(half) == 2, "Float16_t must be 16 bit");
-                    fill_constant_value_single_channel<half>(window);
-                    break;
-                case DataType::F32:
-                    static_assert(sizeof(float) == 4, "Float must be 32 bit");
-                    if(_border_size.left == 1 && _border_size.top == 1)
-                    {
-                        fill_constant_value_single_channel_special<float, 1u, 1u>(_tensor, window, _border_size.right, _border_size.bottom, _constant_border_value);
-                    }
-                    else
-                    {
-                        fill_constant_value_single_channel<float>(window);
-                    }
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not handled");
+                fill_constant_value_single_channel_special(_tensor, window, _border_size.right, _border_size.bottom, _constant_border_value);
+            }
+            else
+            {
+                fill_constant_value_single_channel(window);
             }
             break;
         }
         case BorderMode::REPLICATE:
         {
-            switch(_tensor->info()->data_type())
-            {
-                case DataType::QASYMM8:
-                case DataType::U8:
-                    fill_replicate_single_channel<uint8_t>(window);
-                    break;
-                case DataType::S8:
-                    fill_replicate_single_channel<int8_t>(window);
-                    break;
-                case DataType::U16:
-                    fill_replicate_single_channel<uint16_t>(window);
-                    break;
-                case DataType::S16:
-                    fill_replicate_single_channel<int16_t>(window);
-                    break;
-                case DataType::U32:
-                    fill_replicate_single_channel<uint32_t>(window);
-                    break;
-                case DataType::S32:
-                    fill_replicate_single_channel<int32_t>(window);
-                    break;
-                case DataType::F16:
-                    static_assert(sizeof(half) == 2, "Float16_t must be 16 bit");
-                    fill_replicate_single_channel<half>(window);
-                    break;
-                case DataType::F32:
-                    static_assert(sizeof(float) == 4, "Float must be 32 bit");
-                    fill_replicate_single_channel<float>(window);
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not handled");
-            }
+            fill_replicate_single_channel(window);
             break;
         }
         case BorderMode::UNDEFINED:
@@ -226,13 +156,12 @@
     }
 }
 
-template <typename T>
 void NEFillBorderKernel::fill_replicate_single_channel(const Window &window)
 {
     uint8_t *const start_valid_region = _tensor->ptr_to_element(_tensor->info()->valid_region().anchor);
     const size_t   width              = _tensor->info()->valid_region().shape[0];
     const size_t   height             = _tensor->info()->valid_region().shape[1];
-
+    const size_t   element_size       = _tensor->info()->element_size();
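+    // Type-agnostic replicate fill: left/right borders copy element_size bytes per
+    // element and top/bottom borders copy whole rows with std::memcpy.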
     // Left and right border
     Window vertical(window);
     vertical.set(Window::DimY, Window::Dimension(0, height, 1));
@@ -241,72 +170,18 @@
 
     execute_window_loop(vertical, [&](const Coordinates & id)
     {
-        const auto row_start = reinterpret_cast<T *>(start_valid_region + vertical_it.offset());
-        const auto left_val  = *reinterpret_cast<T *>(vertical_it.ptr());
-        const auto right_val = *(reinterpret_cast<T *>(vertical_it.ptr()) + width - 1);
-
+        uint8_t *base_addr = start_valid_region + vertical_it.offset();
         // Fill left and right borders
-        std::fill_n(row_start - _border_size.left, _border_size.left, left_val);
-        std::fill_n(row_start + width, _border_size.right, right_val);
-    },
-    vertical_it);
-
-    // Top and bottom border
-    Iterator plane_it(_tensor, window);
-
-    // Iterate over all XY planes
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const auto first_row = reinterpret_cast<T *>(start_valid_region + plane_it.offset());
-
-        // Top border
-        for(int i = -_border_size.top; i < 0; ++i)
+        for(unsigned int i = 0; i < _border_size.left; ++i)
         {
-            const auto row_start = reinterpret_cast<T *>(start_valid_region + plane_it.offset() + i * _tensor->info()->strides_in_bytes()[1]);
-
-            // Copy top rows including left/right borders
-            std::copy_n(first_row - _border_size.left, _border_size.left + width + _border_size.right, row_start - _border_size.left);
+            std::memcpy(base_addr + static_cast<int>(i - _border_size.left) * element_size, vertical_it.ptr(), element_size);
         }
 
-        const auto last_row = reinterpret_cast<T *>(start_valid_region + plane_it.offset() + (height - 1) * _tensor->info()->strides_in_bytes()[1]);
-
-        // Bottom border
-        for(unsigned int i = height; i < height + _border_size.bottom; ++i)
+        for(unsigned int i = 0; i < _border_size.right; ++i)
         {
-            const auto row_start = reinterpret_cast<T *>(start_valid_region + plane_it.offset() + i * _tensor->info()->strides_in_bytes()[1]);
-
-            // Copy bottom rows including left/right borders
-            std::copy_n(last_row - _border_size.left, _border_size.left + width + _border_size.right, row_start - _border_size.left);
+            std::memcpy(base_addr + (width + i) * element_size, vertical_it.ptr() + (width - 1) * element_size, element_size);
         }
     },
-    plane_it);
-}
-
-template <typename T>
-void NEFillBorderKernel::fill_constant_value_single_channel(const Window &window)
-{
-    T constant_border_value;
-    _constant_border_value.get(constant_border_value);
-
-    uint8_t *const start_valid_region = _tensor->ptr_to_element(_tensor->info()->valid_region().anchor);
-    const size_t   width              = _tensor->info()->valid_region().shape[0];
-    const size_t   height             = _tensor->info()->valid_region().shape[1];
-    const int      stridey            = _tensor->info()->strides_in_bytes()[1];
-
-    // Left and right border
-    Window vertical(window);
-    vertical.set(Window::DimY, Window::Dimension(0, height, 1));
-
-    Iterator vertical_it(_tensor, vertical);
-
-    execute_window_loop(vertical, [&](const Coordinates & id)
-    {
-        const auto row_start = reinterpret_cast<T *>(start_valid_region + vertical_it.offset());
-
-        // Fill left and right borders
-        std::fill_n(row_start - _border_size.left, _border_size.left, constant_border_value);
-        std::fill_n(row_start + width, _border_size.right, constant_border_value);
-    },
     vertical_it);
 
     // Top and bottom border
@@ -319,21 +194,80 @@
         // Top border
         for(int i = -_border_size.top; i < 0; ++i)
         {
-            const auto row_start = reinterpret_cast<T *>(base_addr + i * stridey);
+            // Copy top rows including left/right borders
+            std::memcpy(base_addr + i * _tensor->info()->strides_in_bytes()[1] - _border_size.left * element_size,
+                        base_addr - _border_size.left * element_size, (_border_size.left + width + _border_size.right) * element_size);
+        }
 
+        // Bottom border
+        for(unsigned int i = height; i < height + _border_size.bottom; ++i)
+        {
+            // Copy bottom rows including left/right borders
+            std::memcpy(base_addr + i * _tensor->info()->strides_in_bytes()[1] - _border_size.left * element_size,
+                        base_addr + (height - 1) * _tensor->info()->strides_in_bytes()[1] - _border_size.left * element_size, (_border_size.left + width + _border_size.right) * element_size);
+        }
+    },
+    plane_it);
+}
+
+void NEFillBorderKernel::fill_constant_value_single_channel(const Window &window)
+{
+    uint8_t *const start_valid_region = _tensor->ptr_to_element(_tensor->info()->valid_region().anchor);
+    const size_t   width              = _tensor->info()->valid_region().shape[0];
+    const size_t   height             = _tensor->info()->valid_region().shape[1];
+    const int      stridey            = _tensor->info()->strides_in_bytes()[1];
+    const size_t   element_size       = _tensor->info()->element_size();
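+    // Type-agnostic constant fill: element_size bytes of _constant_border_value are
+    // copied into every border element with std::memcpy.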
+
+    // Left and right border
+    Window vertical(window);
+    vertical.set(Window::DimY, Window::Dimension(0, height, 1));
+
+    Iterator vertical_it(_tensor, vertical);
+
+    execute_window_loop(vertical, [&](const Coordinates & id)
+    {
+        uint8_t *base_addr = start_valid_region + vertical_it.offset();
+        // Fill left and right borders
+        for(unsigned int i = 0; i < _border_size.left; ++i)
+        {
+            std::memcpy(base_addr + static_cast<int>(i - _border_size.left) * element_size, &_constant_border_value, element_size);
+        }
+
+        for(unsigned int i = 0; i < _border_size.right; ++i)
+        {
+            std::memcpy(base_addr + (width + i) * element_size, &_constant_border_value, element_size);
+        }
+    },
+    vertical_it);
+
+    // Top and bottom border
+    Iterator plane_it(_tensor, window);
+
+    // Iterate over all XY planes
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        uint8_t *base_addr = start_valid_region + plane_it.offset();
+        // Top border
+        for(int i = -_border_size.top; i < 0; ++i)
+        {
             // Fill top rows including left/right borders
-            std::fill_n(row_start - _border_size.left, _border_size.left + width + _border_size.right, constant_border_value);
+            for(unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j)
+            {
+                std::memcpy(base_addr + i * stridey + static_cast<int>(j - _border_size.left) * element_size, &_constant_border_value, element_size);
+            }
         }
 
         // Bottom border
         const unsigned low_border_size = height + _border_size.bottom;
         for(unsigned int i = height; i < low_border_size; ++i)
         {
-            const auto row_start = reinterpret_cast<T *>(base_addr + i * stridey);
-
             // Fill bottom rows including left/right borders
-            std::fill_n(row_start - _border_size.left, _border_size.left + width + _border_size.right, constant_border_value);
+            for(unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j)
+            {
+                std::memcpy(base_addr + i * stridey + static_cast<int>(j - _border_size.left) * element_size, &_constant_border_value, element_size);
+            }
         }
     },
     plane_it);
 }
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEFlattenLayerKernel.cpp b/src/core/NEON/kernels/NEFlattenLayerKernel.cpp
index b8452fb..4840a95 100644
--- a/src/core/NEON/kernels/NEFlattenLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEFlattenLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,6 +46,7 @@
                                                          DataType::U16, DataType::S16,
                                                          DataType::U32, DataType::S32,
                                                          DataType::F16, DataType::F32);
+    // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
 
     // Checks performed when output is configured
@@ -55,6 +56,7 @@
 
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
     }
 
     return Status{};
diff --git a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp
new file mode 100644
index 0000000..e699bac
--- /dev/null
+++ b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEFuseBatchNormalizationKernel.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/Window.h"
+
+#include "support/ToolchainSupport.h"
+
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "utils/TypePrinter.h"
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *conv_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
+                          const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
+                          const ITensorInfo *conv_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
+                          float epsilon)
+{
+    ARM_COMPUTE_UNUSED(epsilon);
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(conv_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(conv_weights, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_var);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(conv_weights, bn_mean, bn_var);
+
+    unsigned int kernels_idx = get_data_layout_dimension_index(conv_weights->data_layout(), DataLayoutDimension::BATCHES);
+    ARM_COMPUTE_RETURN_ERROR_ON(conv_weights->dimension(kernels_idx) != bn_mean->dimension(0));
+
+    // Validate bias
+    if(conv_bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, conv_bias);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(conv_weights, conv_bias);
+    }
+    // Validate beta
+    if(bn_beta != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_beta);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(conv_weights, bn_beta);
+    }
+    // Validate gamma
+    if(bn_gamma != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_gamma);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(conv_weights, bn_gamma);
+    }
+
+    // Validate output weights
+    if(fused_weights != nullptr && fused_weights->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(conv_weights, fused_weights);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(conv_weights, fused_weights);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(conv_weights, fused_weights);
+    }
+    // Validate output bias
+    if(fused_bias != nullptr && fused_bias->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, fused_bias);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(conv_weights, fused_bias);
+    }
+
+    return Status{};
+}
+
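+// Folds batch normalization into the convolution parameters:
+//   fused_weight = conv_weight * gamma / sqrt(var + epsilon)
+//   fused_bias   = (conv_bias - mean) / sqrt(var + epsilon) * gamma + beta
+// Each 3D weight slice (id[3]) shares one mean/var/gamma/beta value.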
+template <typename ScalarType, int size>
+void fused_batch_normalization(const ITensor *conv_weights, const ITensor *conv_bias, ITensor *fused_weights, ITensor *fused_bias,
+                                const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+{
+    using ExactTagType = typename wrapper::traits::neon_vector<ScalarType, size>::tag_type;
+
+    const bool run_in_place_weights = (fused_weights == nullptr) || (fused_weights == conv_weights);
+    const bool run_in_place_bias    = (fused_bias == nullptr) || (conv_bias != nullptr && fused_bias == conv_bias);
+
+    // Prepare the execution window: DimX is collapsed as the X loop is handled manually below
+    Window win = window;
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    const int  window_step_x  = size;
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    Iterator conv_w_in(conv_weights, win);
+    Iterator conv_w_out(run_in_place_weights ? conv_weights : fused_weights, win);
+
+    const auto conv_bias_in  = (conv_bias != nullptr ? reinterpret_cast<ScalarType *>(conv_bias->ptr_to_element(Coordinates(0, 0))) : nullptr);
+    auto       conv_bias_out = (run_in_place_bias ? conv_bias_in : reinterpret_cast<ScalarType *>(fused_bias->ptr_to_element(Coordinates(0, 0))));
+
+    int slice = -1;
+
+    const auto input_mean  = reinterpret_cast<const ScalarType *>(bn_mean->ptr_to_element(Coordinates(0, 0)));
+    const auto input_var   = reinterpret_cast<const ScalarType *>(bn_var->ptr_to_element(Coordinates(0, 0)));
+    const auto input_gamma = (bn_gamma != nullptr) ? reinterpret_cast<const ScalarType *>(bn_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+    const auto input_beta  = (bn_beta != nullptr) ? reinterpret_cast<const ScalarType *>(bn_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+
+    auto       mean_vec    = wrapper::vdup_n(ScalarType(0), ExactTagType{});
+    auto       var_vec     = wrapper::vdup_n(ScalarType(0), ExactTagType{});
+    auto       gamma_vec   = wrapper::vdup_n(ScalarType(1), ExactTagType{});
+    auto       beta_vec    = wrapper::vdup_n(ScalarType(0), ExactTagType{});
+    auto       rvar_vec    = wrapper::vdup_n(ScalarType(0), ExactTagType{});
+    const auto epsilon_vec = wrapper::vdup_n(ScalarType(epsilon), ExactTagType{});
+
+    auto mean                = ScalarType(0.0);
+    auto var                 = ScalarType(0.0);
+    auto gamma               = ScalarType(1.0);
+    auto beta                = ScalarType(0.0);
+    auto conv_bias_in_scalar = ScalarType(0.0);
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        if(slice != id[3])
+        {
+            slice = id[3];
+            mean  = input_mean[slice];
+            var   = input_var[slice];
+            gamma = ScalarType(1.0);
+            beta  = ScalarType(0.0);
+
+            // Construct vectors
+            mean_vec = wrapper::vdup_n(mean, ExactTagType{});
+            var_vec  = wrapper::vdup_n(var, ExactTagType{});
+            if(input_gamma != nullptr)
+            {
+                gamma     = input_gamma[slice];
+                gamma_vec = wrapper::vdup_n(gamma, ExactTagType{});
+            }
+            if(input_beta != nullptr)
+            {
+                beta     = input_beta[slice];
+                beta_vec = wrapper::vdup_n(beta, ExactTagType{});
+            }
+            if(conv_bias_in != nullptr)
+            {
+                conv_bias_in_scalar = conv_bias_in[slice];
+            }
+            else
+            {
+                conv_bias_in_scalar = ScalarType(0);
+            }
+
+            conv_bias_in_scalar  = (conv_bias_in_scalar - mean) / sqrt(var + ScalarType(epsilon));
+            conv_bias_in_scalar  = (conv_bias_in_scalar * gamma) + beta;
+            conv_bias_out[slice] = conv_bias_in_scalar;
+            rvar_vec             = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
+        }
+
+        int  x              = window_start_x;
+        auto conv_w_in_ptr  = reinterpret_cast<const ScalarType *>(conv_w_in.ptr());
+        auto conv_w_out_ptr = reinterpret_cast<ScalarType *>(conv_w_out.ptr());
+
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        {
+            auto wn = wrapper::vloadq(conv_w_in_ptr + x);
+            wn      = wrapper::vmul(wn, rvar_vec);
+            wn      = wrapper::vmul(wn, gamma_vec);
+
+            // Store results
+            wrapper::vstore(conv_w_out_ptr + x, wn);
+        }
+
+        // Compute left-over elements
+        for(; x < window_end_x; ++x)
+        {
+            *(conv_w_out_ptr + x) = *(conv_w_in_ptr + x) / sqrt(var + ScalarType(epsilon)) * gamma;
+        }
+    },
+    conv_w_in, conv_w_out);
+}
+} // namespace
+
+NEFuseBatchNormalizationKernel::NEFuseBatchNormalizationKernel()
+    : _conv_weights(nullptr), _conv_bias(nullptr), _bn_mean(nullptr), _bn_var(nullptr), _bn_gamma(nullptr), _bn_beta(nullptr), _fused_weights(nullptr), _fused_bias(nullptr), _epsilon(),
+      _run_in_place_weights(false), _run_in_place_bias(false), _func(nullptr)
+{
+}
+
+void NEFuseBatchNormalizationKernel::configure(const ITensor *conv_weights, const ITensor *bn_mean, const ITensor *bn_var,
+                                               ITensor *fused_weights, ITensor *fused_bias,
+                                               const ITensor *conv_bias, const ITensor *bn_beta, const ITensor *bn_gamma,
+                                               float epsilon)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(conv_weights, bn_mean, bn_var);
+
+    _conv_weights  = conv_weights;
+    _conv_bias     = conv_bias;
+    _bn_mean       = bn_mean;
+    _bn_var        = bn_var;
+    _bn_beta       = bn_beta;
+    _bn_gamma      = bn_gamma;
+    _fused_weights = fused_weights;
+    _fused_bias    = fused_bias;
+    _epsilon       = epsilon;
+
+    _run_in_place_weights = (fused_weights == nullptr) || (fused_weights == conv_weights);
+    _run_in_place_bias    = (fused_bias == nullptr) || (conv_bias != nullptr && fused_bias == conv_bias);
+
+    // Auto initialize outputs
+    if(_fused_weights != nullptr)
+    {
+        // Output tensor auto initialization if not yet initialized
+        auto_init_if_empty(*_fused_weights->info(), *_conv_weights->info()->clone());
+        fused_weights->info()->set_valid_region(conv_weights->info()->valid_region());
+    }
+    if(_fused_bias != nullptr)
+    {
+        // Output tensor auto initialization if not yet initialized
+        auto_init_if_empty(*_fused_bias->info(), *_bn_mean->info()->clone());
+        _fused_bias->info()->set_valid_region(bn_mean->info()->valid_region());
+    }
+
+    // Validate arguments
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(conv_weights->info(), bn_mean->info(), bn_var->info(),
+                                                  (fused_weights != nullptr) ? fused_weights->info() : nullptr,
+                                                  (fused_bias != nullptr) ? fused_bias->info() : nullptr,
+                                                  (conv_bias != nullptr) ? conv_bias->info() : nullptr,
+                                                  (bn_beta != nullptr) ? bn_beta->info() : nullptr,
+                                                  (bn_gamma != nullptr) ? bn_gamma->info() : nullptr,
+                                                  epsilon));
+
+    // Configure kernel window
+    Window win = calculate_max_window(*conv_weights->info());
+    INEKernel::configure(win);
+
+    // Configure function to run based on different data types
+    const DataType data_type = _conv_weights->info()->data_type();
+    switch(data_type)
+    {
+        case DataType::F32:
+            _func = &fused_batch_normmalization<float, 4>;
+            break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+            _func = &fused_batch_normmalization<float16_t, 8>;
+            break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        default:
+            ARM_COMPUTE_ERROR("Not Supported");
+            break;
+    }
+}
+
+Status NEFuseBatchNormalizationKernel::validate(const ITensorInfo *conv_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
+                                                const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
+                                                const ITensorInfo *conv_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
+                                                float epsilon)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(conv_weights, bn_mean, bn_var, fused_weights, fused_bias, conv_bias, bn_beta, bn_gamma, epsilon));
+    return Status{};
+}
+
+void NEFuseBatchNormalizationKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+    (*_func)(_conv_weights, _conv_bias, _fused_weights, _fused_bias, _bn_mean, _bn_var, _bn_beta, _bn_gamma, _epsilon, window);
+}
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
index 5483602..7769d9e 100644
--- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -57,6 +57,7 @@
         output_shape.set(1, std::ceil(input->dimension(1) / 4.0f));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
     }
 
     return Status{};
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
index 024c4f8..f0ac695 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -66,37 +66,20 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
 {
-    // Note: This kernel performs 16 elements per iteration.
-    // However, since we use a left-over for loop, we cannot have any read or write out of memory
-    // For this reason num_elems_processed_per_iteration is set to 1
-    constexpr unsigned int num_elems_processed_per_iteration = 1;
-
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*output, input->clone()->set_data_type(DataType::QASYMM8));
 
     // Configure kernel window
-    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    Window win = calculate_max_window(*input, Steps());
 
-    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    // NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel doesn't need padding, so update_window_and_padding() can be skipped
+    Coordinates coord;
+    coord.set_num_dimensions(output->num_dimensions());
+    output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
 
-    bool window_changed = update_window_and_padding(win,
-                                                    input_access);
-
-    if(output->total_size() != 0)
-    {
-        output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
-    }
-
-    if(bias != nullptr)
-    {
-        AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1));
-        window_changed = window_changed || update_window_and_padding(win, bias_access);
-    }
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
+    return std::make_pair(Status{}, win);
 }
 } // namespace
 
@@ -269,7 +252,7 @@
     _max                          = max;
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info());
+    auto win_config = validate_and_configure_window(input->info(), output->info());
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     INEKernel::configure(win_config.second);
 
@@ -282,10 +265,7 @@
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
-                                                              (bias != nullptr) ? bias->clone().get() : nullptr,
-                                                              output->clone().get())
-                                .first);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
 
     return Status{};
 }
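
The kernel this hunk touches converts int32 GEMM accumulators to QASYMM8 using a fixed-point multiplier, shift and output offset. A scalar sketch of that gemmlowp-style requantization, for reference only (helper names are illustrative and the rounding shift is simplified; the library's vectorized path is authoritative):

#include <algorithm>
#include <cstdint>

// Sketch of a gemmlowp-style fixed-point requantization of one int32 accumulator to uint8.
int32_t saturating_rounding_doubling_high_mul(int32_t a, int32_t b)
{
    if(a == INT32_MIN && b == INT32_MIN)
    {
        return INT32_MAX; // the only case that would overflow
    }
    const int64_t ab    = static_cast<int64_t>(a) * static_cast<int64_t>(b);
    const int64_t nudge = (ab >= 0) ? (1ll << 30) : (1 - (1ll << 30));
    return static_cast<int32_t>((ab + nudge) / (1ll << 31));
}

uint8_t quantize_down(int32_t acc, int32_t multiplier, int shift, int32_t offset_after_shift)
{
    int32_t v = saturating_rounding_doubling_high_mul(acc, multiplier);
    if(shift > 0)
    {
        v = (v + (1 << (shift - 1))) >> shift; // simple rounding shift right
    }
    v += offset_after_shift;
    return static_cast<uint8_t>(std::min(255, std::max(0, v))); // clamp to the QASYMM8 range
}
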
diff --git a/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
index 2387869..cba3390 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #include "arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h"
 
 #include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
@@ -43,11 +44,12 @@
 {
 Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::S32, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input0);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::S32, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
     ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input0->data_type()) && (output->data_type() != DataType::S32));
-    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_float(input0->data_type()) && (output->data_type() != DataType::F32));
+    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_float(input0->data_type()) && (output->data_type() != input0->data_type()));
 
     ARM_COMPUTE_RETURN_ERROR_ON(input0->num_dimensions() == input1->num_dimensions());
     ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(2) != input1->dimension(1));
@@ -87,6 +89,48 @@
 
 namespace arm_compute
 {
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <>
+void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<half, half, half>(const Window &window_in,
+                                                                                const Window &window_w,
+                                                                                const Window &window_out)
+{
+    Iterator in(_input0, window_in);
+    Iterator in2(_input1, window_w);
+    Iterator out(_output, window_out);
+
+    const int input_w          = _input0->info()->dimension(0);
+    const int input_h          = _input0->info()->dimension(1);
+    const int input_stride_x   = _input0->info()->strides_in_bytes().x();
+    const int weights_stride_x = _input1->info()->strides_in_bytes().x();
+    const int weights_stride_y = _input1->info()->strides_in_bytes().y();
+    const int output_stride_x  = _output->info()->strides_in_bytes().x();
+
+    execute_window_loop(window_in, [&](const Coordinates & id)
+    {
+        // Get pointers
+        const uint8_t *const input_ptr   = in.ptr();
+        const uint8_t *const weights_ptr = in2.ptr() + id.z() * weights_stride_y;
+        auto                 output_ptr  = reinterpret_cast<__fp16 *>(out.ptr() + (id.y() + id.z() * input_h) * output_stride_x);
+
+        float16x8_t row_dot = vdupq_n_f16(0.f);
+        for(int i = 0; i < input_w; i += 8)
+        {
+            const auto input   = vld1q_f16(reinterpret_cast<const __fp16 *>(input_ptr + i * input_stride_x));
+            const auto weights = vld1q_f16(reinterpret_cast<const __fp16 *>(weights_ptr + i * weights_stride_x));
+            row_dot            = vaddq_f16(row_dot, vmulq_f16(input, weights));
+        }
+
+        auto temp = vadd_f16(vget_high_f16(row_dot), vget_low_f16(row_dot));
+        temp      = vpadd_f16(temp, temp);
+        temp      = vpadd_f16(temp, temp);
+
+        *output_ptr = vget_lane_f16(temp, 0);
+    },
+    in, in2, out);
+}
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
 template <>
 void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<float, float, float>(const Window &window_in,
                                                                                    const Window &window_w,
@@ -226,6 +270,11 @@
         case DataType::QASYMM8:
             _func = &NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<uint8_t, uint8_t, int32_t>;
             break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+            _func = &NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<half, half, half>;
+            break;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
         case DataType::F32:
             _func = &NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<float, float, float>;
             break;
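
For reference, the new FP16 specialization computes a plain row-by-vector dot product per output element; the intrinsics above accumulate eight half-precision lanes at a time and finish with a pairwise horizontal reduction. A scalar sketch (illustrative names, float used for clarity):

// Scalar reference: out[row] = dot(input_row, weights_row).
float dot_row(const float *input_row, const float *weights_row, int width)
{
    float acc = 0.f;
    for(int i = 0; i < width; ++i)
    {
        acc += input_row[i] * weights_row[i];
    }
    return acc;
}
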
diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
index 2e14e7a..38503b7 100644
--- a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -64,6 +64,7 @@
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), get_output_shape(input));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
     }
 
     return Status{};
diff --git a/src/core/NEON/kernels/NEGatherKernel.cpp b/src/core/NEON/kernels/NEGatherKernel.cpp
new file mode 100644
index 0000000..1e027b7
--- /dev/null
+++ b/src/core/NEON/kernels/NEGatherKernel.cpp
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEGatherKernel.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+namespace arm_compute
+{
+namespace
+{
+/** Validate the indices
+ *
+ * Validate that indices are not negative
+ *
+ * @param[in] indices Indices tensor.
+ */
+template <typename U>
+void validate_indices(const ITensor *indices)
+{
+    for(size_t i = 0; i < indices->info()->tensor_shape()[0]; ++i)
+    {
+        ARM_COMPUTE_ERROR_ON(*(reinterpret_cast<U *>(indices->ptr_to_element(Coordinates(i)))) < 0);
+    }
+}
+
+} // namespace
+
+NEGatherKernel::NEGatherKernel()
+    : _input{}, _indices{}, _axis{}, _output{}, _func{}
+{
+}
+
+template <typename U>
+inline void NEGatherKernel::gather_0_axis(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+
+    // Validate that the indices are not negative
+    validate_indices<U>(_indices);
+
+    Iterator output_it(_output, window);
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        Coordinates gather_id(id);
+
+        auto new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0]))));
+        gather_id.set(0, new_index);
+
+        std::copy_n(_input->ptr_to_element(gather_id), _output->info()->element_size(), output_it.ptr());
+    },
+    output_it);
+}
+
+template <typename U>
+void NEGatherKernel::gather_n_axis(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+
+    // Validate that the indices are not negative
+    validate_indices<U>(_indices);
+
+    Window output_window{ window };
+    output_window.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator output_it(_output, output_window);
+    execute_window_loop(output_window, [&](const Coordinates & id)
+    {
+        Coordinates gather_id(id);
+
+        auto new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[_axis]))));
+        gather_id.set(_axis, new_index);
+
+        std::copy_n(_input->ptr_to_element(gather_id), _input->info()->dimension(0) * _output->info()->element_size(), output_it.ptr());
+    },
+    output_it);
+}
+
+void NEGatherKernel::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
+    ARM_COMPUTE_ERROR_ON(indices->info()->num_dimensions() != 1);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+
+    _input   = input;
+    _indices = indices;
+    _output  = output;
+    _axis    = axis;
+
+    if(_axis < 0)
+    {
+        _axis += input->info()->num_dimensions();
+    }
+    ARM_COMPUTE_ERROR_ON(0 > _axis || _axis >= static_cast<int32_t>(input->info()->num_dimensions()));
+
+    if(0 == _axis)
+    {
+        switch(_indices->info()->data_type())
+        {
+            case DataType::U32:
+                _func = &NEGatherKernel::gather_0_axis<uint32_t>;
+                break;
+            case DataType::S32:
+                _func = &NEGatherKernel::gather_0_axis<int32_t>;
+                break;
+            default:
+                ARM_COMPUTE_ERROR("Not supported");
+                break;
+        }
+    }
+    else
+    {
+        switch(_indices->info()->data_type())
+        {
+            case DataType::U32:
+                _func = &NEGatherKernel::gather_n_axis<uint32_t>;
+                break;
+            case DataType::S32:
+                _func = &NEGatherKernel::gather_n_axis<int32_t>;
+                break;
+            default:
+                ARM_COMPUTE_ERROR("Not supported");
+                break;
+        }
+    }
+    // Output auto initialization if not yet initialized
+    TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis);
+    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
+
+    // Create window
+    Window win = calculate_max_window(*output->info(), Steps());
+    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    INEKernel::configure(win);
+}
+
+Status NEGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+
+    if(axis < 0)
+    {
+        axis += input->num_dimensions();
+    }
+
+    ARM_COMPUTE_RETURN_ERROR_ON(0 > axis || axis >= static_cast<int32_t>(input->num_dimensions()));
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
+                                                         DataType::U16, DataType::S16,
+                                                         DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+        TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), axis);
+        ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
+    }
+
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32);
+
+    return Status{};
+}
+
+void NEGatherKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+    (this->*_func)(window, info);
+}
+
+} // namespace arm_compute
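
A plain C++ sketch of the gather semantics the new kernel implements, here for a 2D input gathered along its row axis (names, shapes and the flat layout are illustrative; the axis-0 path above copies single elements, the axis-N path copies whole innermost rows):

#include <cassert>
#include <cstdint>
#include <vector>

// output[i][c] = input[indices[i]][c]: row i of the output is the input row
// selected by indices[i]. Bounds are asserted, mirroring the kernel's check
// that indices are not negative.
std::vector<float> gather_rows(const std::vector<float> &input, int rows, int cols,
                               const std::vector<int32_t> &indices)
{
    std::vector<float> output(indices.size() * static_cast<size_t>(cols));
    for(size_t i = 0; i < indices.size(); ++i)
    {
        const int32_t src_row = indices[i];
        assert(src_row >= 0 && src_row < rows);
        for(int c = 0; c < cols; ++c)
        {
            output[i * cols + c] = input[static_cast<size_t>(src_row) * cols + c];
        }
    }
    return output;
}
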
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
index 2c51eae..2e3d9de 100644
--- a/src/core/NEON/kernels/NEIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -59,6 +59,7 @@
         TensorInfo expected_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, false));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
     }
 
     return Status{};
diff --git a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
index ed03783..cda041d 100644
--- a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
@@ -32,15 +32,20 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include <arm_neon.h>
 #include <cmath>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
+template <typename T, int S>
 void l2_normalize_X(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window)
 {
+    /** NEON vector tag type. */
+    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+
     Window window_sum(window);
     window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
 
@@ -53,30 +58,97 @@
         Iterator sum_it(sum, sum_slice);
         Iterator output_it(out, in_slice);
 
-        const float       sum_value           = *reinterpret_cast<const float *>(sum_it.ptr());
-        const float32x4_t vec_normalize_value = vdupq_n_f32(1.f / std::sqrt(std::max(sum_value, epsilon)));
+        const auto sum_value           = *reinterpret_cast<const T *>(sum_it.ptr());
+        const auto vec_normalize_value = wrapper::vdup_n(static_cast<T>(1.f / std::sqrt(std::max(sum_value, static_cast<T>(epsilon)))), ExactTagType{});
 
         execute_window_loop(in_slice, [&](const Coordinates & id)
         {
-            const auto in_ptr  = reinterpret_cast<const float *>(input_it.ptr());
-            const auto out_ptr = reinterpret_cast<float *>(output_it.ptr());
+            const auto in_ptr  = reinterpret_cast<const T *>(input_it.ptr());
+            const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
 
-            vst1q_f32(out_ptr, vmulq_f32(vld1q_f32(in_ptr), vec_normalize_value));
+            wrapper::vstore(out_ptr, wrapper::vmul(wrapper::vloadq(in_ptr), vec_normalize_value));
         },
         input_it, output_it);
     }
     while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
 }
 
+template <typename T, int S>
+void l2_normalize_Y(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window)
+{
+    /** NEON vector tag type. */
+    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+
+    Window window_sum(window);
+    window_sum.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+    Window in_slice  = window.first_slice_window_2D();
+    Window sum_slice = window_sum.first_slice_window_2D();
+
+    do
+    {
+        Iterator input_it(in, in_slice);
+        Iterator sum_it(sum, sum_slice);
+        Iterator output_it(out, in_slice);
+
+        auto eps = wrapper::vdup_n(static_cast<T>(epsilon), ExactTagType{});
+
+        execute_window_loop(in_slice, [&](const Coordinates & id)
+        {
+            const auto in_ptr  = reinterpret_cast<const T *>(input_it.ptr());
+            const auto sum_ptr = reinterpret_cast<const T *>(sum_it.ptr());
+            const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
+
+            const auto vec_normalize_value = wrapper::vinvsqrt(wrapper::vmax(wrapper::vloadq(sum_ptr), eps));
+            wrapper::vstore(out_ptr, wrapper::vmul(wrapper::vloadq(in_ptr), vec_normalize_value));
+        },
+        input_it, sum_it, output_it);
+    }
+    while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
+}
+
+template <typename T, int S>
+void l2_normalize_Z(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window)
+{
+    /** NEON vector tag type. */
+    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+
+    Window window_sum(window);
+    window_sum.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+    Window in_slice  = window.first_slice_window_3D();
+    Window sum_slice = window_sum.first_slice_window_3D();
+
+    do
+    {
+        Iterator input_it(in, in_slice);
+        Iterator sum_it(sum, sum_slice);
+        Iterator output_it(out, in_slice);
+
+        auto eps = wrapper::vdup_n(static_cast<T>(epsilon), ExactTagType{});
+
+        execute_window_loop(in_slice, [&](const Coordinates & id)
+        {
+            const auto in_ptr  = reinterpret_cast<const T *>(input_it.ptr());
+            const auto sum_ptr = reinterpret_cast<const T *>(sum_it.ptr());
+            const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
+
+            const auto vec_normalize_value = wrapper::vinvsqrt(wrapper::vmax(wrapper::vloadq(sum_ptr), eps));
+            wrapper::vstore(out_ptr, wrapper::vmul(wrapper::vloadq(in_ptr), vec_normalize_value));
+        },
+        input_it, sum_it, output_it);
+    }
+    while(window.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(sum_slice));
+}
+
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, unsigned int axis, float epsilon)
 {
     ARM_COMPUTE_UNUSED(epsilon);
 
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, sum, output);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 0, "Unsupported normalization axis, Supported axis is 0");
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 2, "Axis greater than 2 is not supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Normalization axis greater than max number of dimensions");
 
     // Reduce shape on axis
@@ -89,7 +161,7 @@
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output->tensor_shape());
-        ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != DataLayout::NCHW);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
     }
 
     return Status{};
@@ -158,9 +230,52 @@
     switch(_axis)
     {
         case 0:
-            l2_normalize_X(_input, _sum, _output, _epsilon, window);
+            switch(_input->info()->data_type())
+            {
+                case DataType::F32:
+                    l2_normalize_X<float, 4>(_input, _sum, _output, _epsilon, window);
+                    break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                case DataType::F16:
+                    l2_normalize_X<float16_t, 8>(_input, _sum, _output, _epsilon, window);
+                    break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                default:
+                    ARM_COMPUTE_ERROR("Not implemented");
+            }
+            break;
+        case 1:
+            switch(_input->info()->data_type())
+            {
+                case DataType::F32:
+                    l2_normalize_Y<float, 4>(_input, _sum, _output, _epsilon, window);
+                    break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                case DataType::F16:
+                    l2_normalize_Y<float16_t, 8>(_input, _sum, _output, _epsilon, window);
+                    break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                default:
+                    ARM_COMPUTE_ERROR("Not implemented");
+            }
+            break;
+        case 2:
+            switch(_input->info()->data_type())
+            {
+                case DataType::F32:
+                    l2_normalize_Z<float, 4>(_input, _sum, _output, _epsilon, window);
+                    break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                case DataType::F16:
+                    l2_normalize_Z<float16_t, 8>(_input, _sum, _output, _epsilon, window);
+                    break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                default:
+                    ARM_COMPUTE_ERROR("Not implemented");
+            }
             break;
         default:
             ARM_COMPUTE_ERROR("Unsupported normalization axis");
     }
 }
+} // namespace arm_compute
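
A scalar sketch of what the templated functions above vectorize: the `sum` tensor already holds the sum of squares along the reduced axis, so each element is simply scaled by the inverse square root of that sum clamped to epsilon (names illustrative):

#include <algorithm>
#include <cmath>

// For every element along the normalized axis:
//   out[i] = in[i] / sqrt(max(sum_of_squares, epsilon))
void l2_normalize_1d(const float *in, float *out, int length, float sum_of_squares, float epsilon)
{
    const float inv_norm = 1.f / std::sqrt(std::max(sum_of_squares, epsilon));
    for(int i = 0; i < length; ++i)
    {
        out[i] = in[i] * inv_norm;
    }
}
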
diff --git a/src/core/NEON/kernels/NEMemsetKernel.cpp b/src/core/NEON/kernels/NEMemsetKernel.cpp
new file mode 100644
index 0000000..2b57b15
--- /dev/null
+++ b/src/core/NEON/kernels/NEMemsetKernel.cpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEMemsetKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cstring>
+
+namespace arm_compute
+{
+NEMemsetKernel::NEMemsetKernel()
+    : _tensor(nullptr), _constant_value()
+{
+}
+
+void NEMemsetKernel::configure(ITensor *tensor, const PixelValue &constant_value)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
+    _tensor         = tensor;
+    _constant_value = constant_value;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*tensor->info(), Steps());
+    INEKernel::configure(win);
+}
+
+void NEMemsetKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+    // Collapse all the batches on the third dimension
+    bool   has_collapsed = true;
+    Window collapsed     = window.collapse_if_possible(window, Window::DimZ, &has_collapsed);
+    ARM_COMPUTE_ERROR_ON(!has_collapsed);
+
+    uint8_t *const start_valid_region = _tensor->ptr_to_element(_tensor->info()->valid_region().anchor);
+    const auto     window_width       = static_cast<int>(collapsed.x().end()) - static_cast<int>(collapsed.x().start());
+    const size_t   element_size       = _tensor->info()->element_size();
+
+    // Unroll X dimension
+    collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator tensor_it(_tensor, collapsed);
+    execute_window_loop(collapsed, [&](const Coordinates & id)
+    {
+        uint8_t *base_addr = start_valid_region + tensor_it.offset();
+        // Set memory
+        for(int i = 0; i < window_width; ++i)
+        {
+            std::memcpy(base_addr + i * element_size, &_constant_value.value, element_size);
+        }
+    },
+    tensor_it);
+}
+} // namespace arm_compute
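
The run() loop above amounts to a byte-wise fill of each row with the same constant, which keeps the kernel data-type agnostic. A minimal sketch of that idea (names illustrative):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Fill 'count' elements of 'element_size' bytes each with the same constant,
// copied byte-wise so the same loop works for any element data type.
void fill_row(uint8_t *dst, const void *constant_value, size_t element_size, int count)
{
    for(int i = 0; i < count; ++i)
    {
        std::memcpy(dst + static_cast<size_t>(i) * element_size, constant_value, element_size);
    }
}
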
diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
index 27af121..e5f6e4f 100644
--- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
@@ -29,6 +29,7 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/NEFixedPoint.h"
 #include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
@@ -44,8 +45,6 @@
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC && norm_info.type() == NormType::IN_MAP_2D,
-                                    "Only Cross-map and 1D In-map normalization is supported for NHWC layout");
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_squared);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, input_squared);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
@@ -55,6 +54,7 @@
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
     }
 
     return Status{};
@@ -143,16 +143,26 @@
                 {
                     if(norm_info.type() == NormType::IN_MAP_2D)
                     {
-                        _func = &NENormalizationLayerKernel::normalize_float<DataType::F32, 0, true>;
+                        _func = &NENormalizationLayerKernel::normalize_float<float, 4, 0, true>;
                     }
                     else
                     {
-                        _func = &NENormalizationLayerKernel::normalize_float<DataType::F32, 0, false>;
+                        _func = &NENormalizationLayerKernel::normalize_float<float, 4, 0, false>;
                     }
                     break;
                 }
+                case 1:
+                    if(norm_info.type() == NormType::IN_MAP_2D)
+                    {
+                        _func = &NENormalizationLayerKernel::normalize_float<float, 4, 1, true>;
+                    }
+                    else
+                    {
+                        _func = &NENormalizationLayerKernel::normalize_float<float, 4, 1, false>;
+                    }
+                    break;
                 case 2:
-                    _func = &NENormalizationLayerKernel::normalize_float<DataType::F32, 2, false>;
+                    _func = &NENormalizationLayerKernel::normalize_float<float, 4, 2, false>;
                     break;
                 default:
                     break;
@@ -168,16 +178,26 @@
                 {
                     if(norm_info.type() == NormType::IN_MAP_2D)
                     {
-                        _func = &NENormalizationLayerKernel::normalize_float<DataType::F16, 0, true>;
+                        _func = &NENormalizationLayerKernel::normalize_float<float16_t, 8, 0, true>;
                     }
                     else
                     {
-                        _func = &NENormalizationLayerKernel::normalize_float<DataType::F16, 0, false>;
+                        _func = &NENormalizationLayerKernel::normalize_float<float16_t, 8, 0, false>;
                     }
                     break;
                 }
+                case 1:
+                    if(norm_info.type() == NormType::IN_MAP_2D)
+                    {
+                        _func = &NENormalizationLayerKernel::normalize_float<float16_t, 8, 1, true>;
+                    }
+                    else
+                    {
+                        _func = &NENormalizationLayerKernel::normalize_float<float16_t, 8, 1, false>;
+                    }
+                    break;
                 case 2:
-                    _func = &NENormalizationLayerKernel::normalize_float<DataType::F16, 2, false>;
+                    _func = &NENormalizationLayerKernel::normalize_float<float16_t, 8, 2, false>;
                     break;
                 default:
                     break;
@@ -195,14 +215,17 @@
     INEKernel::configure(win_config.second);
 }
 
-template <DataType dt, unsigned int dim, bool do_2D_norm>
+template <typename T, unsigned int S, unsigned int dim, bool do_2D_norm>
 void NENormalizationLayerKernel::normalize_float(const Window &window)
 {
+    /** NEON vector tag type. */
+    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+
     Iterator input(_input, window);
     Iterator input_squared(_input_squared, window);
     Iterator output(_output, window);
 
-    const int dim_y                = 1;
+    const int dim_y                = _input->info()->data_layout() == DataLayout::NCHW ? 1 : 2;
     const int radius               = _norm_info.norm_size() / 2;
     const int input_squared_stride = _input_squared->info()->strides_in_bytes()[dim];
     // We account padding across X only and we iterate over rows
@@ -210,83 +233,39 @@
     const int max_right  = _input->info()->dimension(dim) - 1;
     const int max_bottom = _input->info()->dimension(dim_y) - 1;
 
-    if(dt == DataType::F32)
-    {
-        const float32x4_t coeff_vec = vdupq_n_f32(_norm_info.scale_coeff());
-        const float32x4_t beta_vec  = vdupq_n_f32(_norm_info.beta());
-        const float32x4_t kappa_vec = vdupq_n_f32(_norm_info.kappa());
+    const auto coeff_vec = wrapper::vdup_n(static_cast<T>(_norm_info.scale_coeff()), ExactTagType{});
+    const auto beta_vec  = wrapper::vdup_n(static_cast<T>(_norm_info.beta()), ExactTagType{});
+    const auto kappa_vec = wrapper::vdup_n(static_cast<T>(_norm_info.kappa()), ExactTagType{});
 
-        execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        // Get range to normalize
+        const int current_row   = do_2D_norm ? id[dim_y] : 0;
+        const int current_slice = id[dim];
+        const int first_row     = do_2D_norm ? std::max(current_row - radius, 0) : 0;
+        const int last_row      = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
+        const int first_slice   = std::max(current_slice - radius, min_left);
+        const int last_slice    = std::min(current_slice + radius, max_right);
+
+        // Accumulate 2D In-Map values
+        auto accu = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+        for(int j = first_row; j <= last_row; j++)
         {
-            // Get range to normalize
-            const int current_row   = do_2D_norm ? id[dim_y] : 0;
-            const int current_slice = id[dim];
-            const int first_row     = do_2D_norm ? std::max(current_row - radius, 0) : 0;
-            const int last_row      = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
-            const int first_slice   = std::max(current_slice - radius, min_left);
-            const int last_slice    = std::min(current_slice + radius, max_right);
-
-            // Accumulate 2D In-Map values
-            float32x4_t accu = vdupq_n_f32(0.f);
-            for(int j = first_row; j <= last_row; j++)
+            // Compute row displacement
+            const int            row               = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
+            const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
+            for(int i = first_slice; i <= last_slice; ++i)
             {
-                // Compute row displacement
-                const int            row               = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
-                const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
-                for(int i = first_slice; i <= last_slice; ++i)
-                {
-                    accu = vaddq_f32(accu, vld1q_f32(reinterpret_cast<const float *>(input_squared_ptr + i * input_squared_stride)));
-                }
+                accu = wrapper::vadd(accu, wrapper::vloadq(reinterpret_cast<const T *>(input_squared_ptr + i * input_squared_stride)));
             }
+        }
 
-            // Normalize
-            const float32x4_t normalized       = vpowq_f32(vmlaq_f32(kappa_vec, coeff_vec, accu), beta_vec);
-            const float32x4_t normalized_pixel = vmulq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())), vinvq_f32(normalized));
-            vst1q_f32(reinterpret_cast<float *>(output.ptr()), normalized_pixel);
-        },
-        input, input_squared, output);
-    }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    else if(dt == DataType::F16)
-    {
-        const float16x8_t coeff_vec    = vdupq_n_f16(_norm_info.scale_coeff());
-        const float16x8_t beta_vec_f16 = vdupq_n_f16(_norm_info.beta());
-        const float16x8_t kappa_vec    = vdupq_n_f16(_norm_info.kappa());
-
-        execute_window_loop(window, [&](const Coordinates & id)
-        {
-            // Get range to normalize
-            const int current_row   = do_2D_norm ? id[dim_y] : 0;
-            const int current_slice = id[dim];
-            const int first_row     = do_2D_norm ? std::max(current_row - radius, 0) : 0;
-            const int last_row      = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
-            const int first_slice   = std::max(current_slice - radius, min_left);
-            const int last_slice    = std::min(current_slice + radius, max_right);
-
-            // Accumulate 2D In-Map values
-            float16x8_t accu = vdupq_n_f16(0.f);
-            for(int j = first_row; j <= last_row; j++)
-            {
-                // Compute row displacement
-                const int            row               = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
-                const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
-                for(int i = first_slice; i <= last_slice; ++i)
-                {
-                    accu = vaddq_f16(accu, vld1q_f16(reinterpret_cast<const float16_t *>(input_squared_ptr + i * input_squared_stride)));
-                }
-            }
-
-            const float16x8_t norm_f16         = vpowq_f16(vaddq_f16(kappa_vec, vmulq_f16(coeff_vec, accu)), beta_vec_f16);
-            const float16x8_t normalized_pixel = vmulq_f16(vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr())), vinvq_f16(norm_f16));
-            vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), normalized_pixel);
-        },
-        input, input_squared, output);
-    }
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-    else
-    {
-        ARM_COMPUTE_ERROR("Not supported");
-    }
+        // Normalize
+        const auto normalized       = wrapper::vpow(wrapper::vmla(kappa_vec, coeff_vec, accu), beta_vec);
+        const auto normalized_pixel = wrapper::vmul(wrapper::vloadq(reinterpret_cast<const T *>(input.ptr())), wrapper::vinv(normalized));
+        wrapper::vstore(reinterpret_cast<T *>(output.ptr()), normalized_pixel);
+    },
+    input, input_squared, output);
 }
 
 Status NENormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, const NormalizationLayerInfo norm_info)
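
A scalar sketch of the normalization applied once the squared inputs have been accumulated over the window: the wrapper:: calls above compute kappa + coeff * sum_sq, raise it to beta, and divide the input by the result, four or eight lanes at a time (names illustrative):

#include <cmath>

// out = in / (kappa + coeff * sum_sq)^beta, where sum_sq is the sum of the
// squared inputs over the normalization window around the current element.
float normalize_element(float in, float sum_sq, float kappa, float coeff, float beta)
{
    return in / std::pow(kappa + coeff * sum_sq, beta);
}
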
diff --git a/src/core/NEON/kernels/NEPermuteKernel.cpp b/src/core/NEON/kernels/NEPermuteKernel.cpp
index 29e6d50..1df94ae 100644
--- a/src/core/NEON/kernels/NEPermuteKernel.cpp
+++ b/src/core/NEON/kernels/NEPermuteKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,6 +43,52 @@
 
 namespace
 {
+inline bool is_permutation_supported(const PermutationVector &v)
+{
+    static const std::array<PermutationVector, 6> permutations3 =
+    {
+        {
+            PermutationVector(2U, 0U, 1U),
+            PermutationVector(1U, 2U, 0U),
+            PermutationVector(0U, 1U, 2U),
+            PermutationVector(0U, 2U, 1U),
+            PermutationVector(1U, 0U, 2U),
+            PermutationVector(2U, 1U, 0U),
+        }
+    };
+    static const std::array<PermutationVector, 24> permutations4 =
+    {
+        {
+            PermutationVector(0U, 1U, 2U, 3U),
+            PermutationVector(1U, 0U, 2U, 3U),
+            PermutationVector(2U, 0U, 1U, 3U),
+            PermutationVector(0U, 2U, 1U, 3U),
+            PermutationVector(1U, 2U, 0U, 3U),
+            PermutationVector(2U, 1U, 0U, 3U),
+            PermutationVector(2U, 1U, 3U, 0U),
+            PermutationVector(1U, 2U, 3U, 0U),
+            PermutationVector(3U, 2U, 1U, 0U),
+            PermutationVector(2U, 3U, 1U, 0U),
+            PermutationVector(1U, 3U, 2U, 0U),
+            PermutationVector(3U, 1U, 2U, 0U),
+            PermutationVector(3U, 0U, 2U, 1U),
+            PermutationVector(0U, 3U, 2U, 1U),
+            PermutationVector(2U, 3U, 0U, 1U),
+            PermutationVector(3U, 2U, 0U, 1U),
+            PermutationVector(0U, 2U, 3U, 1U),
+            PermutationVector(2U, 0U, 3U, 1U),
+            PermutationVector(1U, 0U, 3U, 2U),
+            PermutationVector(0U, 1U, 3U, 2U),
+            PermutationVector(3U, 1U, 0U, 2U),
+            PermutationVector(1U, 3U, 0U, 2U),
+            PermutationVector(0U, 3U, 1U, 2U),
+            PermutationVector(3U, 0U, 1U, 2U)
+        }
+    };
+
+    return (permutations3.end() != std::find(permutations3.begin(), permutations3.end(), v)) || (permutations4.end() != std::find(permutations4.begin(), permutations4.end(), v));
+}
+
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm)
 {
     //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
@@ -50,9 +96,8 @@
                                                          DataType::U16, DataType::S16,
                                                          DataType::U32, DataType::S32,
                                                          DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((perm != PermutationVector{ 2U, 0U, 1U })
-                                    && (perm != PermutationVector{ 1U, 2U, 0U }),
-                                    "Only [2, 0, 1] and [1, 2, 0] permutation is supported");
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_permutation_supported(perm), "PermutationVector not supported.");
 
     const TensorShape output_shape = misc::shape_calculator::compute_permutation_output_shape(*input, perm);
 
@@ -60,6 +105,7 @@
     if(output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
 
@@ -70,12 +116,20 @@
 template <typename T>
 void NEPermuteKernel::run_permute(const Window &window)
 {
+    const DataLayout input_layout = _input->info()->data_layout();
+
     // Input window
     Window window_in = window;
-    window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), window.x().end() - window.x().start()));
-    window_in.set(Window::DimY, Window::Dimension(window.y().start(), window.y().end(), window.y().end() - window.y().start()));
-    window_in.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), window.z().end() - window.z().start()));
-    window_in.set(3, Window::Dimension(window[3].start(), window[3].end(), window[3].end() - window[3].start()));
+
+    // Only these two configurations are handled by arm_compute/core/NEON/kernels/convolution/common/shims.hpp;
+    // all other permutations fall back to the generic C++ path below.
+    if((input_layout == DataLayout::NCHW && _perm == PermutationVector{ 2U, 0U, 1U }) || (input_layout == DataLayout::NHWC && _perm == PermutationVector{ 1U, 2U, 0U }))
+    {
+        window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), window.x().end() - window.x().start()));
+        window_in.set(Window::DimY, Window::Dimension(window.y().start(), window.y().end(), window.y().end() - window.y().start()));
+        window_in.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), window.z().end() - window.z().start()));
+        window_in.set(3, Window::Dimension(window[3].start(), window[3].end(), window[3].end() - window[3].start()));
+    }
 
     // Output window
     Window                  window_out(window);
@@ -89,23 +143,53 @@
     Iterator in(_input, window_in);
     Iterator out(_output, window_out);
 
-    // CHW -> HWC
-    if(_perm == PermutationVector{ 2U, 0U, 1U })
-    {
-        const int in_row_stride     = _input->info()->strides_in_bytes().y() / sizeof(T);
-        const int in_channel_stride = _input->info()->strides_in_bytes().z() / sizeof(T);
-        const int in_batch_stride   = _input->info()->strides_in_bytes()[3] / sizeof(T);
+    int in_row_stride     = 0;
+    int in_col_stride     = 0;
+    int in_channel_stride = 0;
+    int in_batch_stride   = 0;
+    int n_cols            = 0;
+    int n_rows            = 0;
+    int n_channels        = 0;
+    int n_batches         = 0;
 
+    switch(input_layout)
+    {
+        case DataLayout::NCHW:
+        {
+            in_row_stride     = _input->info()->strides_in_bytes().y() / sizeof(T);
+            in_channel_stride = _input->info()->strides_in_bytes().z() / sizeof(T);
+            in_batch_stride   = _input->info()->strides_in_bytes()[3] / sizeof(T);
+            n_cols            = _input->info()->tensor_shape().x();
+            n_rows            = window_in.y().step();
+            n_channels        = _input->info()->tensor_shape().z();
+            n_batches         = _input->info()->tensor_shape()[3];
+            break;
+        }
+        case DataLayout::NHWC:
+        {
+            in_col_stride   = _input->info()->strides_in_bytes().y() / sizeof(T);
+            in_row_stride   = _input->info()->strides_in_bytes().z() / sizeof(T);
+            in_batch_stride = _input->info()->strides_in_bytes()[3] / sizeof(T);
+            n_channels      = _input->info()->tensor_shape().x();
+            n_cols          = window_in.y().step();
+            n_rows          = _input->info()->tensor_shape().z();
+            n_batches       = _input->info()->tensor_shape()[3];
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("Invalid input data layout.");
+            break;
+        }
+    }
+
+    // CHW -> HWC
+    if(input_layout == DataLayout::NCHW && _perm == PermutationVector{ 2U, 0U, 1U })
+    {
         const int out_channel_stride = _output->info()->strides_in_bytes().x() / sizeof(T);
         const int out_col_stride     = _output->info()->strides_in_bytes().y() / sizeof(T);
         const int out_row_stride     = _output->info()->strides_in_bytes().z() / sizeof(T);
         const int out_batch_stride   = _output->info()->strides_in_bytes()[3] / sizeof(T);
-
-        const int n_cols     = _input->info()->tensor_shape().x();
-        const int n_rows     = window_in.y().step();
-        const int n_channels = _input->info()->tensor_shape().z();
-        const int n_batches  = _input->info()->tensor_shape()[3];
-
         execute_window_loop(window_in, [&](const Coordinates & id)
         {
             const int idx = id[0] * out_col_stride + id[1] * out_row_stride + id[2] * out_channel_stride;
@@ -117,22 +201,12 @@
         in, out);
     }
     // HWC -> CHW
-    else if(_perm == PermutationVector{ 1U, 2U, 0U })
+    else if(input_layout == DataLayout::NHWC && _perm == PermutationVector{ 1U, 2U, 0U })
     {
-        const int in_col_stride   = _input->info()->strides_in_bytes().y() / sizeof(T);
-        const int in_row_stride   = _input->info()->strides_in_bytes().z() / sizeof(T);
-        const int in_batch_stride = _input->info()->strides_in_bytes()[3] / sizeof(T);
-
         const int out_col_stride     = _output->info()->strides_in_bytes().x() / sizeof(T);
         const int out_row_stride     = _output->info()->strides_in_bytes().y() / sizeof(T);
         const int out_channel_stride = _output->info()->strides_in_bytes().z() / sizeof(T);
         const int out_batch_stride   = _output->info()->strides_in_bytes()[3] / sizeof(T);
-
-        const int n_channels = _input->info()->tensor_shape().x();
-        const int n_cols     = window_in.y().step();
-        const int n_rows     = _input->info()->tensor_shape().z();
-        const int n_batches  = _input->info()->tensor_shape()[3];
-
         execute_window_loop(window_in, [&](const Coordinates & id)
         {
             const int idx = id[0] * out_channel_stride + id[1] * out_col_stride + id[2] * out_row_stride;
@@ -145,7 +219,18 @@
     }
     else
     {
-        ARM_COMPUTE_ERROR("Unsupported permutation vector");
+        // All other cases fall back to C++
+        // Permute strides
+        Strides strides      = _output->info()->strides_in_bytes();
+        Strides perm_strides = strides;
+        permute_strides(perm_strides, _perm);
+        const int perm_stride_3 = _input->info()->num_dimensions() >= 4 ? perm_strides[3] : 0;
+        execute_window_loop(window, [&](const Coordinates & id)
+        {
+            const int idx                             = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + id[3] * perm_stride_3;
+            *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr()));
+        },
+        in, out);
     }
 }
 
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index 310560b..d00a4af 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -52,8 +52,7 @@
 
 namespace
 {
-template <bool exclude_padding, DataLayout data_layout>
-inline float calculate_avg_scale(const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
+inline float calculate_avg_scale(bool exclude_padding, DataLayout data_layout, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
                                  const int pad_x, const int pad_y, const int stride_x, const int stride_y)
 {
     const unsigned int idx_width  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -72,8 +71,7 @@
     return 1.f / ((end_y - start_y) * (end_x - start_x));
 }
 
-template <bool exclude_padding>
-inline void scale_vector_s16x8(uint16x8_t &v, const Coordinates &id, int id_offset, int step,
+inline void scale_vector_s16x8(bool exclude_padding, uint16x8_t &v, const Coordinates &id, int id_offset, int step,
                                const int pool_size, const int upper_bound_w, const int upper_bound_h,
                                const int pad_x, const int pad_y, const int stride_x, const int stride_y)
 {
@@ -140,6 +138,7 @@
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH)) != pooled_w)
                                     || (output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT)) != pooled_h));
     }
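
For context on the calculate_avg_scale helper refactored earlier in this file: it returns the reciprocal of the number of elements actually averaged, i.e. the pool window clipped against the input and, when exclude_padding is set, against the unpadded extent. A hedged scalar sketch of that computation (parameter names and the exact border handling are illustrative):

#include <algorithm>

// Reciprocal of the number of elements the average is taken over: the pool
// window is clipped against the input extent and, when exclude_padding is
// true, the padded border is not counted.
float avg_pool_scale(bool exclude_padding, int out_x, int out_y,
                     int pool_w, int pool_h, int input_w, int input_h,
                     int pad_x, int pad_y, int stride_x, int stride_y)
{
    int start_x = out_x * stride_x - pad_x;
    int start_y = out_y * stride_y - pad_y;
    int end_x   = std::min(start_x + pool_w, input_w + (exclude_padding ? 0 : pad_x));
    int end_y   = std::min(start_y + pool_h, input_h + (exclude_padding ? 0 : pad_y));
    if(exclude_padding)
    {
        start_x = std::max(0, start_x);
        start_y = std::max(0, start_y);
    }
    return 1.f / static_cast<float>((end_y - start_y) * (end_x - start_x));
}
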
@@ -336,13 +335,9 @@
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    const PoolingType   pool_type         = pool_info.pool_type();
     const PadStrideInfo pad_stride_info   = pool_info.pad_stride_info();
-    const bool          exclude_padding   = pool_info.exclude_padding();
     const bool          is_global_pooling = pool_info.is_global_pooling();
     const int           pool_stride_x     = pad_stride_info.stride().first;
-    unsigned int        pool_size_x       = 0;
-    unsigned int        pool_size_y       = 0;
 
     // Get data layout
     const DataLayout data_layout = input->info()->data_layout();
@@ -350,18 +345,19 @@
     const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
 
     // Update pool size in case of global pooling
-    pool_size_x = is_global_pooling ? input->info()->dimension(idx_width) : pool_info.pool_size().width;
-    pool_size_y = is_global_pooling ? input->info()->dimension(idx_height) : pool_info.pool_size().height;
+    const Size2D pool_size(
+        is_global_pooling ? input->info()->dimension(idx_width) : pool_info.pool_size().width,
+        is_global_pooling ? input->info()->dimension(idx_height) : pool_info.pool_size().height);
 
     // Validate pool info before calling scaled_dimensions
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(pool_size_x, pool_size_y));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(pool_size.x(), pool_size.y()));
 
     // Check output dimensions
     unsigned int pooled_w, pooled_h;
     std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(idx_width),
                                                      input->info()->dimension(idx_height),
-                                                     pool_size_x,
-                                                     pool_size_y,
+                                                     pool_size.x(),
+                                                     pool_size.y(),
                                                      pad_stride_info);
 
     // Perform validation step
@@ -371,7 +367,7 @@
     _input     = input;
     _output    = output;
     _pool_info = pool_info;
-    _is_square = (pool_size_x == pool_size_y);
+    _is_square = (pool_size.x() == pool_size.y());
 
     // Get data type
     const DataType data_type = input->info()->data_type();
@@ -379,88 +375,37 @@
 
     if(data_type == DataType::QASYMM8)
     {
-        if(pool_size_x == 2 && pool_stride_x < 3 && _is_square)
+        if(pool_size.x() == 2 && pool_stride_x < 3 && _is_square)
         {
-            switch(pool_type)
+            if(is_nchw)
             {
-                case PoolingType::AVG:
-                    if(is_nchw)
-                    {
-                        _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_qasymm8_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_qasymm8_nchw<PoolingType::AVG, false>;
-                    }
-                    else
-                    {
-                        _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc<PoolingType::AVG, false>;
-                    }
-                    break;
-                case PoolingType::MAX:
-                    if(is_nchw)
-                    {
-                        _func = &NEPoolingLayerKernel::pooling2_qasymm8_nchw<PoolingType::MAX>;
-                    }
-                    else
-                    {
-                        _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc<PoolingType::MAX>;
-                    }
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Unsupported pooling type!");
+                _func = &NEPoolingLayerKernel::pooling2_qasymm8_nchw;
+            }
+            else
+            {
+                _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc;
             }
         }
-        else if(pool_size_x == 3 && pool_stride_x < 3 && _is_square)
+        else if(pool_size.x() == 3 && pool_stride_x < 3 && _is_square)
         {
-            switch(pool_type)
+            if(is_nchw)
             {
-                case PoolingType::AVG:
-                    if(is_nchw)
-                    {
-                        _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_qasymm8_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_qasymm8_nchw<PoolingType::AVG, false>;
-                    }
-                    else
-                    {
-                        _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc<PoolingType::AVG, false>;
-                    }
-                    break;
-                case PoolingType::MAX:
-                    if(is_nchw)
-                    {
-                        _func = &NEPoolingLayerKernel::pooling3_qasymm8_nchw<PoolingType::MAX>;
-                    }
-                    else
-                    {
-                        _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc<PoolingType::MAX>;
-                    }
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Unsupported pooling type!");
+                _func = &NEPoolingLayerKernel::pooling3_qasymm8_nchw;
+            }
+            else
+            {
+                _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc;
             }
         }
         else
         {
-            switch(pool_type)
+            if(is_nchw)
             {
-                case PoolingType::AVG:
-                    if(is_nchw)
-                    {
-                        _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_qasymm8_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_qasymm8_nchw<PoolingType::AVG, false>;
-                    }
-                    else
-                    {
-                        _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc<PoolingType::AVG, false>;
-                    }
-                    break;
-                case PoolingType::MAX:
-                    if(is_nchw)
-                    {
-                        _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nchw<PoolingType::MAX>;
-                    }
-                    else
-                    {
-                        _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc<PoolingType::MAX>;
-                    }
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Unsupported pooling type!");
+                _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nchw;
+            }
+            else
+            {
+                _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc;
             }
         }
     }
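
The pattern in this hunk, repeated for F16 and F32 below, is the core of the change: the <PoolingType, exclude_padding> template parameters become ordinary runtime arguments, so only one member function per data type, layout and pool size is selected, and the pooling type is branched on inside the kernel when it runs. A stand-alone sketch of the two dispatch styles, using a hypothetical Kernel type and a reduced PoolingType enum:

#include <cstdio>

enum class PoolingType { AVG, MAX };

struct Kernel
{
    using PoolFn = void (Kernel::*)(PoolingType, bool);
    PoolFn _func = nullptr;

    // Before the change this would have been a template,
    // instantiated once per (PoolingType, exclude_padding) pair.
    void pooling2(PoolingType pt, bool exclude_padding)
    {
        std::printf("pool2 type=%d exclude=%d\n", static_cast<int>(pt), static_cast<int>(exclude_padding));
    }

    void configure()
    {
        _func = &Kernel::pooling2; // one pointer; no per-option instantiations
    }

    void run(PoolingType pt, bool exclude_padding)
    {
        (this->*_func)(pt, exclude_padding); // options decided at run time
    }
};

int main()
{
    Kernel k;
    k.configure();
    k.run(PoolingType::AVG, true);
}

The trade-off is fewer template instantiations and much shorter selection code, in exchange for branches on pooling_type and exclude_padding that were previously resolved at compile time.
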
@@ -468,157 +413,56 @@
     {
         if(_is_square)
         {
-            switch(pool_size_x)
+            switch(pool_size.x())
             {
                 case 2:
-                    switch(pool_type)
+                {
+                    if(is_nchw)
                     {
-                        case PoolingType::AVG:
-                            if(is_nchw)
-                            {
-                                _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_f16_nchw<PoolingType::AVG, false>;
-                            }
-                            else
-                            {
-                                _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::AVG, false>;
-                            }
-                            break;
-                        case PoolingType::L2:
-                            if(is_nchw)
-                            {
-                                _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16_nchw<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling2_f16_nchw<PoolingType::L2, false>;
-                            }
-                            else
-                            {
-                                _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::L2, false>;
-                            }
-                            break;
-                        case PoolingType::MAX:
-                            if(is_nchw)
-                            {
-                                _func = &NEPoolingLayerKernel::pooling2_f16_nchw<PoolingType::MAX, false>;
-                            }
-                            else
-                            {
-                                _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::MAX, false>;
-                            }
-                            break;
-                        default:
-                            ARM_COMPUTE_ERROR("Unsupported pooling type!");
+                        _func = &NEPoolingLayerKernel::pooling2_f16_nchw;
                     }
-                    break;
+                    else
+                    {
+                        _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
+                    }
+                }
+                break;
                 case 3:
-                    switch(pool_type)
+                {
+                    if(is_nchw)
                     {
-                        case PoolingType::AVG:
-                            if(is_nchw)
-                            {
-                                _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_f16_nchw<PoolingType::AVG, false>;
-                            }
-                            else
-                            {
-                                _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::AVG, false>;
-                            }
-                            break;
-                        case PoolingType::L2:
-                            if(is_nchw)
-                            {
-                                _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16_nchw<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling3_f16_nchw<PoolingType::L2, false>;
-                            }
-                            else
-                            {
-                                _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::L2, false>;
-                            }
-                            break;
-                        case PoolingType::MAX:
-                            if(is_nchw)
-                            {
-                                _func = &NEPoolingLayerKernel::pooling3_f16_nchw<PoolingType::MAX, false>;
-                            }
-                            else
-                            {
-                                _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::MAX, false>;
-                            }
-                            break;
-                        default:
-                            ARM_COMPUTE_ERROR("Unsupported pooling type!");
+                        _func = &NEPoolingLayerKernel::pooling3_f16_nchw;
                     }
-                    break;
+                    else
+                    {
+                        _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
+                    }
+                }
+                break;
                 default:
-                    switch(pool_type)
+                {
+                    if(is_nchw)
                     {
-                        case PoolingType::AVG:
-                            if(is_nchw)
-                            {
-                                _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::AVG, false>;
-                            }
-                            else
-                            {
-                                _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::AVG, false>;
-                            }
-                            break;
-                        case PoolingType::L2:
-                            if(is_nchw)
-                            {
-                                _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::L2, false>;
-                            }
-                            else
-                            {
-                                _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::L2, false>;
-                            }
-                            break;
-                        case PoolingType::MAX:
-                            if(is_nchw)
-                            {
-                                _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::MAX, false>;
-                            }
-                            else
-                            {
-                                _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::MAX, false>;
-                            }
-                            break;
-                        default:
-                            ARM_COMPUTE_ERROR("Unsupported pooling type!");
+                        _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw;
+                    }
+                    else
+                    {
+                        _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
                     }
                     break;
+                }
+                break;
             }
         }
         else
         {
-            switch(pool_type)
+            if(is_nchw)
             {
-                case PoolingType::AVG:
-                    if(is_nchw)
-                    {
-                        _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::AVG, false>;
-                    }
-                    else
-                    {
-                        _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::AVG, false>;
-                    }
-                    break;
-                case PoolingType::L2:
-                    if(is_nchw)
-                    {
-                        _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::L2, false>;
-                    }
-                    else
-                    {
-                        _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::L2, false>;
-                    }
-                    break;
-                case PoolingType::MAX:
-                    if(is_nchw)
-                    {
-                        _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::MAX, false>;
-                    }
-                    else
-                    {
-                        _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::MAX, false>;
-                    }
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Unsupported pooling type!");
+                _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw;
+            }
+            else
+            {
+                _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
             }
         }
     }
@@ -626,206 +470,78 @@
     {
         if(_is_square)
         {
-            switch(pool_size_x)
+            switch(pool_size.x())
             {
                 case 2:
-                    switch(pool_type)
+                {
+                    if(is_nchw)
                     {
-                        case PoolingType::AVG:
-                            if(is_nchw)
-                            {
-                                _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_f32_nchw<PoolingType::AVG, false>;
-                            }
-                            else
-                            {
-                                _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, false>;
-                            }
-                            break;
-                        case PoolingType::L2:
-                            if(is_nchw)
-                            {
-                                _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32_nchw<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling2_f32_nchw<PoolingType::L2, false>;
-                            }
-                            else
-                            {
-                                _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, false>;
-                            }
-                            break;
-                        case PoolingType::MAX:
-                            if(is_nchw)
-                            {
-                                _func = &NEPoolingLayerKernel::pooling2_f32_nchw<PoolingType::MAX, false>;
-                            }
-                            else
-                            {
-                                _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::MAX, false>;
-                            }
-                            break;
-                        default:
-                            ARM_COMPUTE_ERROR("Unsupported pooling type!");
+                        _func = &NEPoolingLayerKernel::pooling2_f32_nchw;
+                    }
+                    else
+                    {
+                        _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
                     }
                     break;
+                }
                 case 3:
-                    switch(pool_type)
+                {
+                    if(is_nchw)
                     {
-                        case PoolingType::AVG:
-                            if(is_nchw)
-                            {
-                                _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_f32_nchw<PoolingType::AVG, false>;
-                            }
-                            else
-                            {
-                                _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, false>;
-                            }
-                            break;
-                        case PoolingType::L2:
-                            if(is_nchw)
-                            {
-                                _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32_nchw<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling3_f32_nchw<PoolingType::L2, false>;
-                            }
-                            else
-                            {
-                                _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, false>;
-                            }
-                            break;
-                        case PoolingType::MAX:
-                            if(is_nchw)
-                            {
-                                _func = &NEPoolingLayerKernel::pooling3_f32_nchw<PoolingType::MAX, false>;
-                            }
-                            else
-                            {
-                                _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::MAX, false>;
-                            }
-                            break;
-                        default:
-                            ARM_COMPUTE_ERROR("Unsupported pooling type!");
+                        _func = &NEPoolingLayerKernel::pooling3_f32_nchw;
+                    }
+                    else
+                    {
+                        _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
                     }
                     break;
+                }
                 case 7:
-                    switch(pool_type)
+                {
+                    if(is_nchw)
                     {
-                        case PoolingType::AVG:
-                            if(is_nchw)
-                            {
-                                _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling7_f32_nchw<PoolingType::AVG, false>;
-                            }
-                            else
-                            {
-                                _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, false>;
-                            }
-                            break;
-                        case PoolingType::L2:
-                            if(is_nchw)
-                            {
-                                _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32_nchw<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling7_f32_nchw<PoolingType::L2, false>;
-                            }
-                            else
-                            {
-                                _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, false>;
-                            }
-                            break;
-                        case PoolingType::MAX:
-                            if(is_nchw)
-                            {
-                                _func = &NEPoolingLayerKernel::pooling7_f32_nchw<PoolingType::MAX, false>;
-                            }
-                            else
-                            {
-                                _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::MAX, false>;
-                            }
-                            break;
-                        default:
-                            ARM_COMPUTE_ERROR("Unsupported pooling type!");
+                        _func = &NEPoolingLayerKernel::pooling7_f32_nchw;
+                    }
+                    else
+                    {
+                        _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
                     }
                     break;
+                }
                 default:
-                    switch(pool_type)
+                {
+                    if(is_nchw)
                     {
-                        case PoolingType::AVG:
-                            if(is_nchw)
-                            {
-                                _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::AVG, false>;
-                            }
-                            else
-                            {
-                                _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, false>;
-                            }
-                            break;
-                        case PoolingType::L2:
-                            if(is_nchw)
-                            {
-                                _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::L2, false>;
-                            }
-                            else
-                            {
-                                _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, false>;
-                            }
-                            break;
-                        case PoolingType::MAX:
-                            if(is_nchw)
-                            {
-                                _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::MAX, false>;
-                            }
-                            else
-                            {
-                                _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::MAX, false>;
-                            }
-                            break;
-                        default:
-                            ARM_COMPUTE_ERROR("Unsupported pooling type!");
+                        _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw;
+                    }
+                    else
+                    {
+                        _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
                     }
                     break;
+                }
             }
         }
         else
         {
-            switch(pool_type)
+            if(is_nchw)
             {
-                case PoolingType::AVG:
-                    if(is_nchw)
-                    {
-                        _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::AVG, false>;
-                    }
-                    else
-                    {
-                        _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, false>;
-                    }
-                    break;
-                case PoolingType::L2:
-                    if(is_nchw)
-                    {
-                        _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::L2, false>;
-                    }
-                    else
-                    {
-                        _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, false>;
-                    }
-                    break;
-                case PoolingType::MAX:
-                    if(is_nchw)
-                    {
-                        _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::MAX, false>;
-                    }
-                    else
-                    {
-                        _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::MAX, false>;
-                    }
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Unsupported pooling type!");
+                _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw;
+            }
+            else
+            {
+                _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
             }
         }
     }
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), output->info(), pool_info, _num_elems_processed_per_iteration, _border_size, pooled_w, pooled_h, pool_size_x, pool_size_y);
+    auto win_config = validate_and_configure_window(input->info(), output->info(), pool_info, _num_elems_processed_per_iteration, _border_size, pooled_w, pooled_h, pool_size.x(), pool_size.y());
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     INEKernel::configure(win_config.second);
 }
 
-template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::pooling2_qasymm8_nchw(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling2_qasymm8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
 {
     Iterator input(_input, window_input);
     Iterator output(_output, window);
@@ -879,9 +595,9 @@
             uint16x8_t res_lower = vcombine_u16(vpsum.val[0], vpsum.val[1]);
 
             // Scale lower result
-            scale_vector_s16x8<exclude_padding>(res_lower, id, 0, scale_step_x,
-                                                pool_size, upper_bound_w, upper_bound_h,
-                                                pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+            scale_vector_s16x8(exclude_padding, res_lower, id, 0, scale_step_x,
+                               pool_size, upper_bound_w, upper_bound_h,
+                               pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
             lower_res = vmovn_u16(res_lower);
 
             // Compute upper result for stride_x == 1
@@ -907,9 +623,9 @@
                 uint16x8_t res_upper = vcombine_u16(vpsum_shifted.val[0], vpsum_shifted.val[1]);
 
                 // Scale upper result
-                scale_vector_s16x8<exclude_padding>(res_upper, id, 1, 2,
-                                                    pool_size, upper_bound_w, upper_bound_h,
-                                                    pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+                scale_vector_s16x8(exclude_padding, res_upper, id, 1, 2,
+                                   pool_size, upper_bound_w, upper_bound_h,
+                                   pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
                 upper_res = vmovn_u16(res_upper);
             }
         }
@@ -938,9 +654,10 @@
     input, output);
 }
 
-template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::pooling3_f16_nchw(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling3_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
 {
+    ARM_COMPUTE_UNUSED(pooling_type);
+    ARM_COMPUTE_UNUSED(exclude_padding);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     Iterator input(_input, window_input);
     Iterator output(_output, window);
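
With the options now arriving as ordinary parameters, builds without __ARM_FEATURE_FP16_VECTOR_ARITHMETIC compile the FP16 kernels down to an error stub, which is presumably why the ARM_COMPUTE_UNUSED calls are added: they keep the parameters referenced so that configuration does not trigger unused-parameter warnings. A sketch of the idea with a hypothetical macro and function name:

// Hedged sketch; SUPPRESS_UNUSED and f16_stub are illustrative names.
#define SUPPRESS_UNUSED(x) static_cast<void>(x)

void f16_stub(int pooling_type, bool exclude_padding)
{
    SUPPRESS_UNUSED(pooling_type);    // referenced even when the FP16 body
    SUPPRESS_UNUSED(exclude_padding); // below is compiled out
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    // ... real FP16 pooling path would use both parameters here ...
#endif
}
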
@@ -978,7 +695,7 @@
         if(pooling_type != PoolingType::MAX)
         {
             // Calculate scale
-            const float       scale   = calculate_avg_scale<exclude_padding, DataLayout::NCHW>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+            const float       scale   = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
             const float16x4_t scale_v = vdup_n_f16(scale);
             // Perform pooling
             const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
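
calculate_avg_scale, now taking exclude_padding and the data layout as runtime arguments, supplies the averaging factor: the reciprocal of the number of window elements counted at this output position. A sketch of that computation under the stated assumptions (illustrative names, NCHW geometry; the upper bounds are assumed to already exclude right/bottom padding when exclude_padding is set):

#include <algorithm>

// Hedged sketch of the averaging scale for one output coordinate.
float avg_scale(int out_x, int out_y, int pool_w, int pool_h,
                int upper_bound_w, int upper_bound_h,
                int pad_left, int pad_top, int stride_x, int stride_y,
                bool exclude_padding)
{
    int       start_x = out_x * stride_x - pad_left;
    int       start_y = out_y * stride_y - pad_top;
    const int end_x   = std::min(start_x + pool_w, upper_bound_w);
    const int end_y   = std::min(start_y + pool_h, upper_bound_h);
    if(exclude_padding)
    {
        // Padded positions on the left/top are not counted in the average.
        start_x = std::max(0, start_x);
        start_y = std::max(0, start_y);
    }
    return 1.0f / ((end_x - start_x) * (end_y - start_y));
}
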
@@ -1008,9 +725,10 @@
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 }
 
-template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::pooling2_f16_nchw(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling2_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
 {
+    ARM_COMPUTE_UNUSED(pooling_type);
+    ARM_COMPUTE_UNUSED(exclude_padding);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     Iterator      input(_input, window_input);
     Iterator      output(_output, window);
@@ -1042,7 +760,7 @@
 
         if(pooling_type != PoolingType::MAX)
         {
-            const float       scale   = calculate_avg_scale<exclude_padding, DataLayout::NCHW>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+            const float       scale   = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
             const float16x4_t scale_v = vdup_n_f16(scale);
 
             const float16x4_t sum_data = vadd_f16(top_data, bottom_data);
@@ -1071,71 +789,7 @@
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 }
 
-template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::pooling2_f32_nchw(const Window &window_input, const Window &window)
-{
-    Iterator input(_input, window_input);
-    Iterator output(_output, window);
-
-    constexpr int pool_size       = 2;
-    const int     pool_pad_right  = _pool_info.pad_stride_info().pad_right();
-    const int     pool_pad_top    = _pool_info.pad_stride_info().pad_top();
-    const int     pool_pad_left   = _pool_info.pad_stride_info().pad_left();
-    const int     pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
-    int           pool_stride_x   = 0;
-    int           pool_stride_y   = 0;
-    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
-    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
-    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
-
-    const uint8_t *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
-    const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        float32x2_t top_data    = vld1_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
-        float32x2_t bottom_data = vld1_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
-        float32x2_t res         = {};
-        float       final_res   = 0;
-
-        // Get power of 2 in case of l2 pooling
-        if(pooling_type == PoolingType::L2)
-        {
-            top_data    = vmul_f32(top_data, top_data);
-            bottom_data = vmul_f32(bottom_data, bottom_data);
-        }
-
-        if(pooling_type != PoolingType::MAX)
-        {
-            // Calculate scale
-            float             scale   = calculate_avg_scale<exclude_padding, DataLayout::NCHW>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
-            const float32x2_t scale_v = vdup_n_f32(scale);
-
-            // Perform pooling
-            const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
-            res                        = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
-        }
-        else
-        {
-            const float32x2_t max_data = vmax_f32(top_data, bottom_data);
-            res                        = vpmax_f32(max_data, max_data);
-        }
-        final_res = vget_lane_f32(res, 0);
-
-        // Calculate square-root in case of l2 pooling
-        if(pooling_type == PoolingType::L2)
-        {
-            final_res = sqrt(final_res);
-        }
-
-        // Store result
-        *(reinterpret_cast<float *>(output.ptr())) = final_res;
-    },
-    input, output);
-}
-
-template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::pooling3_qasymm8_nchw(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling3_qasymm8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
 {
     Iterator input(_input, window_input);
     Iterator output(_output, window);
@@ -1212,21 +866,21 @@
                     vgetq_lane_u16(final_sum.val[1], 6),
                 };
 
-                scale_vector_s16x8<exclude_padding>(res, id, 0, 1,
-                                                    pool_size, upper_bound_w, upper_bound_h,
-                                                    pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+                scale_vector_s16x8(exclude_padding, res, id, 0, 1,
+                                   pool_size, upper_bound_w, upper_bound_h,
+                                   pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
                 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), vmovn_u16(res));
             }
             else
             {
                 // Scale lower result
-                scale_vector_s16x8<exclude_padding>(final_sum.val[0], id, 0, 1,
-                                                    pool_size, upper_bound_w, upper_bound_h,
-                                                    pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+                scale_vector_s16x8(exclude_padding, final_sum.val[0], id, 0, 1,
+                                   pool_size, upper_bound_w, upper_bound_h,
+                                   pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
                 // Scale upper result
-                scale_vector_s16x8<exclude_padding>(final_sum.val[1], id, 8, 1,
-                                                    pool_size, upper_bound_w, upper_bound_h,
-                                                    pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+                scale_vector_s16x8(exclude_padding, final_sum.val[1], id, 8, 1,
+                                   pool_size, upper_bound_w, upper_bound_h,
+                                   pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
                 const uint8x16_t res = vcombine_u8(vmovn_u16(final_sum.val[0]), vmovn_u16(final_sum.val[1]));
                 vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), res);
             }
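
In the quantized path the same idea is applied per lane: scale_vector_s16x8 gives each of the eight accumulated column sums its own factor, because columns near the left or right border see different numbers of valid inputs, before the results are narrowed back to 8 bits. A scalar model of that step (illustrative names, not the library routine):

#include <cstdint>

// Hedged scalar model of per-lane scaling followed by narrowing to 8 bits.
void scale_and_narrow(const uint16_t sums[8], const float scales[8], uint8_t out[8])
{
    for(int lane = 0; lane < 8; ++lane)
    {
        const float scaled = sums[lane] * scales[lane];            // per-column averaging factor
        out[lane]          = static_cast<uint8_t>(scaled + 0.5f);  // round to nearest, then narrow
    }
}
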
@@ -1254,160 +908,10 @@
     input, output);
 }
 
-template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::pooling3_f32_nchw(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::poolingMxN_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
 {
-    Iterator input(_input, window_input);
-    Iterator output(_output, window);
-
-    constexpr const int pool_size       = 3;
-    const int           pool_pad_right  = _pool_info.pad_stride_info().pad_right();
-    const int           pool_pad_top    = _pool_info.pad_stride_info().pad_top();
-    const int           pool_pad_left   = _pool_info.pad_stride_info().pad_left();
-    const int           pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
-    int                 pool_stride_x   = 0;
-    int                 pool_stride_y   = 0;
-    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
-    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
-    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
-
-    const uint8_t *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
-    const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
-    const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        float32x4_t top_data    = vld1q_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
-        float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(input_middle_ptr + input.offset()));
-        float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
-        float32x2_t res         = {};
-        float       final_res   = 0;
-
-        // Get power of 2 in case of l2 pooling
-        if(pooling_type == PoolingType::L2)
-        {
-            top_data    = vmulq_f32(top_data, top_data);
-            middle_data = vmulq_f32(middle_data, middle_data);
-            bottom_data = vmulq_f32(bottom_data, bottom_data);
-        }
-
-        if(pooling_type != PoolingType::MAX)
-        {
-            // Calculate scale
-            float             scale   = calculate_avg_scale<exclude_padding, DataLayout::NCHW>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
-            const float32x2_t scale_v = vdup_n_f32(scale);
-
-            // Perform pooling
-            const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
-            res                        = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
-            res                        = vmul_f32(vpadd_f32(res, res), scale_v);
-        }
-        else
-        {
-            const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
-            res                        = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data, 3)), vget_low_f32(max_data));
-            res                        = vpmax_f32(res, res);
-        }
-        final_res = vget_lane_f32(res, 0);
-
-        // Calculate square-root in case of l2 pooling
-        if(pooling_type == PoolingType::L2)
-        {
-            final_res = sqrt(final_res);
-        }
-
-        // Store result
-        *(reinterpret_cast<float *>(output.ptr())) = final_res;
-    },
-    input, output);
-}
-
-template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::pooling7_f32_nchw(const Window &window_input, const Window &window)
-{
-    Iterator input(_input, window_input);
-    Iterator output(_output, window);
-
-    constexpr const int pool_size       = 7;
-    const int           pool_pad_right  = _pool_info.pad_stride_info().pad_right();
-    const int           pool_pad_top    = _pool_info.pad_stride_info().pad_top();
-    const int           pool_pad_left   = _pool_info.pad_stride_info().pad_left();
-    const int           pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
-    int                 pool_stride_x   = 0;
-    int                 pool_stride_y   = 0;
-    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
-    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
-    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
-
-    std::array<const uint8_t *, pool_size> input_ptrs{ {} };
-    for(int i = 0; i < pool_size; ++i)
-    {
-        input_ptrs[i] = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i));
-    }
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        float32x2_t res       = {};
-        float       final_res = 0.f;
-        if(pooling_type != PoolingType::MAX)
-        {
-            // Calculate scale
-            float             scale   = calculate_avg_scale<exclude_padding, DataLayout::NCHW>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
-            const float32x2_t scale_v = vdup_n_f32(scale);
-
-            // Perform pooling
-            float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
-            // Get power of 2 in case of l2 pooling
-            if(pooling_type == PoolingType::L2)
-            {
-                data.val[0] = vmulq_f32(data.val[0], data.val[0]);
-                data.val[1] = vmulq_f32(data.val[1], data.val[1]);
-            }
-            float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
-            for(int i = 1; i < pool_size; ++i)
-            {
-                data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
-                // Get power of 2 in case of l2 pooling
-                if(pooling_type == PoolingType::L2)
-                {
-                    data.val[0] = vmulq_f32(data.val[0], data.val[0]);
-                    data.val[1] = vmulq_f32(data.val[1], data.val[1]);
-                }
-                sum_data = vaddq_f32(sum_data, data.val[0]);
-                sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
-            }
-            res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
-            res = vmul_f32(vpadd_f32(res, res), scale_v);
-        }
-        else
-        {
-            float32x4x2_t max_data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
-            for(int i = 1; i < pool_size; ++i)
-            {
-                const float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
-                max_data                 = vmax2q_f32(max_data, data);
-            }
-            res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data.val[1], 3)), vget_low_f32(max_data.val[1]));
-            res = vpmax_f32(res, vpmax_f32(vget_high_f32(max_data.val[0]), vget_low_f32(max_data.val[0])));
-            res = vpmax_f32(res, res);
-        }
-        final_res = vget_lane_f32(res, 0);
-
-        // Calculate square-root in case of l2 pooling
-        if(pooling_type == PoolingType::L2)
-        {
-            final_res = sqrt(final_res);
-        }
-
-        // Store result
-        *(reinterpret_cast<float *>(output.ptr())) = final_res;
-    },
-    input, output);
-}
-
-template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::poolingMxN_f16_nchw(const Window &window_input, const Window &window)
-{
+    ARM_COMPUTE_UNUSED(pooling_type);
+    ARM_COMPUTE_UNUSED(exclude_padding);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     Iterator input(_input, window_input);
     Iterator output(_output, window);
@@ -1432,7 +936,7 @@
         if(pooling_type != PoolingType::MAX)
         {
             // Calculate scale
-            const float scale = calculate_avg_scale<exclude_padding, DataLayout::NCHW>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+            const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
 
             // Perform pooling
 
@@ -1528,9 +1032,10 @@
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 }
 
-template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
 {
+    ARM_COMPUTE_UNUSED(pooling_type);
+    ARM_COMPUTE_UNUSED(exclude_padding);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     Iterator input(_input, window_input);
     Iterator output(_output, window);
@@ -1564,8 +1069,8 @@
         if(pooling_type != PoolingType::MAX)
         {
             // Calculate scale
-            const float scale = calculate_avg_scale<exclude_padding, DataLayout::NHWC>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
-                                                                                       pool_stride_y);
+            const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+                                                    pool_stride_y);
             const float16x8_t scale_v = vdupq_n_f16(scale);
 
             // Perform pooling
@@ -1625,8 +1130,7 @@
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 }
 
-template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::poolingMxN_f32_nchw(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::poolingMxN_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
 {
     Iterator input(_input, window_input);
     Iterator output(_output, window);
@@ -1650,7 +1154,7 @@
         if(pooling_type != PoolingType::MAX)
         {
             // Calculate scale
-            const float scale = calculate_avg_scale<exclude_padding, DataLayout::NCHW>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+            const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
 
             // Perform pooling
             float32x4_t vres = vdupq_n_f32(0.0f);
@@ -1748,8 +1252,218 @@
     input, output);
 }
 
-template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling2_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
+{
+    Iterator input(_input, window_input);
+    Iterator output(_output, window);
+
+    constexpr int pool_size       = 2;
+    const int     pool_pad_right  = _pool_info.pad_stride_info().pad_right();
+    const int     pool_pad_top    = _pool_info.pad_stride_info().pad_top();
+    const int     pool_pad_left   = _pool_info.pad_stride_info().pad_left();
+    const int     pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
+    int           pool_stride_x   = 0;
+    int           pool_stride_y   = 0;
+    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
+
+    const uint8_t *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+    const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        float32x2_t top_data    = vld1_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
+        float32x2_t bottom_data = vld1_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
+        float32x2_t res         = {};
+        float       final_res   = 0;
+
+        // Get power of 2 in case of l2 pooling
+        if(pooling_type == PoolingType::L2)
+        {
+            top_data    = vmul_f32(top_data, top_data);
+            bottom_data = vmul_f32(bottom_data, bottom_data);
+        }
+
+        if(pooling_type != PoolingType::MAX)
+        {
+            // Calculate scale
+            float             scale   = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+            const float32x2_t scale_v = vdup_n_f32(scale);
+
+            // Perform pooling
+            const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
+            res                        = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
+        }
+        else
+        {
+            const float32x2_t max_data = vmax_f32(top_data, bottom_data);
+            res                        = vpmax_f32(max_data, max_data);
+        }
+        final_res = vget_lane_f32(res, 0);
+
+        // Calculate square-root in case of l2 pooling
+        if(pooling_type == PoolingType::L2)
+        {
+            final_res = sqrt(final_res);
+        }
+
+        // Store result
+        *(reinterpret_cast<float *>(output.ptr())) = final_res;
+    },
+    input, output);
+}
+
+void NEPoolingLayerKernel::pooling3_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
+{
+    Iterator input(_input, window_input);
+    Iterator output(_output, window);
+
+    constexpr const int pool_size       = 3;
+    const int           pool_pad_right  = _pool_info.pad_stride_info().pad_right();
+    const int           pool_pad_top    = _pool_info.pad_stride_info().pad_top();
+    const int           pool_pad_left   = _pool_info.pad_stride_info().pad_left();
+    const int           pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
+    int                 pool_stride_x   = 0;
+    int                 pool_stride_y   = 0;
+    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
+
+    const uint8_t *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+    const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+    const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        float32x4_t top_data    = vld1q_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
+        float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(input_middle_ptr + input.offset()));
+        float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
+        float32x2_t res         = {};
+        float       final_res   = 0;
+
+        // Get power of 2 in case of l2 pooling
+        if(pooling_type == PoolingType::L2)
+        {
+            top_data    = vmulq_f32(top_data, top_data);
+            middle_data = vmulq_f32(middle_data, middle_data);
+            bottom_data = vmulq_f32(bottom_data, bottom_data);
+        }
+
+        if(pooling_type != PoolingType::MAX)
+        {
+            // Calculate scale
+            float             scale   = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+            const float32x2_t scale_v = vdup_n_f32(scale);
+
+            // Perform pooling
+            const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
+            res                        = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
+            res                        = vmul_f32(vpadd_f32(res, res), scale_v);
+        }
+        else
+        {
+            const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
+            res                        = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data, 3)), vget_low_f32(max_data));
+            res                        = vpmax_f32(res, res);
+        }
+        final_res = vget_lane_f32(res, 0);
+
+        // Calculate square-root in case of l2 pooling
+        if(pooling_type == PoolingType::L2)
+        {
+            final_res = sqrt(final_res);
+        }
+
+        // Store result
+        *(reinterpret_cast<float *>(output.ptr())) = final_res;
+    },
+    input, output);
+}
+
+void NEPoolingLayerKernel::pooling7_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
+{
+    Iterator input(_input, window_input);
+    Iterator output(_output, window);
+
+    constexpr const int pool_size       = 7;
+    const int           pool_pad_right  = _pool_info.pad_stride_info().pad_right();
+    const int           pool_pad_top    = _pool_info.pad_stride_info().pad_top();
+    const int           pool_pad_left   = _pool_info.pad_stride_info().pad_left();
+    const int           pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
+    int                 pool_stride_x   = 0;
+    int                 pool_stride_y   = 0;
+    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
+
+    std::array<const uint8_t *, pool_size> input_ptrs{ {} };
+    for(int i = 0; i < pool_size; ++i)
+    {
+        input_ptrs[i] = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i));
+    }
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        float32x2_t res       = {};
+        float       final_res = 0.f;
+        if(pooling_type != PoolingType::MAX)
+        {
+            // Calculate scale
+            float             scale   = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+            const float32x2_t scale_v = vdup_n_f32(scale);
+
+            // Perform pooling
+            float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
+            // Get power of 2 in case of l2 pooling
+            if(pooling_type == PoolingType::L2)
+            {
+                data.val[0] = vmulq_f32(data.val[0], data.val[0]);
+                data.val[1] = vmulq_f32(data.val[1], data.val[1]);
+            }
+            float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
+            for(int i = 1; i < pool_size; ++i)
+            {
+                data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
+                // Get power of 2 in case of l2 pooling
+                if(pooling_type == PoolingType::L2)
+                {
+                    data.val[0] = vmulq_f32(data.val[0], data.val[0]);
+                    data.val[1] = vmulq_f32(data.val[1], data.val[1]);
+                }
+                sum_data = vaddq_f32(sum_data, data.val[0]);
+                sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
+            }
+            res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
+            res = vmul_f32(vpadd_f32(res, res), scale_v);
+        }
+        else
+        {
+            float32x4x2_t max_data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
+            for(int i = 1; i < pool_size; ++i)
+            {
+                const float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
+                max_data                 = vmax2q_f32(max_data, data);
+            }
+            res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data.val[1], 3)), vget_low_f32(max_data.val[1]));
+            res = vpmax_f32(res, vpmax_f32(vget_high_f32(max_data.val[0]), vget_low_f32(max_data.val[0])));
+            res = vpmax_f32(res, res);
+        }
+        final_res = vget_lane_f32(res, 0);
+
+        // Calculate square-root in case of l2 pooling
+        if(pooling_type == PoolingType::L2)
+        {
+            final_res = sqrt(final_res);
+        }
+
+        // Store result
+        *(reinterpret_cast<float *>(output.ptr())) = final_res;
+    },
+    input, output);
+}
+
+void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
 {
     Iterator input(_input, window_input);
     Iterator output(_output, window);
@@ -1783,8 +1497,8 @@
         if(pooling_type != PoolingType::MAX)
         {
             // Calculate scale
-            const float scale = calculate_avg_scale<exclude_padding, DataLayout::NHWC>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
-                                                                                       pool_stride_y);
+            const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+                                                    pool_stride_y);
             const float32x4_t scale_v = vdupq_n_f32(scale);
 
             // Perform pooling
@@ -1837,8 +1551,7 @@
     input, output);
 }
 
-template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::poolingMxN_qasymm8_nchw(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::poolingMxN_qasymm8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
 {
     Iterator input(_input, window_input);
     Iterator output(_output, window);
@@ -1865,7 +1578,7 @@
             uint32_t   sres = 0;
 
             // Calculate scale
-            const float scale = calculate_avg_scale<exclude_padding, DataLayout::NCHW>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+            const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
 
             // Perform pooling
             for(int y = 0; y < pool_size_y; ++y)
@@ -1933,8 +1646,7 @@
     input, output);
 }
 
-template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
 {
     Iterator input(_input, window_input);
     Iterator output(_output, window);
@@ -1973,8 +1685,8 @@
             uint32x4_t vres4 = vdupq_n_u32(0);
 
             // Calculate scale
-            const float scale = calculate_avg_scale<exclude_padding, DataLayout::NHWC>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
-                                                                                       pool_stride_y);
+            const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+                                                    pool_stride_y);
             const float32x4_t scale_v = vdupq_n_f32(scale);
 
             // Perform pooling
@@ -2073,9 +1785,10 @@
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
     ARM_COMPUTE_ERROR_ON(_func == nullptr);
 
-    const unsigned int pool_stride_x = _pool_info.pad_stride_info().stride().first;
-    const unsigned int pool_stride_y = _pool_info.pad_stride_info().stride().second;
-    const unsigned int pool_size     = _pool_info.pool_size().width;
+    const unsigned int pool_stride_x   = _pool_info.pad_stride_info().stride().first;
+    const unsigned int pool_stride_y   = _pool_info.pad_stride_info().stride().second;
+    const unsigned int pool_size       = _pool_info.pool_size().width;
+    const bool         exclude_padding = _pool_info.exclude_padding();
 
     Window window_input(window);
     if(_input->info()->data_layout() == DataLayout::NCHW)
@@ -2093,6 +1806,7 @@
                 }
                 break;
             }
+
             case DataType::F16:
             case DataType::F32:
             {
@@ -2115,5 +1829,5 @@
     }
 
     // Run function
-    (this->*_func)(window_input, window);
+    (this->*_func)(window_input, window, _pool_info.pool_type(), exclude_padding);
 }
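The NCHW and NHWC pooling paths above now call a shared calculate_avg_scale helper, passing exclude_padding and the data layout at run time instead of as template parameters. The helper's body is outside this hunk; a minimal sketch of the averaging factor it is expected to return, with the _sketch name and the exact clamping behaviour as assumptions, is:

    #include <algorithm>

    // Reciprocal of the number of elements the average is taken over: the pooling window is
    // clamped to the padded bounds, and additionally to the unpadded region when
    // exclude_padding is set (assumed behaviour, not taken from this patch).
    float calculate_avg_scale_sketch(bool exclude_padding, int x, int y, int pool_size_x, int pool_size_y,
                                     int upper_bound_w, int upper_bound_h, int pad_x, int pad_y,
                                     int stride_x, int stride_y)
    {
        int       start_x = x * stride_x - pad_x;
        int       start_y = y * stride_y - pad_y;
        const int end_x   = std::min(start_x + pool_size_x, upper_bound_w);
        const int end_y   = std::min(start_y + pool_size_y, upper_bound_h);
        if(exclude_padding)
        {
            start_x = std::max(0, start_x);
            start_y = std::max(0, start_y);
        }
        return 1.f / ((end_y - start_y) * (end_x - start_x));
    }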
diff --git a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
index 2f63179..365fc83 100644
--- a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
@@ -67,8 +67,7 @@
 
     if(output != nullptr && output->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(get_data_layout_dimension_index(input1->data_layout(), DataLayoutDimension::HEIGHT)) != 2);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input1, output);
+        ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != 2);
     }
 
     return Status{};
@@ -76,29 +75,13 @@
 
 std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, const PriorBoxLayerInfo &info)
 {
-    ARM_COMPUTE_UNUSED(input2);
+    ARM_COMPUTE_UNUSED(input1, input2);
 
-    Window win            = {};
-    bool   window_changed = false;
-    switch(input1->data_layout())
-    {
-        case DataLayout::NCHW:
-        {
-            const int          num_priors                        = info.aspect_ratios().size() * info.min_sizes().size() + info.max_sizes().size();
-            const unsigned int num_elems_processed_per_iteration = 4 * num_priors;
-            win                                                  = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-            AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-            window_changed = update_window_and_padding(win, output_access);
-            break;
-        }
-        case DataLayout::NHWC:
-        {
-            win = calculate_max_window(*output, Steps());
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Not implemented");
-    };
+    const int              num_priors                        = info.aspect_ratios().size() * info.min_sizes().size() + info.max_sizes().size();
+    const unsigned int     num_elems_processed_per_iteration = 4 * num_priors;
+    Window                 win                               = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    bool                   window_changed = update_window_and_padding(win, output_access);
 
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, win);
@@ -106,11 +89,10 @@
 } // namespace
 
 NEPriorBoxLayerKernel::NEPriorBoxLayerKernel()
-    : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _info()
+    : _input1(nullptr), _input2(nullptr), _output(nullptr), _info()
 {
 }
 
-template <DataLayout DL>
 void NEPriorBoxLayerKernel::store_coordinates(float *out, const int offset, const float center_x, const float center_y, const float box_width, const float box_height, const int width,
                                               const int height)
 {
@@ -119,49 +101,23 @@
     float xmax = (center_x + box_width / 2.f) / width;
     float ymax = (center_y + box_height / 2.f) / height;
 
-    switch(DL)
+    float32x4_t vec_elements = { xmin, ymin, xmax, ymax };
+    if(_info.clip())
     {
-        case DataLayout::NCHW:
-        {
-            float32x4_t vec_elements = { xmin, ymin, xmax, ymax };
-            if(_info.clip())
-            {
-                static const float32x4_t CONST_0 = vdupq_n_f32(0.f);
-                static const float32x4_t CONST_1 = vdupq_n_f32(1.f);
-                vec_elements                     = vmaxq_f32(vminq_f32(vec_elements, CONST_1), CONST_0);
-            }
-            vst1q_f32(out + offset, vec_elements);
-        }
-        break;
-        case DataLayout::NHWC:
-        {
-            const int output_offset = _output->info()->strides_in_bytes()[1] / _output->info()->element_size();
-            if(_info.clip())
-            {
-                xmin = std::min(std::max(xmin, 0.f), 1.f);
-                ymin = std::min(std::max(ymin, 0.f), 1.f);
-                xmax = std::min(std::max(xmax, 0.f), 1.f);
-                ymax = std::min(std::max(ymax, 0.f), 1.f);
-            }
-
-            *(out + output_offset * offset)       = xmin;
-            *(out + output_offset * (offset + 1)) = ymin;
-            *(out + output_offset * (offset + 2)) = xmax;
-            *(out + output_offset * (offset + 3)) = ymax;
-        }
-        break;
-        default:
-            ARM_COMPUTE_ERROR("Not implemented");
+        static const float32x4_t CONST_0 = vdupq_n_f32(0.f);
+        static const float32x4_t CONST_1 = vdupq_n_f32(1.f);
+        vec_elements                     = vmaxq_f32(vminq_f32(vec_elements, CONST_1), CONST_0);
     }
+    vst1q_f32(out + offset, vec_elements);
 }
 
-template <DataLayout DL>
 void NEPriorBoxLayerKernel::calculate_prior_boxes(const Window &window)
 {
     const int num_priors = _info.aspect_ratios().size() * _info.min_sizes().size() + _info.max_sizes().size();
 
-    const int width_idx  = get_data_layout_dimension_index(DL, DataLayoutDimension::WIDTH);
-    const int height_idx = get_data_layout_dimension_index(DL, DataLayoutDimension::HEIGHT);
+    const DataLayout data_layout = _input1->info()->data_layout();
+    const int        width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int        height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
 
     const int layer_width  = _input1->info()->dimension(width_idx);
     const int layer_height = _input1->info()->dimension(height_idx);
@@ -182,44 +138,17 @@
         step_y = static_cast<float>(img_height) / layer_height;
     }
 
-    Window slice = {};
-
-    switch(DL)
-    {
-        case DataLayout::NCHW:
-            slice = window.first_slice_window_2D();
-            slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 2));
-            break;
-        case DataLayout::NHWC:
-            slice = window.first_slice_window_3D();
-            slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 4 * num_priors));
-            slice.set(Window::DimZ, Window::Dimension(0, _output->info()->dimension(2), 2));
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Not implemented");
-    }
+    Window slice = window.first_slice_window_2D();
+    slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 2));
 
     Iterator output(_output, slice);
     execute_window_loop(slice, [&](const Coordinates & id)
     {
         float center_x = 0;
         float center_y = 0;
-        int   idx      = 0;
-        switch(DL)
-        {
-            case DataLayout::NCHW:
-                idx      = id.x() / (4 * num_priors);
-                center_x = (static_cast<float>(idx % layer_width) + _info.offset()) * step_x;
-                center_y = (static_cast<float>(idx / layer_width) + _info.offset()) * step_y;
-                break;
-            case DataLayout::NHWC:
-                idx      = id.y() / (4 * num_priors);
-                center_x = (static_cast<float>(idx % layer_width) + _info.offset()) * step_x;
-                center_y = (static_cast<float>(idx / layer_width) + _info.offset()) * step_y;
-                break;
-            default:
-                ARM_COMPUTE_ERROR("Not implemented");
-        }
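+        // Each spatial location stores 4 coordinates per prior, so dividing the element index
+        // by 4 * num_priors recovers the linear (x, y) grid position of the current box set.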
+        int   idx      = id.x() / (4 * num_priors);
+        center_x       = (static_cast<float>(idx % layer_width) + _info.offset()) * step_x;
+        center_y       = (static_cast<float>(idx / layer_width) + _info.offset()) * step_y;
 
         float box_width;
         float box_height;
@@ -231,7 +160,7 @@
             const float min_size = _info.min_sizes().at(i);
             box_width            = min_size;
             box_height           = min_size;
-            store_coordinates<DL>(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
+            store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
             offset += 4;
 
             if(!_info.max_sizes().empty())
@@ -240,7 +169,7 @@
                 box_width            = std::sqrt(min_size * max_size);
                 box_height           = box_width;
 
-                store_coordinates<DL>(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
+                store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
                 offset += 4;
             }
 
@@ -255,50 +184,27 @@
                 box_width  = min_size * sqrt(ar);
                 box_height = min_size / sqrt(ar);
 
-                store_coordinates<DL>(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
+                store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
                 offset += 4;
             }
         }
 
         // set the variance
-        switch(DL)
+        out = reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(id.x(), 1)));
+        float32x4_t var;
+        if(_info.variances().size() == 1)
         {
-            case DataLayout::NCHW:
-            {
-                out = reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(id.x(), 1)));
-                float32x4_t var;
-                if(_info.variances().size() == 1)
-                {
-                    var = vdupq_n_f32(_info.variances().at(0));
-                }
-                else
-                {
-                    const float32x4_t vars = { _info.variances().at(0), _info.variances().at(1), _info.variances().at(2), _info.variances().at(3) };
-                    var                    = vars;
-                }
-                for(int i = 0; i < num_priors; ++i)
-                {
-                    vst1q_f32(out + 4 * i, var);
-                }
-            }
-            break;
-            case DataLayout::NHWC:
-            {
-                for(int i = 0; i < num_priors; ++i)
-                {
-                    const int  prior_offset = 4 * i;
-                    const bool single_var   = _info.variances().size() == 1;
-                    *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(0, id.y() + prior_offset + 0, 1)))) = _info.variances().at(0);
-                    *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(0, id.y() + prior_offset + 1, 1)))) = single_var ? _info.variances().at(0) : _info.variances().at(1);
-                    *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(0, id.y() + prior_offset + 2, 1)))) = single_var ? _info.variances().at(0) : _info.variances().at(2);
-                    *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(0, id.y() + prior_offset + 3, 1)))) = single_var ? _info.variances().at(0) : _info.variances().at(3);
-                }
-            }
-            break;
-            default:
-                ARM_COMPUTE_ERROR("Not implemented");
+            var = vdupq_n_f32(_info.variances().at(0));
         }
-
+        else
+        {
+            const float32x4_t vars = { _info.variances().at(0), _info.variances().at(1), _info.variances().at(2), _info.variances().at(3) };
+            var                    = vars;
+        }
+        for(int i = 0; i < num_priors; ++i)
+        {
+            vst1q_f32(out + 4 * i, var);
+        }
     },
     output);
 }
@@ -314,22 +220,6 @@
     _info   = info;
     _output = output;
 
-    switch(input1->info()->data_layout())
-    {
-        case DataLayout::NCHW:
-        {
-            _func = &NEPriorBoxLayerKernel::calculate_prior_boxes<DataLayout::NCHW>;
-            break;
-        }
-        case DataLayout::NHWC:
-        {
-            _func = &NEPriorBoxLayerKernel::calculate_prior_boxes<DataLayout::NHWC>;
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Not implemented.");
-    }
-
     // Configure kernel window
     auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info(), info);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
@@ -350,9 +240,8 @@
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
 
     // Run function
-    (this->*_func)(window);
+    calculate_prior_boxes(window);
 }
 } // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
index 4d908db..b8d20f6 100644
--- a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #include "arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h"
 
 #include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
@@ -35,22 +36,36 @@
 #include <cfloat>
 #include <cmath>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 NEROIPoolingLayerKernel::NEROIPoolingLayerKernel()
     : _input(nullptr), _rois(nullptr), _output(nullptr), _pool_info(0, 0, 0.f)
 {
 }
 
-void NEROIPoolingLayerKernel::configure(const ITensor *input, const IROIArray *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info)
+void NEROIPoolingLayerKernel::configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, rois, output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
-    ARM_COMPUTE_ERROR_ON(rois->num_values() == 0);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, rois);
 
-    // Output auto inizialitation if not yet initialized
-    TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), rois->num_values());
+    // Validate arguments
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input->info(), rois->info(), output->info());
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rois, 1, DataType::U16);
+    ARM_COMPUTE_ERROR_ON(rois->info()->dimension(0) != 5);
+    ARM_COMPUTE_ERROR_ON(rois->info()->num_dimensions() > 2);
+    ARM_COMPUTE_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
+    ARM_COMPUTE_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
+
+    if(output->info()->total_size() != 0)
+    {
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) || (output->info()->dimension(1) != pool_info.pooled_height()));
+        ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(2));
+        ARM_COMPUTE_ERROR_ON(rois->info()->dimension(1) != output->info()->dimension(3));
+    }
+
+    // Output auto initialization if not yet initialized
+    TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), rois->info()->dimension(1));
     auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
 
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -64,7 +79,7 @@
 
     // Configure kernel window
     Window window;
-    window.set(Window::DimX, Window::Dimension(0, rois->num_values()));
+    window.set(Window::DimX, Window::Dimension(0, rois->info()->dimension(1)));
     window.set(Window::DimY, Window::Dimension(0, 1));
 
     AccessWindowStatic input_access(input->info(),
@@ -85,6 +100,8 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
+    const size_t values_per_roi = _rois->info()->dimension(0);
+
     const int   roi_list_start = window.x().start();
     const int   roi_list_end   = window.x().end();
     const int   width          = _input->info()->dimension(Window::DimX);
@@ -94,16 +111,21 @@
     const int   pooled_h       = _pool_info.pooled_height();
     const float spatial_scale  = _pool_info.spatial_scale();
 
+    const auto *rois_ptr = reinterpret_cast<const uint16_t *>(_rois->buffer());
+
     for(int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx)
     {
-        const ROI &curr_roi = _rois->at(roi_indx);
+        const unsigned int roi_batch = rois_ptr[values_per_roi * roi_indx];
+        const auto         x1        = rois_ptr[values_per_roi * roi_indx + 1];
+        const auto         y1        = rois_ptr[values_per_roi * roi_indx + 2];
+        const auto         x2        = rois_ptr[values_per_roi * roi_indx + 3];
+        const auto         y2        = rois_ptr[values_per_roi * roi_indx + 4];
 
         // Scale ROI
-        const int roi_batch    = curr_roi.batch_idx;
-        const int roi_anchor_x = support::cpp11::round(curr_roi.rect.x * spatial_scale);
-        const int roi_anchor_y = support::cpp11::round(curr_roi.rect.y * spatial_scale);
-        const int roi_width    = std::max(support::cpp11::round(curr_roi.rect.width * spatial_scale), 1.f);
-        const int roi_height   = std::max(support::cpp11::round(curr_roi.rect.height * spatial_scale), 1.f);
+        const int roi_anchor_x = support::cpp11::round(x1 * spatial_scale);
+        const int roi_anchor_y = support::cpp11::round(y1 * spatial_scale);
+        const int roi_width    = std::max(support::cpp11::round((x2 - x1) * spatial_scale), 1.f);
+        const int roi_height   = std::max(support::cpp11::round((y2 - y1) * spatial_scale), 1.f);
 
         // Iterate through all feature maps
         for(int fm = 0; fm < fms; ++fm)
@@ -146,3 +168,4 @@
         }
     }
 }
+} // namespace arm_compute
\ No newline at end of file
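The reworked ROI pooling kernel above takes its regions from a plain tensor instead of an IROIArray: a 2D U16 tensor of shape [5, num_rois] whose columns hold { batch_index, x1, y1, x2, y2 } in input pixel coordinates, read in run() as rois_ptr[values_per_roi * roi_indx + k]. A minimal sketch of such a buffer, with purely illustrative values:

    #include <cstdint>

    // Two ROIs laid out the way run() reads them: five uint16_t values per region.
    const uint16_t rois_data[2 * 5] = {
        0, 10, 12, 50, 40, // ROI 0: batch 0, top-left (10, 12), bottom-right (50, 40)
        1,  4,  4, 36, 28, // ROI 1: batch 1, top-left (4, 4),   bottom-right (36, 28)
    };
    // The kernel then scales each box and clamps its extent to at least one element:
    //   roi_width  = max(round((x2 - x1) * spatial_scale), 1)
    //   roi_height = max(round((y2 - y1) * spatial_scale), 1)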
diff --git a/src/core/NEON/kernels/NERangeKernel.cpp b/src/core/NEON/kernels/NERangeKernel.cpp
new file mode 100644
index 0000000..189e77f
--- /dev/null
+++ b/src/core/NEON/kernels/NERangeKernel.cpp
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NERangeKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include "arm_compute/core/Utils.h"
+
+namespace arm_compute
+{
+namespace
+{
+template <typename T>
+void range_function(ITensor *output, float start, float step, const Window &window)
+{
+    const unsigned int num_elems_processed_per_iteration = 16 / sizeof(T);
+    /** NEON vector tag type. */
+    using ExactTagType = typename wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>::tag_type;
+
+    const auto step_vec  = wrapper::vdup_n(static_cast<T>(step), ExactTagType{});
+    const auto start_vec = wrapper::vdup_n(static_cast<T>(start), ExactTagType{});
+    auto       id_vec    = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+
+    Iterator output_it(output, window);
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        for(unsigned int count = 0; count < num_elems_processed_per_iteration; ++count)
+        {
+            id_vec = wrapper::vsetlane(static_cast<T>(id.x() + count), id_vec, count);
+        }
+        // start + step * id
+        const auto res_vec = wrapper::vmla(start_vec, id_vec, step_vec);
+        const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
+        wrapper::vstore(out_ptr, res_vec);
+    },
+    output_it);
+}
+
+Status validate_arguments(const ITensorInfo &output, const float start, const float end, const float step)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output,
+                                                         1,
+                                                         DataType::U8, DataType::S8,
+                                                         DataType::U16, DataType::S16,
+                                                         DataType::U32, DataType::S32,
+                                                         DataType::F16, DataType::F32);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((start < end) && (step <= 0)), "step must be greater than 0 when start < end");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((start > end) && (step >= 0)), "step must be less than 0 when start > end");
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output.data_type(), output.quantization_info()), "start value is outside the range of the data type");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output.data_type(), output.quantization_info()), "end value is outside the range of the data type");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output.data_type(), output.quantization_info()), "step value is outside the range of the data type");
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output.num_dimensions() != 1, "Output has to be a 1-D tensor");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output.tensor_shape().total_size() < num_of_elements_in_range(start, end, step), "Output tensor size is incorrect");
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo &output, const float start, const float end, const float step)
+{
+    const unsigned int num_elems_processed_per_iteration = 16 / output.element_size();
+
+    // Auto initialize output if not initialized
+    auto_init_if_empty(output, TensorShape(num_of_elements_in_range(start, end, step)), 1, output.data_type(), output.quantization_info());
+
+    // Configure kernel window
+    Window                 win = calculate_max_window(output, Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration);
+    bool                   window_changed = update_window_and_padding(win, output_access);
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), TensorShape(num_of_elements_in_range(start, end, step))));
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+NERangeKernel::NERangeKernel()
+    : _func(nullptr), _start(0), _end(1), _step(1), _output(nullptr)
+{
+}
+
+void NERangeKernel::configure(ITensor *output, float start, float end, float step)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*(output->info()), start, end, step));
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(*(output->info()), start, end, step);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
+    _start  = start;
+    _end    = end;
+    _step   = step;
+    _output = output;
+    switch(_output->info()->data_type())
+    {
+        case DataType::U8:
+            _func = &range_function<uint8_t>;
+            break;
+        case DataType::U16:
+            _func = &range_function<uint16_t>;
+            break;
+        case DataType::U32:
+            _func = &range_function<uint32_t>;
+            break;
+        case DataType::S8:
+            _func = &range_function<int8_t>;
+            break;
+        case DataType::S16:
+            _func = &range_function<int16_t>;
+            break;
+        case DataType::S32:
+            _func = &range_function<int32_t>;
+            break;
+        case DataType::F32:
+            _func = &range_function<float>;
+            break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+            _func = &range_function<float16_t>;
+            break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        default:
+            ARM_COMPUTE_ERROR("Unsupported data type.");
+            break;
+    }
+
+    INEKernel::configure(win_config.second);
+}
+
+Status NERangeKernel::validate(const ITensorInfo *output, float start, float end, float step)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*output, start, end, step));
+    ARM_COMPUTE_RETURN_ON_ERROR((validate_and_configure_window(*(output->clone()), start, end, step)).first);
+
+    return Status{};
+}
+
+void NERangeKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+    (*_func)(_output, _start, _step, window);
+}
+} // namespace arm_compute
\ No newline at end of file
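NERangeKernel sizes and validates its output through num_of_elements_in_range(start, end, step), whose body is not part of this patch. For the half-open sequence the kernel writes (out[i] = start + i * step), a reasonable definition is the sketch below; the _sketch name is ours and the library's exact implementation may differ:

    #include <cmath>
    #include <cstddef>

    // Number of elements in [start, end) sampled with the given step.
    size_t num_of_elements_in_range_sketch(float start, float end, float step)
    {
        return static_cast<size_t>(std::ceil((end - start) / step));
    }
    // e.g. start = 2, end = 10, step = 3 -> ceil(8 / 3) = 3 elements: 2, 5, 8,
    // matching what range_function stores per lane: start + step * id.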
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
index 182e93d..84cb223 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/core/NEON/kernels/NEReductionOperationKernel.h"
 
+#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
@@ -31,6 +32,7 @@
 #include "arm_compute/core/NEON/NEMath.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
 #include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include <arm_neon.h>
@@ -39,11 +41,232 @@
 {
 namespace
 {
+uint32x4x4_t calculate_index(uint32_t idx, float32x4_t a, float32x4_t b, uint32x4x4_t c, ReductionOperation op, int axis)
+{
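+    // Per lane: take the new index from vec_idx when the candidate value in 'a' beats the
+    // running value in 'b' (smaller for ARG_IDX_MIN, larger for ARG_IDX_MAX); otherwise keep
+    // the previously recorded index in 'c'.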
+    uint32x4_t mask{ 0 };
+    if(op == ReductionOperation::ARG_IDX_MIN)
+    {
+        mask = wrapper::vcgt(b, a);
+    }
+    else
+    {
+        mask = wrapper::vclt(b, a);
+    }
+
+    uint32x4_t vec_idx = { idx, idx + 1, idx + 2, idx + 3 };
+    if(axis != 0)
+    {
+        vec_idx = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+    }
+    uint32x4x4_t res = { { wrapper::vbsl(mask, vec_idx, c.val[0]), 0, 0, 0 } };
+
+    return res;
+}
+
+uint32x4x4_t calculate_index(uint32_t idx, uint8x16_t a, uint8x16_t b, uint32x4x4_t c, ReductionOperation op, int axis)
+{
+    uint32x4x4_t mask{ { 0 } };
+    uint8x16_t   mask_u8{ 0 };
+    if(op == ReductionOperation::ARG_IDX_MIN)
+    {
+        mask_u8 = wrapper::vcgt(b, a);
+    }
+    else
+    {
+        mask_u8 = wrapper::vclt(b, a);
+    }
+    auto wide_u16_1 = wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8)));
+    auto wide_u16_2 = wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8)));
+    mask.val[0]     = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1)));
+    mask.val[1]     = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1)));
+    mask.val[2]     = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2)));
+    mask.val[3]     = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2)));
+
+    uint32x4x4_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 },
+            { idx + 4, idx + 5, idx + 6, idx + 7 },
+            { idx + 8, idx + 9, idx + 10, idx + 11 },
+            { idx + 12, idx + 13, idx + 14, idx + 15 }
+        }
+    };
+    if(axis != 0)
+    {
+        vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+        vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+        vec_idx.val[2] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+        vec_idx.val[3] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+    }
+    uint32x4x4_t res =
+    {
+        {
+            vbslq_u32(mask.val[0], vec_idx.val[0], c.val[0]),
+            vbslq_u32(mask.val[1], vec_idx.val[1], c.val[1]),
+            vbslq_u32(mask.val[2], vec_idx.val[2], c.val[2]),
+            vbslq_u32(mask.val[3], vec_idx.val[3], c.val[3])
+        }
+    };
+
+    return res;
+}
+
+uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float32x4_t vec_res_value, ReductionOperation op)
+{
+    uint32x4_t res_idx_mask{ 0 };
+    uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
+
+    if(op == ReductionOperation::ARG_IDX_MIN)
+    {
+        auto pmin    = wrapper::vpmin(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+        pmin         = wrapper::vpmin(pmin, pmin);
+        auto mask    = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin));
+        res_idx_mask = wrapper::vand(vec_res_idx.val[0], mask);
+    }
+    else
+    {
+        auto pmax    = wrapper::vpmax(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+        pmax         = wrapper::vpmax(pmax, pmax);
+        auto mask    = vceqq_f32(vec_res_value, wrapper::vcombine(pmax, pmax));
+        res_idx_mask = wrapper::vand(vec_res_idx.val[0], mask);
+    }
+
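+    // res_idx_mask holds the lane index where the extremum was found and 0 elsewhere. Adding
+    // 0xFFFFFFFF maps non-matching lanes to 0xFFFFFFFF and matching lanes to (index - 1), so
+    // the pairwise minimum selects the smallest matching index; the final subtraction of
+    // 0xFFFFFFFF (i.e. +1 modulo 2^32) restores it.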
+    res_idx_mask = wrapper::vadd(res_idx_mask, mask_ones);
+    auto pmin    = wrapper::vpmin(wrapper::vgethigh(res_idx_mask), wrapper::vgetlow(res_idx_mask));
+    pmin         = wrapper::vpmin(pmin, pmin);
+    uint32_t res = wrapper::vgetlane(pmin, 0);
+
+    return (res - 0xFFFFFFFF);
+}
+
+uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, uint8x16_t vec_res_value, ReductionOperation op)
+{
+    uint32x4x4_t res_idx_mask{ { 0 } };
+    uint32x4_t   mask_ones = vdupq_n_u32(0xFFFFFFFF);
+    uint8x16_t   mask_u8{ 0 };
+    if(op == ReductionOperation::ARG_IDX_MIN)
+    {
+        auto pmin = wrapper::vpmin(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+        pmin      = wrapper::vpmin(pmin, pmin);
+        pmin      = wrapper::vpmin(pmin, pmin);
+        pmin      = wrapper::vpmin(pmin, pmin);
+        mask_u8   = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin));
+    }
+    else
+    {
+        auto pmax = wrapper::vpmax(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+        pmax      = wrapper::vpmax(pmax, pmax);
+        pmax      = wrapper::vpmax(pmax, pmax);
+        pmax      = wrapper::vpmax(pmax, pmax);
+        mask_u8   = wrapper::vceq(vec_res_value, wrapper::vcombine(pmax, pmax));
+    }
+
+    // Widen vectors
+    auto wide_u16_1     = wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8)));
+    auto wide_u16_2     = wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8)));
+    auto wide_u32_1     = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1)));
+    auto wide_u32_2     = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1)));
+    auto wide_u32_3     = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2)));
+    auto wide_u32_4     = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2)));
+    res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1);
+    res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2);
+    res_idx_mask.val[2] = wrapper::vand(vec_res_idx.val[2], wide_u32_3);
+    res_idx_mask.val[3] = wrapper::vand(vec_res_idx.val[3], wide_u32_4);
+    res_idx_mask.val[0] = wrapper::vadd(res_idx_mask.val[0], mask_ones);
+    res_idx_mask.val[1] = wrapper::vadd(res_idx_mask.val[1], mask_ones);
+    res_idx_mask.val[2] = wrapper::vadd(res_idx_mask.val[2], mask_ones);
+    res_idx_mask.val[3] = wrapper::vadd(res_idx_mask.val[3], mask_ones);
+
+    uint32_t res  = 0xFFFFFFFF;
+    int      iter = 0;
+    do
+    {
+        auto pmin = wrapper::vpmin(wrapper::vgethigh(res_idx_mask.val[iter]), wrapper::vgetlow(res_idx_mask.val[iter]));
+        pmin      = wrapper::vpmin(pmin, pmin);
+        res       = std::min(wrapper::vgetlane(pmin, 0), res);
+        iter++;
+    }
+    while(iter < 4);
+
+    return (res - 0xFFFFFFFF);
+}
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+uint32x4x4_t calculate_index(uint32_t idx, float16x8_t a, float16x8_t b, uint32x4x4_t c, ReductionOperation op, int axis)
+{
+    uint32x4x2_t mask{ 0 };
+    uint16x8_t   mask_u16{ 0 };
+    if(op == ReductionOperation::ARG_IDX_MIN)
+    {
+        mask_u16 = wrapper::vcgt(b, a);
+    }
+    else
+    {
+        mask_u16 = wrapper::vclt(b, a);
+    }
+    mask.val[0]          = wrapper::vmovl(wrapper::vgetlow(mask_u16));
+    mask.val[1]          = wrapper::vmovl(wrapper::vgethigh(mask_u16));
+    uint32x4x2_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 },
+            { idx + 4, idx + 5, idx + 6, idx + 7 }
+        }
+    };
+    if(axis != 0)
+    {
+        vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+        vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+    }
+    uint32x4x4_t res = { wrapper::vbsl(mask.val[0], vec_idx.val[0], c.val[0]),
+                         wrapper::vbsl(mask.val[1], vec_idx.val[1], c.val[1]),
+                         0, 0
+                       };
+
+    return res;
+}
+
+uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_value, ReductionOperation op)
+{
+    uint32x4x2_t res_idx_mask{ 0 };
+    uint32x4_t   mask_ones = vdupq_n_u32(0xFFFFFFFF);
+    uint16x8_t   mask_u16;
+    if(op == ReductionOperation::ARG_IDX_MIN)
+    {
+        auto pmin = wrapper::vpmin(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+        pmin      = wrapper::vpmin(pmin, pmin);
+        pmin      = wrapper::vpmin(pmin, pmin);
+        mask_u16  = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin));
+    }
+    else
+    {
+        auto pmax = wrapper::vpmax(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+        pmax      = wrapper::vpmax(pmax, pmax);
+        pmax      = wrapper::vpmax(pmax, pmax);
+        mask_u16  = wrapper::vceq(vec_res_value, wrapper::vcombine(pmax, pmax));
+    }
+
+    // Widen vectors
+    auto wide_u32_1     = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(mask_u16), 8), wrapper::vmovl(wrapper::vgetlow(mask_u16)));
+    auto wide_u32_2     = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(mask_u16), 8), wrapper::vmovl(wrapper::vgethigh(mask_u16)));
+    res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1);
+    res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2);
+    res_idx_mask.val[0] = wrapper::vadd(res_idx_mask.val[0], mask_ones);
+    res_idx_mask.val[1] = wrapper::vadd(res_idx_mask.val[1], mask_ones);
+
+    uint32_t res  = 0xFFFFFFFF;
+    int      iter = 0;
+    do
+    {
+        auto pmin = wrapper::vpmin(wrapper::vgethigh(res_idx_mask.val[iter]), wrapper::vgetlow(res_idx_mask.val[iter]));
+        pmin      = wrapper::vpmin(pmin, pmin);
+        res       = std::min(wrapper::vgetlane(pmin, 0), res);
+        iter++;
+    }
+    while(iter < 2);
+
+    return (res - 0xFFFFFFFF);
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
 template <class F>
 class Reducer
 {
 public:
-    static void reduceX(const Window &window, const ITensor *input, ITensor *output, F f)
+    static void reduceX(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op)
     {
         // Set out window
         Window out_window(window);
@@ -58,51 +281,55 @@
             Iterator in(input, in_slice);
             Iterator out(output, out_slice);
 
-            f(in, out, in_slice, out_slice, *input->info());
+            f(in, out, in_slice, out_slice, *input->info(), op);
         }
         while(window.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice));
     }
-    static void reduceY(const Window &window, const ITensor *input, ITensor *output, F f)
+    static void reduceY(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op)
     {
         // Set in window
         Window in_window(window);
+        Window out_window(window);
 
         in_window.set(Window::DimY, Window::Dimension(0, 1, 1));
+        out_window.set(Window::DimY, Window::Dimension(0, output->info()->dimension(1), output->info()->dimension(1)));
 
         // Get first input and output slices
         Window in_slice  = in_window.first_slice_window_2D();
-        Window out_slice = window.first_slice_window_2D();
+        Window out_slice = out_window.first_slice_window_2D();
 
         do
         {
             Iterator in(input, in_slice);
             Iterator out(output, out_slice);
 
-            f(in, out, in_slice, out_slice, *input->info(), 1);
+            f(in, out, in_slice, out_slice, *input->info(), 1, op);
         }
-        while(in_window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+        while(in_window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice));
     }
-    static void reduceZ(const Window &window, const ITensor *input, ITensor *output, F f)
+    static void reduceZ(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op)
     {
         // Set in window
         Window in_window(window);
+        Window out_window(window);
 
         in_window.set(Window::DimZ, Window::Dimension(0, 1, 1));
+        out_window.set(Window::DimZ, Window::Dimension(0, output->info()->dimension(2), output->info()->dimension(2)));
 
         // Get first input and output slices
         Window in_slice  = in_window.first_slice_window_3D();
-        Window out_slice = window.first_slice_window_3D();
+        Window out_slice = out_window.first_slice_window_3D();
 
         do
         {
             Iterator in(input, in_slice);
             Iterator out(output, out_slice);
 
-            f(in, out, in_slice, out_slice, *input->info(), 2);
+            f(in, out, in_slice, out_slice, *input->info(), 2, op);
         }
-        while(in_window.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice));
+        while(in_window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_3D(out_slice));
     }
-    static void reduceW(const Window &window, const ITensor *input, ITensor *output, F f)
+    static void reduceW(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op)
     {
         // Set in/out window
         Window in_window(window);
@@ -120,115 +347,278 @@
             Iterator in(input, in_slice);
             Iterator out(output, out_slice);
 
-            f(in, out, in_slice, out_slice, *input->info(), 3);
+            f(in, out, in_slice, out_slice, *input->info(), 3, op);
         }
         while(in_window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_4D(out_slice));
     }
 };
 
-template <typename T, int S, ReductionOperation op>
+template <typename T, int S>
 struct RedOpX
 {
     /** NEON vector tag type. */
     using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
 
-    inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info)
+    inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, const ReductionOperation op)
     {
         ARM_COMPUTE_UNUSED(out_slice);
-        auto vec_sum_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+        auto init_res_value = static_cast<T>(0.f);
+        if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN)
+        {
+            init_res_value = *reinterpret_cast<T *>(input.ptr());
+        }
+        else if(op == ReductionOperation::PROD)
+        {
+            init_res_value = static_cast<T>(1.f);
+        }
+        auto         vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{});
+        uint32x4x4_t vec_res_idx{ { 0 } };
 
         execute_window_loop(in_slice, [&](const Coordinates & id)
         {
             const auto in_ptr       = reinterpret_cast<const T *>(input.ptr());
             const auto vec_elements = wrapper::vloadq(in_ptr);
 
-            if(op == ReductionOperation::SUM_SQUARE)
+            switch(op)
             {
-                vec_sum_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_sum_value);
-            }
-            else
-            {
-                vec_sum_value = wrapper::vadd(vec_elements, vec_sum_value);
+                case ReductionOperation::SUM_SQUARE:
+                    vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value);
+                    break;
+                case ReductionOperation::MEAN_SUM:
+                case ReductionOperation::SUM:
+                    vec_res_value = wrapper::vadd(vec_elements, vec_res_value);
+                    break;
+                case ReductionOperation::PROD:
+                    vec_res_value = wrapper::vmul(vec_elements, vec_res_value);
+                    break;
+                case ReductionOperation::ARG_IDX_MIN:
+                {
+                    auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+                    vec_res_idx             = calculate_index(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
+                    vec_res_value           = temp_vec_res_value;
+                    break;
+                }
+                case ReductionOperation::ARG_IDX_MAX:
+                {
+                    auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+                    vec_res_idx             = calculate_index(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
+                    vec_res_value           = temp_vec_res_value;
+                    break;
+                }
+                default:
+                    ARM_COMPUTE_ERROR("Not supported");
             }
         },
         input);
 
-        auto carry_addition = wrapper::vpadd(wrapper::vgethigh(vec_sum_value), wrapper::vgetlow(vec_sum_value));
-        for(int i = 0; i < S / 4; ++i)
+        switch(op)
         {
-            carry_addition = wrapper::vpadd(carry_addition, carry_addition);
-        }
+            case ReductionOperation::SUM:
+            case ReductionOperation::SUM_SQUARE:
+            case ReductionOperation::MEAN_SUM:
+            {
+                auto carry_res = wrapper::vpadd(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+                for(int i = 0; i < S / 4; ++i)
+                {
+                    carry_res = wrapper::vpadd(carry_res, carry_res);
+                }
+                auto res = wrapper::vgetlane(carry_res, 0);
 
-        auto res = wrapper::vgetlane(carry_addition, 0);
-        if(op == ReductionOperation::MEAN_SUM)
-        {
-            res /= in_info.dimension(0);
-        }
+                if(op == ReductionOperation::MEAN_SUM)
+                {
+                    res /= in_info.dimension(0);
+                }
 
-        *(reinterpret_cast<T *>(output.ptr())) = res;
+                *(reinterpret_cast<T *>(output.ptr())) = res;
+                break;
+            }
+            case ReductionOperation::PROD:
+            {
+                auto carry_res = wrapper::vmul(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+                T    res       = 1;
+                for(int i = 0; i < S / 2; ++i)
+                {
+                    res *= wrapper::vgetlane(carry_res, i);
+                }
+                *(reinterpret_cast<T *>(output.ptr())) = res;
+                break;
+            }
+            case ReductionOperation::ARG_IDX_MIN:
+            case ReductionOperation::ARG_IDX_MAX:
+            {
+                auto res                                      = calculate_vector_index(vec_res_idx, vec_res_value, op);
+                *(reinterpret_cast<uint32_t *>(output.ptr())) = res;
+                break;
+            }
+            default:
+                ARM_COMPUTE_ERROR("Not supported");
+        }
     }
 };
 
-template <ReductionOperation op>
 struct RedOpX_qasymm8
 {
-    inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info)
+    inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, const ReductionOperation op)
     {
         ARM_COMPUTE_UNUSED(out_slice);
-        auto vec_sum_value1 = vdupq_n_u32(static_cast<uint32_t>(0.f));
-        auto vec_sum_value2 = vdupq_n_u32(static_cast<uint32_t>(0.f));
-        auto vec_sum_value3 = vdupq_n_u32(static_cast<uint32_t>(0.f));
-        auto vec_sum_value4 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+        auto vec_res_value1 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+        auto vec_res_value2 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+        auto vec_res_value3 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+        auto vec_res_value4 = vdupq_n_u32(static_cast<uint32_t>(0.f));
 
+        auto vec_res_value1_f = vdupq_n_f32(static_cast<float>(1.f));
+        auto vec_res_value2_f = vdupq_n_f32(static_cast<float>(1.f));
+        auto vec_res_value3_f = vdupq_n_f32(static_cast<float>(1.f));
+        auto vec_res_value4_f = vdupq_n_f32(static_cast<float>(1.f));
+
+        uint8x16_t vec_res_value = { 0 };
+
+        if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN)
+        {
+            vec_res_value = wrapper::vdup_n(*input.ptr(), wrapper::traits::vector_128_tag{});
+        }
+
+        uint32x4x4_t vec_res_idx{ { 0 } };
         execute_window_loop(in_slice, [&](const Coordinates & id)
         {
             const auto vec_elements = wrapper::vloadq(input.ptr());
+            switch(op)
+            {
+                case ReductionOperation::SUM:
+                case ReductionOperation::MEAN_SUM:
+                {
+                    const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
+                    const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
 
-            const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
-            const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
+                    const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
+                    const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
+                    const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
+                    const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
 
-            const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
-            const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
-            const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
-            const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
+                    vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1);
+                    vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2);
+                    vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3);
+                    vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4);
+                    break;
+                }
+                case ReductionOperation::PROD:
+                {
+                    const auto offset32x4f_4 = vdupq_n_f32(in_info.quantization_info().offset);
+                    const auto scale32x4f_4  = vdupq_n_f32(in_info.quantization_info().scale);
 
-            vec_sum_value1 = wrapper::vadd(temp32x4t_1, vec_sum_value1);
-            vec_sum_value2 = wrapper::vadd(temp32x4t_2, vec_sum_value2);
-            vec_sum_value3 = wrapper::vadd(temp32x4t_3, vec_sum_value3);
-            vec_sum_value4 = wrapper::vadd(temp32x4t_4, vec_sum_value4);
+                    const auto temp16x8t_1 = vmovl_u8(vget_low_u8(vec_elements));
+                    const auto temp16x8t_2 = vmovl_u8(vget_high_u8(vec_elements));
+
+                    const auto temp32x4t_1 = vmovl_u16(vget_low_u16(temp16x8t_1));
+                    const auto temp32x4t_2 = vmovl_u16(vget_high_u16(temp16x8t_1));
+                    const auto temp32x4t_3 = vmovl_u16(vget_low_u16(temp16x8t_2));
+                    const auto temp32x4t_4 = vmovl_u16(vget_high_u16(temp16x8t_2));
+
+                    auto temp32x4f_1 = vcvtq_f32_u32(temp32x4t_1);
+                    auto temp32x4f_2 = vcvtq_f32_u32(temp32x4t_2);
+                    auto temp32x4f_3 = vcvtq_f32_u32(temp32x4t_3);
+                    auto temp32x4f_4 = vcvtq_f32_u32(temp32x4t_4);
+
+                    // De-quantize vec_elements
+                    temp32x4f_1 = vmulq_f32(vsubq_f32(temp32x4f_1, offset32x4f_4), scale32x4f_4);
+                    temp32x4f_2 = vmulq_f32(vsubq_f32(temp32x4f_2, offset32x4f_4), scale32x4f_4);
+                    temp32x4f_3 = vmulq_f32(vsubq_f32(temp32x4f_3, offset32x4f_4), scale32x4f_4);
+                    temp32x4f_4 = vmulq_f32(vsubq_f32(temp32x4f_4, offset32x4f_4), scale32x4f_4);
+
+                    vec_res_value1_f = vmulq_f32(temp32x4f_1, vec_res_value1_f);
+                    vec_res_value2_f = vmulq_f32(temp32x4f_2, vec_res_value2_f);
+                    vec_res_value3_f = vmulq_f32(temp32x4f_3, vec_res_value3_f);
+                    vec_res_value4_f = vmulq_f32(temp32x4f_4, vec_res_value4_f);
+                    break;
+                }
+                case ReductionOperation::ARG_IDX_MIN:
+                {
+                    auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+                    vec_res_idx             = calculate_index(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
+                    vec_res_value           = temp_vec_res_value;
+                    break;
+                }
+                case ReductionOperation::ARG_IDX_MAX:
+                {
+                    auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+                    vec_res_idx             = calculate_index(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
+                    vec_res_value           = temp_vec_res_value;
+                    break;
+                }
+                default:
+                    ARM_COMPUTE_ERROR("Not supported");
+            }
         },
         input);
 
-        auto carry_addition = wrapper::vadd(vec_sum_value1, vec_sum_value2);
-        carry_addition      = wrapper::vadd(carry_addition, vec_sum_value3);
-        carry_addition      = wrapper::vadd(carry_addition, vec_sum_value4);
-
-        auto carry_paddition = wrapper::vpadd(wrapper::vgethigh(carry_addition), wrapper::vgetlow(carry_addition));
-        carry_paddition      = wrapper::vpadd(carry_paddition, carry_paddition);
-        auto res             = wrapper::vgetlane(carry_paddition, 0);
-
-        if(op == ReductionOperation::MEAN_SUM)
+        if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX)
         {
-            res /= in_info.dimension(0);
+            auto res                                      = calculate_vector_index(vec_res_idx, vec_res_value, op);
+            *(reinterpret_cast<uint32_t *>(output.ptr())) = res;
         }
+        else if(op == ReductionOperation::PROD)
+        {
+            auto carry_res = wrapper::vmul(vec_res_value1_f, vec_res_value2_f);
+            carry_res      = wrapper::vmul(carry_res, vec_res_value3_f);
+            carry_res      = wrapper::vmul(carry_res, vec_res_value4_f);
 
-        *(output.ptr()) = static_cast<uint8_t>(res);
+            float res = wrapper::vgetlane(carry_res, 0);
+            res *= wrapper::vgetlane(carry_res, 1);
+            res *= wrapper::vgetlane(carry_res, 2);
+            res *= wrapper::vgetlane(carry_res, 3);
+
+            //re-quantize result
+            res             = sqcvt_qasymm8_f32(res, in_info.quantization_info().scale, in_info.quantization_info().offset);
+            *(output.ptr()) = static_cast<uint8_t>(res);
+        }
+        else
+        {
+            auto carry_res = wrapper::vadd(vec_res_value1, vec_res_value2);
+            carry_res      = wrapper::vadd(carry_res, vec_res_value3);
+            carry_res      = wrapper::vadd(carry_res, vec_res_value4);
+
+            auto carry_paddition = wrapper::vpadd(wrapper::vgethigh(carry_res), wrapper::vgetlow(carry_res));
+            carry_paddition      = wrapper::vpadd(carry_paddition, carry_paddition);
+            auto res             = wrapper::vgetlane(carry_paddition, 0);
+
+            if(op == ReductionOperation::MEAN_SUM)
+            {
+                res /= in_info.dimension(0);
+            }
+
+            *(output.ptr()) = static_cast<uint8_t>(res);
+        }
     }
 };
 
-template <typename T, int S, ReductionOperation op>
+template <typename T, int S>
 struct RedOpYZW
 {
     /** NEON vector tag type. */
     using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+    using neon_vector  = typename wrapper::traits::neon_vector<T, S>::type;
 
-    inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int axis)
+    inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int axis, const ReductionOperation op)
     {
         ARM_COMPUTE_UNUSED(out_slice);
 
         execute_window_loop(in_slice, [&](const Coordinates & id)
         {
-            auto vec_sum_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+            neon_vector vec_res_value = { 0 };
+            if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN)
+            {
+                vec_res_value = wrapper::vloadq(reinterpret_cast<T *>(input.ptr()));
+            }
+            else if(op == ReductionOperation::PROD)
+            {
+                vec_res_value = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{});
+            }
+            else
+            {
+                vec_res_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+            }
+            uint32x4x4_t vec_res_idx{ { 0 } };
+
             for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
             {
                 T *in_ptr;
@@ -248,159 +638,215 @@
                 }
                 const auto vec_elements = wrapper::vloadq(in_ptr);
 
-                if(op == ReductionOperation::SUM_SQUARE)
+                switch(op)
                 {
-                    vec_sum_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_sum_value);
-                }
-                else
-                {
-                    vec_sum_value = wrapper::vadd(vec_elements, vec_sum_value);
+                    case ReductionOperation::SUM:
+                    case ReductionOperation::MEAN_SUM:
+                        vec_res_value = wrapper::vadd(vec_elements, vec_res_value);
+                        break;
+                    case ReductionOperation::SUM_SQUARE:
+                        vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value);
+                        break;
+                    case ReductionOperation::PROD:
+                        vec_res_value = wrapper::vmul(vec_elements, vec_res_value);
+                        break;
+                    case ReductionOperation::ARG_IDX_MIN:
+                    {
+                        auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+                        vec_res_idx             = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
+                        vec_res_value           = temp_vec_res_value;
+                        break;
+                    }
+                    case ReductionOperation::ARG_IDX_MAX:
+                    {
+                        auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+                        vec_res_idx             = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
+                        vec_res_value           = temp_vec_res_value;
+                        break;
+                    }
+                    default:
+                        ARM_COMPUTE_ERROR("Not supported");
                 }
             }
 
             if(op == ReductionOperation::MEAN_SUM)
             {
                 auto vec_width_inv = wrapper::vinv(wrapper::vdup_n(static_cast<T>(in_info.dimension(axis)), ExactTagType{}));
-                vec_sum_value      = wrapper::vmul(vec_sum_value, vec_width_inv);
+                vec_res_value      = wrapper::vmul(vec_res_value, vec_width_inv);
             }
 
-            wrapper::vstore(reinterpret_cast<T *>(output.ptr()), vec_sum_value);
+            if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX)
+            {
+                wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()), vec_res_idx.val[0]);
+            }
+            else
+            {
+                wrapper::vstore(reinterpret_cast<T *>(output.ptr()), vec_res_value);
+            }
         },
         input, output);
     }
 };
 
-template <ReductionOperation op>
 struct RedOpYZW_qasymm8
 {
-    inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int axis)
+    inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int axis, const ReductionOperation op)
     {
         ARM_COMPUTE_UNUSED(out_slice);
 
         execute_window_loop(in_slice, [&](const Coordinates & id)
         {
-            auto vec_sum_value1 = vdupq_n_u32(static_cast<uint32_t>(0.f));
-            auto vec_sum_value2 = vdupq_n_u32(static_cast<uint32_t>(0.f));
-            auto vec_sum_value3 = vdupq_n_u32(static_cast<uint32_t>(0.f));
-            auto vec_sum_value4 = vdupq_n_u32(static_cast<uint32_t>(0.f));
-            for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+            uint32x4x4_t vec_res_idx{ { 0 } };
+            auto         vec_res_value1 = vdupq_n_u32(0);
+            auto         vec_res_value2 = vdupq_n_u32(0);
+            auto         vec_res_value3 = vdupq_n_u32(0);
+            auto         vec_res_value4 = vdupq_n_u32(0);
+
+            auto vec_res_value1_f = vdupq_n_f32(1);
+            auto vec_res_value2_f = vdupq_n_f32(1);
+            auto vec_res_value3_f = vdupq_n_f32(1);
+            auto vec_res_value4_f = vdupq_n_f32(1);
+
+            auto vec_res_value = wrapper::vloadq(input.ptr());
+
+            for(unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim)
             {
                 uint8_t *in_ptr;
                 switch(axis)
                 {
                     case 1:
-                        in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, dim));
+                        in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, index_dim));
                         break;
                     case 2:
-                        in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, dim));
+                        in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, index_dim));
                         break;
                     case 3:
-                        in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, 0, dim));
+                        in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, 0, index_dim));
                         break;
                     default:
                         ARM_COMPUTE_ERROR("Not supported");
                 }
                 const auto vec_elements = wrapper::vloadq(in_ptr);
 
-                const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
-                const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
+                switch(op)
+                {
+                    case ReductionOperation::SUM:
+                    case ReductionOperation::MEAN_SUM:
+                    {
+                        const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
+                        const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
 
-                const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
-                const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
-                const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
-                const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
+                        const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
+                        const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
+                        const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
+                        const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
 
-                vec_sum_value1 = wrapper::vadd(temp32x4t_1, vec_sum_value1);
-                vec_sum_value2 = wrapper::vadd(temp32x4t_2, vec_sum_value2);
-                vec_sum_value3 = wrapper::vadd(temp32x4t_3, vec_sum_value3);
-                vec_sum_value4 = wrapper::vadd(temp32x4t_4, vec_sum_value4);
+                        vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1);
+                        vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2);
+                        vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3);
+                        vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4);
+                        break;
+                    }
+                    case ReductionOperation::PROD:
+                    {
+                        const auto offset32x4f_4 = vdupq_n_f32(in_info.quantization_info().offset);
+                        const auto scale32x4f_4  = vdupq_n_f32(in_info.quantization_info().scale);
+
+                        const auto temp16x8t_1 = vmovl_u8(vget_low_u8(vec_elements));
+                        const auto temp16x8t_2 = vmovl_u8(vget_high_u8(vec_elements));
+
+                        const auto temp32x4t_1 = vmovl_u16(vget_low_u16(temp16x8t_1));
+                        const auto temp32x4t_2 = vmovl_u16(vget_high_u16(temp16x8t_1));
+                        const auto temp32x4t_3 = vmovl_u16(vget_low_u16(temp16x8t_2));
+                        const auto temp32x4t_4 = vmovl_u16(vget_high_u16(temp16x8t_2));
+
+                        auto temp32x4f_1 = vcvtq_f32_u32(temp32x4t_1);
+                        auto temp32x4f_2 = vcvtq_f32_u32(temp32x4t_2);
+                        auto temp32x4f_3 = vcvtq_f32_u32(temp32x4t_3);
+                        auto temp32x4f_4 = vcvtq_f32_u32(temp32x4t_4);
+
+                        //de-quantize vec_elements
+                        temp32x4f_1 = vmulq_f32(vsubq_f32(temp32x4f_1, offset32x4f_4), scale32x4f_4);
+                        temp32x4f_2 = vmulq_f32(vsubq_f32(temp32x4f_2, offset32x4f_4), scale32x4f_4);
+                        temp32x4f_3 = vmulq_f32(vsubq_f32(temp32x4f_3, offset32x4f_4), scale32x4f_4);
+                        temp32x4f_4 = vmulq_f32(vsubq_f32(temp32x4f_4, offset32x4f_4), scale32x4f_4);
+
+                        vec_res_value1_f = vmulq_f32(temp32x4f_1, vec_res_value1_f);
+                        vec_res_value2_f = vmulq_f32(temp32x4f_2, vec_res_value2_f);
+                        vec_res_value3_f = vmulq_f32(temp32x4f_3, vec_res_value3_f);
+                        vec_res_value4_f = vmulq_f32(temp32x4f_4, vec_res_value4_f);
+                        break;
+                    }
+                    case ReductionOperation::ARG_IDX_MIN:
+                    {
+                        auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+                        vec_res_idx             = calculate_index(index_dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
+                        vec_res_value           = temp_vec_res_value;
+                        break;
+                    }
+                    case ReductionOperation::ARG_IDX_MAX:
+                    {
+                        auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+                        vec_res_idx             = calculate_index(index_dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
+                        vec_res_value           = temp_vec_res_value;
+                        break;
+                    }
+                    default:
+                        ARM_COMPUTE_ERROR("Not supported");
+                }
             }
 
             if(op == ReductionOperation::MEAN_SUM)
             {
-                const auto vec_width_inv    = wrapper::vinv(vdupq_n_f32(in_info.dimension(axis)));
-                const auto vec_sum_value1_f = wrapper::vmul(vcvtq_f32_u32(vec_sum_value1), vec_width_inv);
-                const auto vec_sum_value2_f = wrapper::vmul(vcvtq_f32_u32(vec_sum_value2), vec_width_inv);
-                const auto vec_sum_value3_f = wrapper::vmul(vcvtq_f32_u32(vec_sum_value3), vec_width_inv);
-                const auto vec_sum_value4_f = wrapper::vmul(vcvtq_f32_u32(vec_sum_value4), vec_width_inv);
+                const auto vec_width_inv = wrapper::vinv(vdupq_n_f32(in_info.dimension(axis)));
+                vec_res_value1_f         = wrapper::vmul(vcvtq_f32_u32(vec_res_value1), vec_width_inv);
+                vec_res_value2_f         = wrapper::vmul(vcvtq_f32_u32(vec_res_value2), vec_width_inv);
+                vec_res_value3_f         = wrapper::vmul(vcvtq_f32_u32(vec_res_value3), vec_width_inv);
+                vec_res_value4_f         = wrapper::vmul(vcvtq_f32_u32(vec_res_value4), vec_width_inv);
 
-                vec_sum_value1 = vcvtq_u32_f32(vec_sum_value1_f);
-                vec_sum_value2 = vcvtq_u32_f32(vec_sum_value2_f);
-                vec_sum_value3 = vcvtq_u32_f32(vec_sum_value3_f);
-                vec_sum_value4 = vcvtq_u32_f32(vec_sum_value4_f);
+                vec_res_value1 = vcvtq_u32_f32(vec_res_value1_f);
+                vec_res_value2 = vcvtq_u32_f32(vec_res_value2_f);
+                vec_res_value3 = vcvtq_u32_f32(vec_res_value3_f);
+                vec_res_value4 = vcvtq_u32_f32(vec_res_value4_f);
+            }
+            else if(op == ReductionOperation::PROD)
+            {
+                const auto offset32x4f_4 = vdupq_n_f32(in_info.quantization_info().offset);
+                const auto iscale32x4f_4 = vinvq_f32(vdupq_n_f32(in_info.quantization_info().scale));
+
+                //re-quantize
+                vec_res_value1_f = vaddq_f32(vmulq_f32(vec_res_value1_f, iscale32x4f_4), offset32x4f_4);
+                vec_res_value2_f = vaddq_f32(vmulq_f32(vec_res_value2_f, iscale32x4f_4), offset32x4f_4);
+                vec_res_value3_f = vaddq_f32(vmulq_f32(vec_res_value3_f, iscale32x4f_4), offset32x4f_4);
+                vec_res_value4_f = vaddq_f32(vmulq_f32(vec_res_value4_f, iscale32x4f_4), offset32x4f_4);
+
+                vec_res_value1 = vcvtq_u32_f32(vec_res_value1_f);
+                vec_res_value2 = vcvtq_u32_f32(vec_res_value2_f);
+                vec_res_value3 = vcvtq_u32_f32(vec_res_value3_f);
+                vec_res_value4 = vcvtq_u32_f32(vec_res_value4_f);
             }
 
-            const auto temp16x8t_1 = vcombine_u16(wrapper::vqmovn(vec_sum_value1), wrapper::vqmovn(vec_sum_value2));
-            const auto temp16x8t_2 = vcombine_u16(wrapper::vqmovn(vec_sum_value3), wrapper::vqmovn(vec_sum_value4));
-            auto       res         = vcombine_u8(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));
-            wrapper::vstore(output.ptr(), res);
+            if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX)
+            {
+                wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()), vec_res_idx.val[0]);
+                wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + 4, vec_res_idx.val[1]);
+                wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + 8, vec_res_idx.val[2]);
+                wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + 12, vec_res_idx.val[3]);
+            }
+            else
+            {
+                const auto temp16x8t_1 = vcombine_u16(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2));
+                const auto temp16x8t_2 = vcombine_u16(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4));
+                auto       res         = vcombine_u8(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));
+                wrapper::vstore(output.ptr(), res);
+            }
+
         },
         input, output);
     }
 };
 
-void reduce_sumsq(const Window &window, const ITensor *input, ITensor *output, unsigned int axis)
-{
-    switch(axis)
-    {
-        case 0:
-            switch(input->info()->data_type())
-            {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F16:
-                    return Reducer<RedOpX<float16_t, 8, ReductionOperation::SUM_SQUARE>>::reduceX(window, input, output, RedOpX<float16_t, 8, ReductionOperation::SUM_SQUARE>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F32:
-                    return Reducer<RedOpX<float, 4, ReductionOperation::SUM_SQUARE>>::reduceX(window, input, output, RedOpX<float, 4, ReductionOperation::SUM_SQUARE>());
-                case DataType::QASYMM8:
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-            }
-        case 1:
-            switch(input->info()->data_type())
-            {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F16:
-                    return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>>::reduceY(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F32:
-                    return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>>::reduceY(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>());
-                case DataType::QASYMM8:
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-            }
-        case 2:
-            switch(input->info()->data_type())
-            {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F16:
-                    return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F32:
-                    return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>>::reduceZ(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>());
-                case DataType::QASYMM8:
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-            }
-        case 3:
-            switch(input->info()->data_type())
-            {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F16:
-                    return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>>::reduceW(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F32:
-                    return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>>::reduceW(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>());
-                case DataType::QASYMM8:
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-            }
-        default:
-            ARM_COMPUTE_ERROR("Unsupported reduction axis");
-    }
-}
-
-void reduce_sum(const Window &window, const ITensor *input, ITensor *output, unsigned int axis)
+void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsigned int axis, const ReductionOperation op)
 {
     switch(axis)
     {
@@ -408,13 +854,13 @@
             switch(input->info()->data_type())
             {
                 case DataType::QASYMM8:
-                    return Reducer<RedOpX_qasymm8<ReductionOperation::SUM>>::reduceX(window, input, output, RedOpX_qasymm8<ReductionOperation::SUM>());
+                    return Reducer<RedOpX_qasymm8>::reduceX(window, input, output, RedOpX_qasymm8(), op);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F16:
-                    return Reducer<RedOpX<float16_t, 8, ReductionOperation::SUM>>::reduceX(window, input, output, RedOpX<float16_t, 8, ReductionOperation::SUM>());
+                    return Reducer<RedOpX<float16_t, 8>>::reduceX(window, input, output, RedOpX<float16_t, 8>(), op);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F32:
-                    return Reducer<RedOpX<float, 4, ReductionOperation::SUM>>::reduceX(window, input, output, RedOpX<float, 4, ReductionOperation::SUM>());
+                    return Reducer<RedOpX<float, 4>>::reduceX(window, input, output, RedOpX<float, 4>(), op);
                 default:
                     ARM_COMPUTE_ERROR("Not supported");
             }
@@ -422,13 +868,13 @@
             switch(input->info()->data_type())
             {
                 case DataType::QASYMM8:
-                    return Reducer<RedOpYZW_qasymm8<ReductionOperation::SUM>>::reduceY(window, input, output, RedOpYZW_qasymm8<ReductionOperation::SUM>());
+                    return Reducer<RedOpYZW_qasymm8>::reduceY(window, input, output, RedOpYZW_qasymm8(), op);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F16:
-                    return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM>>::reduceY(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM>());
+                    return Reducer<RedOpYZW<float16_t, 8>>::reduceY(window, input, output, RedOpYZW<float16_t, 8>(), op);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F32:
-                    return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM>>::reduceY(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM>());
+                    return Reducer<RedOpYZW<float, 4>>::reduceY(window, input, output, RedOpYZW<float, 4>(), op);
                 default:
                     ARM_COMPUTE_ERROR("Not supported");
             }
@@ -436,13 +882,13 @@
             switch(input->info()->data_type())
             {
                 case DataType::QASYMM8:
-                    return Reducer<RedOpYZW_qasymm8<ReductionOperation::SUM>>::reduceZ(window, input, output, RedOpYZW_qasymm8<ReductionOperation::SUM>());
+                    return Reducer<RedOpYZW_qasymm8>::reduceZ(window, input, output, RedOpYZW_qasymm8(), op);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F16:
-                    return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM>());
+                    return Reducer<RedOpYZW<float16_t, 8>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8>(), op);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F32:
-                    return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM>>::reduceZ(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM>());
+                    return Reducer<RedOpYZW<float, 4>>::reduceZ(window, input, output, RedOpYZW<float, 4>(), op);
                 default:
                     ARM_COMPUTE_ERROR("Not supported");
             }
@@ -450,13 +896,13 @@
             switch(input->info()->data_type())
             {
                 case DataType::QASYMM8:
-                    return Reducer<RedOpYZW_qasymm8<ReductionOperation::SUM>>::reduceW(window, input, output, RedOpYZW_qasymm8<ReductionOperation::SUM>());
+                    return Reducer<RedOpYZW_qasymm8>::reduceW(window, input, output, RedOpYZW_qasymm8(), op);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F16:
-                    return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM>>::reduceW(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM>());
+                    return Reducer<RedOpYZW<float16_t, 8>>::reduceW(window, input, output, RedOpYZW<float16_t, 8>(), op);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F32:
-                    return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM>>::reduceW(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM>());
+                    return Reducer<RedOpYZW<float, 4>>::reduceW(window, input, output, RedOpYZW<float, 4>(), op);
                 default:
                     ARM_COMPUTE_ERROR("Not supported");
             }
@@ -464,84 +910,13 @@
             ARM_COMPUTE_ERROR("Unsupported reduction axis");
     }
 }
-void reduce_mean_sum(const Window &window, const ITensor *input, ITensor *output, unsigned int axis)
-{
-    switch(axis)
-    {
-        case 0:
-            switch(input->info()->data_type())
-            {
-                case DataType::QASYMM8:
-                    return Reducer<RedOpX_qasymm8<ReductionOperation::MEAN_SUM>>::reduceX(window, input, output, RedOpX_qasymm8<ReductionOperation::MEAN_SUM>());
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F16:
-                    return Reducer<RedOpX<float16_t, 8, ReductionOperation::MEAN_SUM>>::reduceX(window, input, output, RedOpX<float16_t, 8, ReductionOperation::MEAN_SUM>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F32:
-                    return Reducer<RedOpX<float, 4, ReductionOperation::MEAN_SUM>>::reduceX(window, input, output, RedOpX<float, 4, ReductionOperation::MEAN_SUM>());
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-            }
-        case 1:
-            switch(input->info()->data_type())
-            {
-                case DataType::QASYMM8:
-                    return Reducer<RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>>::reduceY(window, input, output, RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>());
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F16:
-                    return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>>::reduceY(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F32:
-                    return Reducer<RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>>::reduceY(window, input, output, RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>());
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-            }
-        case 2:
-            switch(input->info()->data_type())
-            {
-                case DataType::QASYMM8:
-                    return Reducer<RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>>::reduceZ(window, input, output, RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>());
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F16:
-                    return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F32:
-                    return Reducer<RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>>::reduceZ(window, input, output, RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>());
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-            }
-        case 3:
-            switch(input->info()->data_type())
-            {
-                case DataType::QASYMM8:
-                    return Reducer<RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>>::reduceW(window, input, output, RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>());
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F16:
-                    return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>>::reduceW(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F32:
-                    return Reducer<RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>>::reduceW(window, input, output, RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>());
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-            }
-        default:
-            ARM_COMPUTE_ERROR("Unsupported reduction axis");
-    }
-}
-
-TensorShape calculate_output_shape(const TensorShape &input_shape, unsigned int axis)
-{
-    TensorShape output_shape{ input_shape };
-    output_shape.set(axis, 1);
-
-    return output_shape;
-}
 
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
 {
     ARM_COMPUTE_UNUSED(op);
 
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
@@ -549,10 +924,19 @@
 
     if(output->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN);
+        if(!is_arg_min_max)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32);
+        }
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
 
-        const TensorShape output_shape         = calculate_output_shape(input->tensor_shape(), axis);
+        const TensorShape output_shape         = arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis);
         const TensorInfo  tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_reshaped);
     }
@@ -560,13 +944,15 @@
     return Status{};
 }
 
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int axis)
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int axis, ReductionOperation op)
 {
     // Calculate output shape and set if empty
-    const TensorShape output_shape = calculate_output_shape(input->tensor_shape(), axis);
+    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis);
 
     // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output, output_shape, 1, input->data_type());
+    const bool is_arg_min_max   = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX);
+    DataType   output_data_type = is_arg_min_max ? DataType::U32 : input->data_type();
+    auto_init_if_empty(*output, output_shape, 1, output_data_type, input->quantization_info());
 
     unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->data_type());
 
@@ -609,7 +995,7 @@
     _reduction_axis = axis;
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis);
+    auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis, op);
 
     ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
 
@@ -619,7 +1005,7 @@
 Status NEReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
-    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), axis)));
+    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), axis, op)));
 
     return Status{};
 }
@@ -630,19 +1016,6 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    switch(_op)
-    {
-        case ReductionOperation::SUM_SQUARE:
-            reduce_sumsq(window, _input, _output, _reduction_axis);
-            break;
-        case ReductionOperation::MEAN_SUM:
-            reduce_mean_sum(window, _input, _output, _reduction_axis);
-            break;
-        case ReductionOperation::SUM:
-            reduce_sum(window, _input, _output, _reduction_axis);
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Unsupported reduction operation.");
-    }
+    reduce_op(window, _input, _output, _reduction_axis, _op);
 }
 } // namespace arm_compute
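For context on the QASYMM8 PROD path above: products cannot be accumulated directly in the quantized domain, so the kernel de-quantizes each lane, multiplies in float, and re-quantizes the final result. Below is a minimal scalar sketch of that round trip, assuming the affine mapping real = scale * (q - offset) used by QuantizationInfo; the helper names (dequantize, requantize, prod_reduce_qasymm8) are illustrative only and not library API.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>

// Affine de-quantization: real = scale * (q - offset)
inline float dequantize(uint8_t q, float scale, int offset)
{
    return scale * (static_cast<int>(q) - offset);
}

// Affine re-quantization: q = round(real / scale) + offset, clamped to [0, 255]
inline uint8_t requantize(float real, float scale, int offset)
{
    const int q = static_cast<int>(std::lround(real / scale)) + offset;
    return static_cast<uint8_t>(std::min(std::max(q, 0), 255));
}

// Scalar reference for a PROD reduction over a quantized row,
// mirroring the float accumulation done by the NEON kernel.
inline uint8_t prod_reduce_qasymm8(const uint8_t *in, size_t len, float scale, int offset)
{
    float acc = 1.f;
    for(size_t i = 0; i < len; ++i)
    {
        acc *= dequantize(in[i], scale, offset);
    }
    return requantize(acc, scale, offset);
}

The vectorised kernel follows the same shape: widen u8 lanes to f32, subtract the offset and scale them, multiply into four float accumulators, then collapse and re-quantize once at the end.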
diff --git a/src/core/NEON/kernels/NEReshapeLayerKernel.cpp b/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
index c718991..649fba3 100644
--- a/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,11 +43,13 @@
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
                                                          DataType::U32, DataType::S32, DataType::F16, DataType::F32);
 
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().total_size() != output->tensor_shape().total_size());
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
 
     return Status{};
 }
diff --git a/src/core/NEON/kernels/NEReverseKernel.cpp b/src/core/NEON/kernels/NEReverseKernel.cpp
new file mode 100644
index 0000000..62e4882
--- /dev/null
+++ b/src/core/NEON/kernels/NEReverseKernel.cpp
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEReverseKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/QAsymm8.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <array>
+#include <cmath>
+#include <map>
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, axis);
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
+                                                         DataType::U16, DataType::S16,
+                                                         DataType::U32, DataType::S32,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(axis, 1, DataType::U32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->num_dimensions() > 1, "Axis must be a 1D tensor");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->dimension(0) > 4, "Only up to 4 dimensions can be reversed");
+
+    // Checks performed when output is configured
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+    }
+
+    return Status{};
+}
+} // namespace
+
+NEReverseKernel::NEReverseKernel()
+    : _input(nullptr), _output(nullptr), _axis(nullptr)
+{
+}
+
+void NEReverseKernel::configure(const ITensor *input, ITensor *output, const ITensor *axis)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, axis);
+
+    _input  = input;
+    _output = output;
+    _axis   = axis;
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), *input->info()->clone());
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis->info()));
+
+    // Configure kernel window
+    INEKernel::configure(calculate_max_window(*output->info()));
+}
+
+Status NEReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis));
+
+    return Status{};
+}
+
+template <typename T>
+void run_reverse(const Window &window, const ITensor *input, const ITensor *axis, ITensor *output)
+{
+    int axis_bit = 0;
+    for(unsigned int i = 0; i < axis->info()->dimension(0); ++i)
+    {
+        const int axis_i = *(reinterpret_cast<const int *>(axis->buffer()) + i);
+        axis_bit |= 1 << axis_i;
+    }
+
+    // Check if we need a left-over loop for the x dimension
+    const int window_step_x            = 16 / input->info()->element_size();
+    const int window_start_x           = window.x().start();
+    const int window_end_x             = std::min(window.x().end(), static_cast<int>(input->info()->dimension(0)));
+    const int window_end_x_multiple_of = ((window_end_x - window_start_x) / window_step_x) * window_step_x;
+    bool      left_over_loop_x         = (((window_end_x - window_start_x) % window_step_x) != 0);
+
+    Window slice = window.first_slice_window_4D();
+
+    if(left_over_loop_x)
+    {
+        // Check if window_end_x_multiple_of is greater than window_start_x
+        if(window_end_x_multiple_of > window_start_x)
+        {
+            slice.set(Window::DimX, Window::Dimension(window_start_x, window_end_x_multiple_of, window_step_x));
+        }
+        else
+        {
+            slice.set(Window::DimX, Window::Dimension(0, 0, 1));
+        }
+    }
+
+    do
+    {
+        Iterator input_it(input, slice);
+        execute_window_loop(slice, [&](const Coordinates & id)
+        {
+            auto in = wrapper::vloadq(reinterpret_cast<T *>(input_it.ptr()));
+
+            // Reverse 0 axis
+            if(axis_bit & 0x1)
+            {
+                in = wrapper::vrev64(in);
+                in = wrapper::vcombine(wrapper::vgethigh(in), wrapper::vgetlow(in));
+            }
+
+            const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - id.x() - window_step_x : id.x();
+            const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y();
+            const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z();
+            const int offset_w = (axis_bit & 0x8) ? output->info()->dimension(3) - id[3] - 1 : id[3];
+
+            auto out_ptr = reinterpret_cast<T *>(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w)));
+            wrapper::vstore(out_ptr, in);
+        },
+        input_it);
+
+        if(left_over_loop_x)
+        {
+            slice.set(Window::DimX, Window::Dimension(window_end_x_multiple_of, window_end_x, 1));
+
+            Iterator input_it(input, slice);
+
+            // Compute left-over elements along the x dimension (element by element)
+            execute_window_loop(slice, [&](const Coordinates & id)
+            {
+                const auto in = *reinterpret_cast<T *>(input_it.ptr());
+
+                const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - id.x() - 1 : id.x();
+                const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y();
+                const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z();
+                const int offset_w = (axis_bit & 0x8) ? output->info()->dimension(3) - id[3] - 1 : id[3];
+
+                *reinterpret_cast<T *>(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w))) = in;
+            },
+            input_it);
+        }
+
+    }
+    while(window.slide_window_slice_4D(slice));
+}
+
+void NEReverseKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+    switch(_input->info()->data_type())
+    {
+        case DataType::F32:
+            run_reverse<float>(window, _input, _axis, _output);
+            break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+            run_reverse<float16_t>(window, _input, _axis, _output);
+            break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::U32:
+            run_reverse<uint32_t>(window, _input, _axis, _output);
+            break;
+        case DataType::S32:
+            run_reverse<int32_t>(window, _input, _axis, _output);
+            break;
+        case DataType::S16:
+            run_reverse<int16_t>(window, _input, _axis, _output);
+            break;
+        case DataType::U16:
+            run_reverse<uint16_t>(window, _input, _axis, _output);
+            break;
+        case DataType::QASYMM8:
+        case DataType::U8:
+            run_reverse<uint8_t>(window, _input, _axis, _output);
+            break;
+        case DataType::S8:
+            run_reverse<int8_t>(window, _input, _axis, _output);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Data type not supported");
+    }
+}
+} // namespace arm_compute
\ No newline at end of file
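A brief sketch of the coordinate mirroring used by run_reverse above: each set bit of axis_bit marks a dimension to flip, and along a flipped dimension the output index is dim_size - 1 - index (the vector path additionally reverses lanes and offsets by the vector width). The standalone helper below is an illustration only, with shape handling simplified and no relation to the library API.

#include <array>
#include <cstddef>

// Mirror a 4D coordinate according to a bitmask of axes to reverse.
// Bit d set in axis_bit means dimension d is flipped: x' = shape[d] - 1 - x.
inline std::array<size_t, 4> reverse_coords(const std::array<size_t, 4> &id,
                                            const std::array<size_t, 4> &shape,
                                            unsigned int axis_bit)
{
    std::array<size_t, 4> out{};
    for(size_t d = 0; d < 4; ++d)
    {
        out[d] = (axis_bit & (1u << d)) ? shape[d] - 1 - id[d] : id[d];
    }
    return out;
}

For example, with shape {8, 4, 2, 1} and axis_bit = 0x3, coordinate {0, 0, 0, 0} maps to {7, 3, 0, 0}, matching the left-over (scalar) loop in the kernel.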
diff --git a/src/core/NEON/kernels/NEScaleKernel.cpp b/src/core/NEON/kernels/NEScaleKernel.cpp
index 7111644..3d300ef 100644
--- a/src/core/NEON/kernels/NEScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEScaleKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #include "arm_compute/core/NEON/kernels/NEScaleKernel.h"
 
 #include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
@@ -46,11 +47,12 @@
                           const ITensorInfo *offsets, ITensorInfo *output, InterpolationPolicy policy,
                           BorderMode border_mode, SamplingPolicy sampling_policy)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32, DataType::QASYMM8);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON(output == input);
-    ARM_COMPUTE_RETURN_ERROR_ON(sampling_policy != SamplingPolicy::CENTER);
+    ARM_COMPUTE_RETURN_ERROR_ON(sampling_policy != SamplingPolicy::CENTER && sampling_policy != SamplingPolicy::TOP_LEFT);
     ARM_COMPUTE_UNUSED(border_mode);
 
     const DataLayout data_layout = input->data_layout();
@@ -72,6 +74,7 @@
     if(policy == InterpolationPolicy::AREA)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(data_layout != DataLayout::NCHW);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
     }
 
     return Status{};
@@ -182,7 +185,7 @@
 
 template <typename T>
 inline void scale_bilinear_nhwc_core(const ITensor *input, const ITensor *offsets, const ITensor *dx, const ITensor *dy, ITensor *output,
-                                     float hr, Window window, const Window &win_in, size_t stride_w, size_t stride_h, size_t stride_c, BorderMode border_mode)
+                                     float hr, float sampling_offset, Window window, const Window &win_in, size_t stride_w, size_t stride_h, size_t stride_c, BorderMode border_mode)
 {
     Iterator in(input, win_in);
     Iterator out(output, window);
@@ -202,12 +205,16 @@
 
     int border_size = (border_mode == BorderMode::UNDEFINED) ? 0 : 1;
 
+    const bool             is_quantized = (input->info()->data_type() == DataType::QASYMM8);
+    const QuantizationInfo iq_info      = input->info()->quantization_info();
+    const QuantizationInfo oq_info      = output->info()->quantization_info();
+
     execute_window_loop(window, [&](const Coordinates & id)
     {
         const auto offset     = (*reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())))) / static_cast<int>(sizeof(T));
         const auto dx_scale   = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
         const auto dy_scale   = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
-        const int  in_yi      = std::floor((id.z() + 0.5f) * hr - 0.5f);
+        const int  in_yi      = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
         const int  offset_row = in_yi * stride_h + id.x() * stride_c;
         const T   *in_ptr     = reinterpret_cast<T *>(in.ptr() + offset * stride_w + offset_row);
 
@@ -251,8 +258,22 @@
             const float w3 = dx1 * dy_scale;
             const float w4 = dx_scale * dy_scale;
 
+            T res = 0;
+            //dequantize quantized input
+            if(is_quantized)
+            {
+                float inp00 = iq_info.dequantize(a00);
+                float inp01 = iq_info.dequantize(a01);
+                float inp10 = iq_info.dequantize(a10);
+                float inp11 = iq_info.dequantize(a11);
+                res         = static_cast<T>(oq_info.quantize((inp00 * w1 + inp01 * w2 + inp10 * w3 + inp11 * w4), RoundingPolicy::TO_NEAREST_UP));
+            }
+            else
+            {
+                res = static_cast<T>(a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4);
+            }
             // Store result
-            *reinterpret_cast<T *>(out.ptr()) = static_cast<T>(a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4);
+            *reinterpret_cast<T *>(out.ptr()) = res;
         }
         else
         {
@@ -273,7 +294,7 @@
 } // namespace
 
 NEScaleKernel::NEScaleKernel()
-    : _func(nullptr), _offsets(nullptr), _dx(nullptr), _dy(nullptr), _input(nullptr), _output(nullptr), _policy(), _border_size(1), _border_mode()
+    : _func(nullptr), _offsets(nullptr), _dx(nullptr), _dy(nullptr), _input(nullptr), _output(nullptr), _policy(), _border_size(1), _border_mode(), _sampling_offset(0)
 {
 }
 
@@ -309,6 +330,11 @@
     _border_size = BorderSize(1);
     _border_mode = border_mode;
 
+    if(sampling_policy == SamplingPolicy::CENTER)
+    {
+        _sampling_offset = 0.5f;
+    }
+
     // Compute the ratio between source width/height and destination width/height
     const auto wr = static_cast<float>(input->info()->dimension(idx_width)) / static_cast<float>(output->info()->dimension(idx_width));
     const auto hr = static_cast<float>(input->info()->dimension(idx_height)) / static_cast<float>(output->info()->dimension(idx_height));
@@ -387,6 +413,7 @@
 
     switch(_input->info()->data_type())
     {
+        case DataType::QASYMM8:
         case DataType::U8:
         {
             uint8x16_t tmp = vdupq_n_u8(0);
@@ -463,6 +490,48 @@
             in, offsets, out);
             break;
         }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+        {
+            float16x8x2_t tmp =
+            {
+                {
+                    vdupq_n_f16(0),
+                    vdupq_n_f16(0)
+                }
+            };
+
+            execute_window_loop(window, [&](const Coordinates & id)
+            {
+                const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
+
+                const int in_yi      = (id.y() + 0.5f) * hr;
+                const int offset_row = in_yi * input_stride;
+
+                tmp.val[0] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0);
+                tmp.val[0] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[2] + offset_row), tmp.val[0], 1);
+                tmp.val[0] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[4] + offset_row), tmp.val[0], 2);
+                tmp.val[0] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[6] + offset_row), tmp.val[0], 3);
+                tmp.val[0] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[8] + offset_row), tmp.val[0], 4);
+                tmp.val[0] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[10] + offset_row), tmp.val[0], 5);
+                tmp.val[0] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[12] + offset_row), tmp.val[0], 6);
+                tmp.val[0] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[14] + offset_row), tmp.val[0], 7);
+
+                tmp.val[1] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[1] + offset_row), tmp.val[1], 0);
+                tmp.val[1] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[3] + offset_row), tmp.val[1], 1);
+                tmp.val[1] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[5] + offset_row), tmp.val[1], 2);
+                tmp.val[1] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[7] + offset_row), tmp.val[1], 3);
+                tmp.val[1] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[9] + offset_row), tmp.val[1], 4);
+                tmp.val[1] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[11] + offset_row), tmp.val[1], 5);
+                tmp.val[1] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[13] + offset_row), tmp.val[1], 6);
+                tmp.val[1] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[15] + offset_row), tmp.val[1], 7);
+
+                vst2q_f16(reinterpret_cast<__fp16 *>(out.ptr()), tmp);
+            },
+            in, offsets, out);
+            break;
+        }
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
         case DataType::F32:
         {
             float32x4x4_t tmp =
@@ -515,7 +584,7 @@
 
 void NEScaleKernel::scale_bilinear_nchw(const Window &window)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8, DataType::S16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
 
     // Compute the ratio between source height and destination height
     const auto hr = static_cast<float>(_input->info()->dimension(1)) / static_cast<float>(_output->info()->dimension(1));
@@ -545,8 +614,13 @@
     const size_t in_stide_in_bytes = _input->info()->strides_in_bytes()[1];
     const size_t in_stride         = in_stide_in_bytes / _input->info()->element_size();
 
+    const bool             is_quantized = (_input->info()->data_type() == DataType::QASYMM8);
+    const QuantizationInfo iq_info      = _input->info()->quantization_info();
+    const QuantizationInfo oq_info      = _output->info()->quantization_info();
+
     switch(_input->info()->data_type())
     {
+        case DataType::QASYMM8:
         case DataType::U8:
         {
             execute_window_loop(window, [&](const Coordinates & id)
@@ -556,29 +630,55 @@
                 const auto dy_ptr      = reinterpret_cast<const float *>(dy.ptr());
                 const auto in_ptr      = reinterpret_cast<const uint8_t *>(in.ptr());
 
-                const int in_yi      = std::floor((id.y() + 0.5f) * hr - 0.5f);
+                const int in_yi      = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
                 const int offset_row = in_yi * in_stide_in_bytes;
 
                 uint8x8_t tmp0 = vdup_n_u8(0);
-                tmp0           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[0] + offset_row], in_stride, dx_ptr[0], dy_ptr[0]), tmp0, 0);
-                tmp0           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[1] + offset_row], in_stride, dx_ptr[1], dy_ptr[1]), tmp0, 1);
-                tmp0           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[2] + offset_row], in_stride, dx_ptr[2], dy_ptr[2]), tmp0, 2);
-                tmp0           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[3] + offset_row], in_stride, dx_ptr[3], dy_ptr[3]), tmp0, 3);
-                tmp0           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[4] + offset_row], in_stride, dx_ptr[4], dy_ptr[4]), tmp0, 4);
-                tmp0           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[5] + offset_row], in_stride, dx_ptr[5], dy_ptr[5]), tmp0, 5);
-                tmp0           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[6] + offset_row], in_stride, dx_ptr[6], dy_ptr[6]), tmp0, 6);
-                tmp0           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[7] + offset_row], in_stride, dx_ptr[7], dy_ptr[7]), tmp0, 7);
-
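+                // For QASYMM8 inputs the quantized helper dequantizes, interpolates and requantizes each pixel
+                // using the input/output quantization info; the plain U8 path interpolates on the raw values.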
+                if(is_quantized)
+                {
+                    tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[0] + offset_row], in_stride, dx_ptr[0], dy_ptr[0], iq_info, oq_info), tmp0, 0);
+                    tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[1] + offset_row], in_stride, dx_ptr[1], dy_ptr[1], iq_info, oq_info), tmp0, 1);
+                    tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[2] + offset_row], in_stride, dx_ptr[2], dy_ptr[2], iq_info, oq_info), tmp0, 2);
+                    tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[3] + offset_row], in_stride, dx_ptr[3], dy_ptr[3], iq_info, oq_info), tmp0, 3);
+                    tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[4] + offset_row], in_stride, dx_ptr[4], dy_ptr[4], iq_info, oq_info), tmp0, 4);
+                    tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[5] + offset_row], in_stride, dx_ptr[5], dy_ptr[5], iq_info, oq_info), tmp0, 5);
+                    tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[6] + offset_row], in_stride, dx_ptr[6], dy_ptr[6], iq_info, oq_info), tmp0, 6);
+                    tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[7] + offset_row], in_stride, dx_ptr[7], dy_ptr[7], iq_info, oq_info), tmp0, 7);
+                }
+                else
+                {
+                    tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[0] + offset_row], in_stride, dx_ptr[0], dy_ptr[0]), tmp0, 0);
+                    tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[1] + offset_row], in_stride, dx_ptr[1], dy_ptr[1]), tmp0, 1);
+                    tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[2] + offset_row], in_stride, dx_ptr[2], dy_ptr[2]), tmp0, 2);
+                    tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[3] + offset_row], in_stride, dx_ptr[3], dy_ptr[3]), tmp0, 3);
+                    tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[4] + offset_row], in_stride, dx_ptr[4], dy_ptr[4]), tmp0, 4);
+                    tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[5] + offset_row], in_stride, dx_ptr[5], dy_ptr[5]), tmp0, 5);
+                    tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[6] + offset_row], in_stride, dx_ptr[6], dy_ptr[6]), tmp0, 6);
+                    tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[7] + offset_row], in_stride, dx_ptr[7], dy_ptr[7]), tmp0, 7);
+                }
                 uint8x8_t tmp1 = vdup_n_u8(0);
-                tmp1           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[8] + offset_row], in_stride, dx_ptr[8], dy_ptr[8]), tmp1, 0);
-                tmp1           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[9] + offset_row], in_stride, dx_ptr[9], dy_ptr[9]), tmp1, 1);
-                tmp1           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[10] + offset_row], in_stride, dx_ptr[10], dy_ptr[10]), tmp1, 2);
-                tmp1           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[11] + offset_row], in_stride, dx_ptr[11], dy_ptr[11]), tmp1, 3);
-                tmp1           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[12] + offset_row], in_stride, dx_ptr[12], dy_ptr[12]), tmp1, 4);
-                tmp1           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[13] + offset_row], in_stride, dx_ptr[13], dy_ptr[13]), tmp1, 5);
-                tmp1           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[14] + offset_row], in_stride, dx_ptr[14], dy_ptr[14]), tmp1, 6);
-                tmp1           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[15] + offset_row], in_stride, dx_ptr[15], dy_ptr[15]), tmp1, 7);
-
+                if(is_quantized)
+                {
+                    tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[8] + offset_row], in_stride, dx_ptr[8], dy_ptr[8], iq_info, oq_info), tmp1, 0);
+                    tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[9] + offset_row], in_stride, dx_ptr[9], dy_ptr[9], iq_info, oq_info), tmp1, 1);
+                    tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[10] + offset_row], in_stride, dx_ptr[10], dy_ptr[10], iq_info, oq_info), tmp1, 2);
+                    tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[11] + offset_row], in_stride, dx_ptr[11], dy_ptr[11], iq_info, oq_info), tmp1, 3);
+                    tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[12] + offset_row], in_stride, dx_ptr[12], dy_ptr[12], iq_info, oq_info), tmp1, 4);
+                    tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[13] + offset_row], in_stride, dx_ptr[13], dy_ptr[13], iq_info, oq_info), tmp1, 5);
+                    tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[14] + offset_row], in_stride, dx_ptr[14], dy_ptr[14], iq_info, oq_info), tmp1, 6);
+                    tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[15] + offset_row], in_stride, dx_ptr[15], dy_ptr[15], iq_info, oq_info), tmp1, 7);
+                }
+                else
+                {
+                    tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[8] + offset_row], in_stride, dx_ptr[8], dy_ptr[8]), tmp1, 0);
+                    tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[9] + offset_row], in_stride, dx_ptr[9], dy_ptr[9]), tmp1, 1);
+                    tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[10] + offset_row], in_stride, dx_ptr[10], dy_ptr[10]), tmp1, 2);
+                    tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[11] + offset_row], in_stride, dx_ptr[11], dy_ptr[11]), tmp1, 3);
+                    tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[12] + offset_row], in_stride, dx_ptr[12], dy_ptr[12]), tmp1, 4);
+                    tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[13] + offset_row], in_stride, dx_ptr[13], dy_ptr[13]), tmp1, 5);
+                    tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[14] + offset_row], in_stride, dx_ptr[14], dy_ptr[14]), tmp1, 6);
+                    tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[15] + offset_row], in_stride, dx_ptr[15], dy_ptr[15]), tmp1, 7);
+                }
                 vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1));
             },
             in, offsets, dx, dy, out);
@@ -592,7 +692,7 @@
                 const auto dx_ptr      = reinterpret_cast<const float *>(dx.ptr());
                 const auto dy_ptr      = reinterpret_cast<const float *>(dy.ptr());
 
-                const int in_yi      = std::floor((id.y() + 0.5f) * hr - 0.5f);
+                const int in_yi      = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
                 const int offset_row = in_yi * in_stide_in_bytes;
 
                 int16x8x2_t tmp =
@@ -626,6 +726,50 @@
             in, offsets, dx, dy, out);
             break;
         }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+        {
+            execute_window_loop(window, [&](const Coordinates & id)
+            {
+                const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
+                const auto dx_ptr      = reinterpret_cast<const float *>(dx.ptr());
+                const auto dy_ptr      = reinterpret_cast<const float *>(dy.ptr());
+
+                const int in_yi      = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
+                const int offset_row = in_yi * in_stide_in_bytes;
+
+                float16x8x2_t tmp =
+                {
+                    {
+                        vdupq_n_f16(0),
+                        vdupq_n_f16(0)
+                    }
+                };
+
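+                // Bilinear-interpolate sixteen F16 pixels: even offsets fill tmp.val[0], odd offsets fill
+                // tmp.val[1], and vst2q_f16 restores the interleaved order on store.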
+                tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[0] + offset_row), in_stride, dx_ptr[0], dy_ptr[0]), tmp.val[0], 0);
+                tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[2] + offset_row), in_stride, dx_ptr[2], dy_ptr[2]), tmp.val[0], 1);
+                tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[4] + offset_row), in_stride, dx_ptr[4], dy_ptr[4]), tmp.val[0], 2);
+                tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[6] + offset_row), in_stride, dx_ptr[6], dy_ptr[6]), tmp.val[0], 3);
+                tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[8] + offset_row), in_stride, dx_ptr[8], dy_ptr[8]), tmp.val[0], 4);
+                tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[10] + offset_row), in_stride, dx_ptr[10], dy_ptr[10]), tmp.val[0], 5);
+                tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[12] + offset_row), in_stride, dx_ptr[12], dy_ptr[12]), tmp.val[0], 6);
+                tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[14] + offset_row), in_stride, dx_ptr[14], dy_ptr[14]), tmp.val[0], 7);
+
+                tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[1] + offset_row), in_stride, dx_ptr[1], dy_ptr[1]), tmp.val[1], 0);
+                tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[3] + offset_row), in_stride, dx_ptr[3], dy_ptr[3]), tmp.val[1], 1);
+                tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[5] + offset_row), in_stride, dx_ptr[5], dy_ptr[5]), tmp.val[1], 2);
+                tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[7] + offset_row), in_stride, dx_ptr[7], dy_ptr[7]), tmp.val[1], 3);
+                tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[9] + offset_row), in_stride, dx_ptr[9], dy_ptr[9]), tmp.val[1], 4);
+                tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[11] + offset_row), in_stride, dx_ptr[11], dy_ptr[11]), tmp.val[1], 5);
+                tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[13] + offset_row), in_stride, dx_ptr[13], dy_ptr[13]), tmp.val[1], 6);
+                tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[15] + offset_row), in_stride, dx_ptr[15], dy_ptr[15]), tmp.val[1], 7);
+
+                vst2q_f16(reinterpret_cast<__fp16 *>(out.ptr()), tmp);
+            },
+            in, offsets, dx, dy, out);
+            break;
+        }
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
         case DataType::F32:
         {
             execute_window_loop(window, [&](const Coordinates & id)
@@ -634,7 +778,7 @@
                 const auto dx_ptr      = reinterpret_cast<const float *>(dx.ptr());
                 const auto dy_ptr      = reinterpret_cast<const float *>(dy.ptr());
 
-                const int in_yi      = std::floor((id.y() + 0.5f) * hr - 0.5f);
+                const int in_yi      = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
                 const int offset_row = in_yi * in_stide_in_bytes;
 
                 float32x4x4_t tmp =
@@ -751,6 +895,7 @@
 
     switch(_input->info()->data_type())
     {
+        case DataType::QASYMM8:
         case DataType::U8:
         {
             if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
@@ -759,7 +904,7 @@
             }
             else
             {
-                scale_bilinear_nhwc_core<uint8_t>(_input, _offsets, _dx, _dy, _output, hr,
+                scale_bilinear_nhwc_core<uint8_t>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
                                                   window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode);
             }
             break;
@@ -772,11 +917,27 @@
             }
             else
             {
-                scale_bilinear_nhwc_core<int16_t>(_input, _offsets, _dx, _dy, _output, hr,
+                scale_bilinear_nhwc_core<int16_t>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
                                                   window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode);
             }
             break;
         }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+        {
+            if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+            {
+                scale_nearest_nhwc_core<float16_t>(_input, _offsets, _output, hr,
+                                                   window, win_in, input_stride_w, input_stride_h, input_stride_c);
+            }
+            else
+            {
+                scale_bilinear_nhwc_core<float16_t>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
+                                                    window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode);
+            }
+            break;
+        }
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
         case DataType::F32:
         {
             if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
@@ -785,7 +946,7 @@
             }
             else
             {
-                scale_bilinear_nhwc_core<float>(_input, _offsets, _dx, _dy, _output, hr,
+                scale_bilinear_nhwc_core<float>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
                                                 window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode);
             }
             break;
diff --git a/src/core/NEON/kernels/NESelectKernel.cpp b/src/core/NEON/kernels/NESelectKernel.cpp
new file mode 100644
index 0000000..f2697bc
--- /dev/null
+++ b/src/core/NEON/kernels/NESelectKernel.cpp
@@ -0,0 +1,264 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NESelectKernel.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "utils/TypePrinter.h"
+
+#include <arm_neon.h>
+#include <map>
+#include <string>
+
+namespace arm_compute
+{
+namespace
+{
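+// Generic element-wise select: out[i] = condition[i] ? in1[i] : in2[i].
+// The caller provides condition_conversion to turn the uint8_t condition buffer into a lane mask of the
+// same width as the data vectors, so wrapper::vbsl can blend the two inputs.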
+template <typename ScalarType, typename VectorType>
+void select_op(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
+               const int window_step_x, const int window_start_x, const int window_end_x, const int limit, VectorType (*condition_conversion)(const uint8_t *))
+{
+    Window win = window;
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator condition(cond, win);
+    Iterator input1(in1, win);
+    Iterator input2(in2, win);
+    Iterator output(out, win);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        auto       output_ptr    = reinterpret_cast<ScalarType *>(output.ptr());
+        const auto condition_ptr = reinterpret_cast<const uint8_t *>(condition.ptr());
+        const auto input1_ptr    = reinterpret_cast<const ScalarType *>(input1.ptr());
+        const auto input2_ptr    = reinterpret_cast<const ScalarType *>(input2.ptr());
+
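+        // Vectorised main loop: build the mask from the condition bytes and blend in1/in2 with vbsl.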
+        int x = window_start_x;
+        for(; x <= limit; x += window_step_x)
+        {
+            const auto c = (*condition_conversion)(condition_ptr + x);
+            const auto a = wrapper::vloadq(input1_ptr + x);
+            const auto b = wrapper::vloadq(input2_ptr + x);
+            wrapper::vstore(output_ptr + x, wrapper::vbsl(c, a, b));
+        }
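+        // Scalar tail for the elements left over after the last full vector.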
+        for(; x < window_end_x; ++x)
+        {
+            const auto c      = *(condition_ptr + x);
+            const auto a      = *(input1_ptr + x);
+            const auto b      = *(input2_ptr + x);
+            *(output_ptr + x) = static_cast<bool>(c) ? a : b;
+        }
+    },
+    condition, input1, input2, output);
+}
+
+template <typename ScalarType, typename VectorType>
+void select_op_8(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+    const auto window_step_x  = 16 / sizeof(ScalarType);
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr)
+    {
+        static const auto zero = wrapper::vdup_n(static_cast<uint8_t>(0), arm_compute::wrapper::traits::vector_128_tag());
+        return wrapper::vcgt(wrapper::vloadq(condition_ptr), zero);
+    });
+}
+
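+// The 16- and 32-bit variants widen the 8-bit condition values with vmovl before building the lane mask.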
+template <typename ScalarType, typename VectorType>
+void select_op_16(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+    const auto window_step_x  = 16 / sizeof(ScalarType);
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr)
+    {
+        static const auto zero = wrapper::vdup_n(static_cast<uint16_t>(0), arm_compute::wrapper::traits::vector_128_tag());
+        return wrapper::vcgt(wrapper::vmovl(wrapper::vload(condition_ptr)), zero);
+    });
+}
+
+template <typename ScalarType, typename VectorType>
+void select_op_32(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+    const auto window_step_x  = 16 / sizeof(ScalarType);
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr)
+    {
+        static const auto zero = wrapper::vdup_n(static_cast<uint32_t>(0), arm_compute::wrapper::traits::vector_128_tag());
+        return wrapper::vcgt(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vload(condition_ptr)))), zero);
+    });
+}
+
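+// Select with a lower-rank condition: each condition value chooses a whole inner slice of in1 or in2,
+// copied with full vectors, then a half vector, then a scalar tail.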
+template <typename ScalarType>
+void select_op_not_same_rank(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+    ARM_COMPUTE_UNUSED(window);
+
+    auto       output_ptr    = reinterpret_cast<ScalarType *>(out->buffer());
+    const auto condition_ptr = reinterpret_cast<const uint8_t *>(cond->buffer());
+    const auto input1_ptr    = reinterpret_cast<const ScalarType *>(in1->buffer());
+    const auto input2_ptr    = reinterpret_cast<const ScalarType *>(in2->buffer());
+
+    const int outer_size = cond->info()->total_size() / cond->info()->element_size();
+    const int inner_size = (in1->info()->total_size() / in1->info()->element_size()) / outer_size;
+    int       offset     = 0;
+    const int step       = 16 / in1->info()->element_size();
+
+    for(int i = 0; i < outer_size; ++i)
+    {
+        int        x         = offset;
+        const auto input_ptr = static_cast<bool>(*(condition_ptr + i)) ? input1_ptr : input2_ptr;
+        for(; x <= offset + inner_size - step; x += step)
+        {
+            wrapper::vstore(output_ptr + x, wrapper::vloadq(input_ptr + x));
+        }
+        if(x <= offset + inner_size - (step / 2))
+        {
+            wrapper::vstore(output_ptr + x, wrapper::vload(input_ptr + x));
+            x += step / 2;
+        }
+        for(; x < offset + inner_size; ++x)
+        {
+            *(output_ptr + x) = *(input_ptr + x);
+        }
+        offset += inner_size;
+    }
+}
+} // namespace
+
+NESelectKernel::NESelectKernel()
+    : _function(nullptr), _c(nullptr), _x(nullptr), _y(nullptr), _output(nullptr), _has_same_rank(false)
+{
+}
+
+void NESelectKernel::configure(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(c, x, y, output);
+
+    // Auto initialize output if not initialized
+    auto_init_if_empty(*output->info(), x->info()->tensor_shape(), 1, x->info()->data_type());
+    ARM_COMPUTE_ERROR_THROW_ON(validate(c->info(), x->info(), y->info(), output->info()));
+
+    _c             = c;
+    _x             = x;
+    _y             = y;
+    _output        = output;
+    _has_same_rank = (c->info()->tensor_shape().num_dimensions() == x->info()->tensor_shape().num_dimensions());
+
+    std::string function_to_call("op_");
+    function_to_call += string_from_data_type(x->info()->data_type());
+
+    static std::map<std::string, SelectFunction *> map_function;
+
+    if(_has_same_rank)
+    {
+        map_function =
+        {
+            { "op_S8", &select_op_8<int8_t, uint8x16_t> },
+            { "op_S16", &select_op_16<int16_t, uint16x8_t> },
+            { "op_S32", &select_op_32<int32_t, uint32x4_t> },
+            { "op_U8", &select_op_8<uint8_t, uint8x16_t> },
+            { "op_U16", &select_op_16<uint16_t, uint16x8_t> },
+            { "op_U32", &select_op_32<uint32_t, uint32x4_t> },
+            { "op_F32", &select_op_32<float, uint32x4_t> }
+        };
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        map_function["op_F16"] = &select_op_16<float16_t, uint16x8_t>;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+    }
+    else
+    {
+        map_function =
+        {
+            { "op_S8", &select_op_not_same_rank<int8_t> },
+            { "op_S16", &select_op_not_same_rank<int16_t> },
+            { "op_S32", &select_op_not_same_rank<int32_t> },
+            { "op_U8", &select_op_not_same_rank<uint8_t> },
+            { "op_U16", &select_op_not_same_rank<uint16_t> },
+            { "op_U32", &select_op_not_same_rank<uint32_t> },
+            { "op_F32", &select_op_not_same_rank<float> }
+        };
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        map_function["op_F16"] = &select_op_not_same_rank<float16_t>;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+    }
+
+    auto it = map_function.find(function_to_call);
+
+    if(it != map_function.end())
+    {
+        _function = it->second;
+    }
+
+    Window win = calculate_max_window(x->info()->valid_region());
+    INEKernel::configure(win);
+}
+
+Status NESelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(x);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(x,
+                                                         1,
+                                                         DataType::U8, DataType::S8,
+                                                         DataType::U16, DataType::S16,
+                                                         DataType::U32, DataType::S32,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(x, y);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(x, y);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(c, 1, DataType::U8);
+
+    const bool is_same_rank = (c->tensor_shape().num_dimensions() == x->tensor_shape().num_dimensions());
+    ARM_COMPUTE_RETURN_ERROR_ON(is_same_rank && (x->tensor_shape() != c->tensor_shape()));
+    ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank && ((c->tensor_shape().num_dimensions() > 1) || (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1])));
+
+    if(output != nullptr && output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(x, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(x, output);
+    }
+
+    return Status{};
+}
+
+void NESelectKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON(_function == nullptr);
+    _function(_c, _x, _y, _output, window);
+}
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
index 0f416de..e9417ec 100644
--- a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
@@ -34,7 +34,7 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/Utility.h"
+#include "arm_compute/core/utils/misc/SaturateCast.h"
 
 #include <algorithm>
 #include <arm_neon.h>
@@ -667,7 +667,7 @@
             /* Run remaining elements */
             for(; i < input_width; ++i)
             {
-                out_ptr[i] = utility::saturate_cast<qasymm8_t>(tmp_ptr[i] * sum_inversed);
+                out_ptr[i] = utils::cast::saturate_cast<qasymm8_t>(tmp_ptr[i] * sum_inversed);
             }
         }
     },
diff --git a/src/core/NEON/kernels/NEStackLayerKernel.cpp b/src/core/NEON/kernels/NEStackLayerKernel.cpp
new file mode 100644
index 0000000..0c33f36
--- /dev/null
+++ b/src/core/NEON/kernels/NEStackLayerKernel.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEStackLayerKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S8,
+                                                         DataType::U16, DataType::S16, DataType::U32, DataType::S32,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(idx_input >= num_tensors);
+    ARM_COMPUTE_RETURN_ERROR_ON(axis > input->num_dimensions());
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_stack_shape(*input, axis, num_tensors));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output)
+{
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_stack_shape(*input, axis, num_tensors)));
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input);
+
+    return std::make_pair(Status{}, win);
+}
+
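+// Build the output coordinate for one stacked input: dimensions above 'axis' are shifted up by one and
+// the 'axis' dimension is set to the index of the input within the stack.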
+inline Coordinates shift_from_axis_and_replace_coordinate(const Coordinates &id, unsigned int axis, unsigned int idx_input)
+{
+    constexpr int max_out_coord = 5; // The input shape is at most 4D, so the output shape is at most 5D
+    Coordinates   id_out        = id;
+    for(unsigned int i = max_out_coord - 1; i > axis; --i)
+    {
+        id_out.set(i, id[i - 1]);
+    }
+    id_out.set(axis, idx_input);
+    return id_out;
+}
+} // namespace
+
+NEStackLayerKernel::NEStackLayerKernel()
+    : _input(nullptr), _output(nullptr), _axis(), _idx_input(), _func(nullptr)
+{
+}
+
+void NEStackLayerKernel::configure(const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, idx_input, num_tensors, output->info()));
+
+    _input     = input;
+    _output    = output;
+    _axis      = axis;
+    _idx_input = idx_input;
+
+    switch(input->info()->element_size())
+    {
+        case 1:
+            _func = &NEStackLayerKernel::run_stack<uint8_t>;
+            break;
+        case 2:
+            _func = &NEStackLayerKernel::run_stack<uint16_t>;
+            break;
+        case 4:
+            _func = &NEStackLayerKernel::run_stack<uint32_t>;
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Element size not supported");
+            break;
+    }
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), axis, num_tensors, output->info());
+
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
+
+Status NEStackLayerKernel::validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, idx_input, num_tensors, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first);
+    return Status{};
+}
+
+void NEStackLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+    if(_func != nullptr)
+    {
+        (this->*_func)(window);
+    }
+}
+
+template <typename T>
+void NEStackLayerKernel::run_stack(const Window &window)
+{
+    Window window_out;
+    window_out.use_tensor_dimensions(_output->info()->tensor_shape());
+
+    Iterator input(_input, window);
+    Iterator output(_output, window_out);
+
+    const int stride_x = _output->info()->strides_in_bytes()[0];
+    const int stride_y = _output->info()->num_dimensions() >= 1 ? _output->info()->strides_in_bytes()[1] : 0;
+    const int stride_z = _output->info()->num_dimensions() >= 2 ? _output->info()->strides_in_bytes()[2] : 0;
+    const int stride_w = _output->info()->num_dimensions() >= 3 ? _output->info()->strides_in_bytes()[3] : 0;
+    const int stride_k = _output->info()->num_dimensions() >= 4 ? _output->info()->strides_in_bytes()[4] : 0;
+
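+    // Copy each input element to the output offset computed from the shifted coordinates.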
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        Coordinates id_out                           = shift_from_axis_and_replace_coordinate(id, _axis, _idx_input);
+        const int   idx                              = id_out[0] * stride_x + id_out[1] * stride_y + id_out[2] * stride_z + id_out[3] * stride_w + id_out[4] * stride_k;
+        *(reinterpret_cast<T *>(output.ptr() + idx)) = *(reinterpret_cast<const T *>(input.ptr()));
+    },
+    input);
+}
diff --git a/src/core/NEON/kernels/NEStridedSliceKernel.cpp b/src/core/NEON/kernels/NEStridedSliceKernel.cpp
new file mode 100644
index 0000000..2ae029b
--- /dev/null
+++ b/src/core/NEON/kernels/NEStridedSliceKernel.cpp
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEStridedSliceKernel.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Window.h"
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/helpers/bit_ops.h"
+#include "arm_compute/core/utils/helpers/tensor_transform.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+                          int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1,
+                                                         DataType::U8, DataType::S8, DataType::QASYMM8,
+                                                         DataType::U16, DataType::S16,
+                                                         DataType::U32, DataType::S32,
+                                                         DataType::F16, DataType::F32);
+
+    ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().num_dimensions() > 4);
+    ARM_COMPUTE_RETURN_ERROR_ON(starts.num_dimensions() > input->num_dimensions());
+    ARM_COMPUTE_RETURN_ERROR_ON(ends.num_dimensions() > input->num_dimensions());
+    ARM_COMPUTE_RETURN_ERROR_ON(strides.num_dimensions() > input->num_dimensions());
+    ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i)
+    {
+        return i == 0;
+    }));
+
+    // Get expected output shape
+    const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input,
+                                                                                                          starts, ends, strides,
+                                                                                                          begin_mask, end_mask, shrink_axis_mask);
+    ARM_COMPUTE_RETURN_ERROR_ON(exp_output_shape.total_size() == 0);
+
+    // Checks output if configured
+    if(output->total_size() != 0)
+    {
+        const TensorInfo exp_output_info = output->clone()->set_tensor_shape(exp_output_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &exp_output_info);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output,
+                                                        const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+                                                        int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+    // Output tensor auto initialization if not yet initialized
+    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input,
+                                                                                                      starts, ends, strides,
+                                                                                                      begin_mask, end_mask, shrink_axis_mask);
+    auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
+
+    // Create window
+    const unsigned int num_elems_processed_per_iteration = 1;
+
+    Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+    output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+
+    return std::make_pair(Status{}, win);
+}
+
+void strided_slice_generic(const ITensor *input, ITensor *output,
+                           const Coordinates &starts, const BiStrides &strides, int32_t shrink_axis_mask,
+                           const Window &window)
+{
+    Iterator     output_it(output, window);
+    const size_t width_size = input->info()->element_size();
+
+    const bool is_shrink_w = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, 0);
+    const bool is_shrink_h = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, 1);
+    const bool is_shrink_c = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, 2);
+    const bool is_shrink_n = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, 3);
+
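+    // Work out which output window dimension drives each input dimension; dimensions removed by
+    // shrink_axis_mask contribute nothing, as their stride is zeroed below.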
+    unsigned int index = 0;
+    const int    idx_w = is_shrink_w ? 0 : index++;
+    const int    idx_h = is_shrink_h ? 0 : index++;
+    const int    idx_c = is_shrink_c ? 0 : index++;
+    const int    idx_n = is_shrink_n ? 0 : index;
+
+    BiStrides shrinked_strides;
+    shrinked_strides.set(0, is_shrink_w ? 0 : strides[0]);
+    shrinked_strides.set(1, is_shrink_h ? 0 : strides[1]);
+    shrinked_strides.set(2, is_shrink_c ? 0 : strides[2]);
+    shrinked_strides.set(3, is_shrink_n ? 0 : strides[3]);
+
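+    // For every output element, compute the input coordinate from starts and strides and copy one element.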
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        const int w_coord = starts[0] + (id[idx_w] * shrinked_strides[0]);
+        const int h_coord = starts[1] + (id[idx_h] * shrinked_strides[1]);
+        const int c_coord = starts[2] + (id[idx_c] * shrinked_strides[2]);
+        const int n_coord = starts[3] + (id[idx_n] * shrinked_strides[3]);
+
+        Coordinates in_coords(w_coord, h_coord, c_coord, n_coord);
+        std::copy_n(input->ptr_to_element(in_coords), width_size, output_it.ptr());
+    },
+    output_it);
+}
+} // namespace
+
+NEStridedSliceKernel::NEStridedSliceKernel()
+    : _input(nullptr), _output(nullptr), _starts_abs(), _final_strides(), _shrink_mask()
+{
+}
+
+void NEStridedSliceKernel::configure(const ITensor *input, ITensor *output,
+                                     const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+                                     int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
+
+    _input       = input;
+    _output      = output;
+    _shrink_mask = shrink_axis_mask;
+
+    const TensorShape &input_shape = input->info()->tensor_shape();
+
+    Coordinates ends_abs;
+    std::tie(_starts_abs, ends_abs, _final_strides) = arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(
+                                                          input_shape,
+                                                          starts, ends, strides,
+                                                          begin_mask, end_mask, shrink_axis_mask);
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), output->info(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
+
+Status NEStridedSliceKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                      const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+                                      int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(),
+                                                              starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)
+                                .first);
+
+    return Status{};
+}
+
+void NEStridedSliceKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+    // Dispatch kernel
+    strided_slice_generic(_input, _output, _starts_abs, _final_strides, _shrink_mask, window);
+}
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NETileKernel.cpp b/src/core/NEON/kernels/NETileKernel.cpp
new file mode 100644
index 0000000..dbeacfa
--- /dev/null
+++ b/src/core/NEON/kernels/NETileKernel.cpp
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NETileKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Multiples &multiples)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(multiples.size() > 4);
+    ARM_COMPUTE_RETURN_ERROR_ON(multiples.empty());
+    ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e)
+    {
+        return e == 0;
+    }));
+
+    // Validate output if initialized
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+} // namespace
+
+NETileKernel::NETileKernel()
+    : _input(nullptr), _output(nullptr)
+{
+}
+
+void NETileKernel::configure(const ITensor *input, ITensor *output, const Multiples &multiples)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    // Auto initialize output
+    TensorShape tiled_shape = misc::shape_calculator::compute_tiled_shape(input->info()->tensor_shape(), multiples);
+    auto_init_if_empty(*output->info(), tiled_shape, 1, input->info()->data_type());
+
+    // Validate
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), multiples));
+
+    _input  = input;
+    _output = output;
+
+    // Configure window without padding
+    Window win = calculate_max_window(*output->info());
+    INEKernel::configure(win);
+}
+
+Status NETileKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Multiples &multiples)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, multiples));
+    return Status{};
+}
+
+void NETileKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+    Window output_window{ window };
+    output_window.set(Window::DimX, Window::Dimension(output_window.x().start(), output_window.x().end(), _input->info()->dimension(0)));
+    Window out_slice = output_window.first_slice_window_1D();
+
+    const auto src_shape = _input->info()->tensor_shape();
+    do
+    {
+        Iterator output_it(_output, out_slice);
+
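+        // Wrap the output coordinates modulo the source shape so the input is repeated along each tiled
+        // dimension, then copy one full input row per iteration.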
+        execute_window_loop(out_slice, [&](const Coordinates & id)
+        {
+            const size_t x = id.x();
+            const size_t y = id.y();
+            const size_t z = id.z();
+            const size_t w = id[3];
+            Coordinates  input_coords{ x % src_shape[0], y % src_shape[1], z % src_shape[2], w % src_shape[3] };
+            memcpy(output_it.ptr(), _input->ptr_to_element(input_coords), _input->info()->dimension(0) * _input->info()->element_size());
+        },
+        output_it);
+    }
+    while(output_window.slide_window_slice_1D(out_slice));
+}
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NETransposeKernel.cpp b/src/core/NEON/kernels/NETransposeKernel.cpp
index 870d2c9..a0a8b82 100644
--- a/src/core/NEON/kernels/NETransposeKernel.cpp
+++ b/src/core/NEON/kernels/NETransposeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -86,6 +86,7 @@
 
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
     }
 
     return Status{};
diff --git a/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp b/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp
index 5dca58e..aae85c6 100644
--- a/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/core/NEON/kernels/NEUpsampleLayerKernel.h"
 
+#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
@@ -95,6 +96,7 @@
     const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
     const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
 
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.x() != 2 || info.y() != 2, "Only stride 2 is supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(policy != InterpolationPolicy::NEAREST_NEIGHBOR, "Only nearest neighbor policy supported");
@@ -106,6 +108,7 @@
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_width) != info.x() * input->dimension(idx_width));
         ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_height) != info.y() * input->dimension(idx_height));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
     }
 
     const int num_elems_processed_per_iteration_x = 16 / input->element_size();
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
index 259f4fc..4a0cf27 100644
--- a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -120,6 +120,7 @@
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), get_output_shape(input, biases != nullptr));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
     }
 
     return Status{};
diff --git a/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp
index a84a6d9..aea6875 100644
--- a/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,6 +27,7 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
 #include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
@@ -56,6 +57,7 @@
 Status validate_arguments(const ITensorInfo *input, unsigned int width_offset, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1,
                                                          DataType::U8, DataType::S8, DataType::QASYMM8,
                                                          DataType::U16, DataType::S16, DataType::F16,
@@ -110,15 +112,28 @@
     uint8_t *output_ptr = _output->buffer() + _output->info()->offset_first_element_in_bytes() + _width_offset * _output->info()->strides_in_bytes()[0];
 
     // Create iterators
-    Iterator input(_input, window);
-    Iterator output(_output, window);
-
-    execute_window_loop(window, [&](const Coordinates & id)
+    Iterator                input(_input, window);
+    Iterator                output(_output, window);
+    const DataType          dt           = _input->info()->data_type();
+    const QuantizationInfo &input_qinfo  = _input->info()->quantization_info();
+    const QuantizationInfo &output_qinfo = _output->info()->quantization_info();
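+    // If the input and output QASYMM8 quantization differ, requantize while copying; otherwise do a plain vector copy.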
+    if(dt == DataType::QASYMM8 && input_qinfo != output_qinfo)
     {
-        const auto in_ptr  = input.ptr();
-        const auto out_ptr = output_ptr + output.offset();
+        execute_window_loop(window, [&](const Coordinates &)
+        {
+            vst1q_u8(output_ptr + output.offset(), vquantize(vdequantize(vld1q_u8(input.ptr()), input_qinfo), output_qinfo));
+        },
+        input, output);
+    }
+    else
+    {
+        execute_window_loop(window, [&](const Coordinates &)
+        {
+            const auto in_ptr  = input.ptr();
+            const auto out_ptr = output_ptr + output.offset();
 
-        wrapper::vstore(out_ptr, wrapper::vloadq(in_ptr));
-    },
-    input, output);
+            wrapper::vstore(out_ptr, wrapper::vloadq(in_ptr));
+        },
+        input, output);
+    }
 }
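
The QASYMM8 branch added above dequantizes each input value with the input tensor's QuantizationInfo and requantizes it with the output's, so width-concatenating tensors whose (scale, offset) pairs differ still produces values on the output's scale; the NEON path does this 16 lanes at a time via vdequantize/vquantize. A minimal scalar sketch of the same arithmetic, with made-up scale/offset values purely for illustration:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Scalar model of QASYMM8 dequantize/requantize: real = scale * (q - offset).
    static uint8_t requantize(uint8_t q, float in_scale, int in_offset,
                              float out_scale, int out_offset)
    {
        const float real = in_scale * (static_cast<int>(q) - in_offset);                // dequantize
        const int   out  = static_cast<int>(std::lround(real / out_scale)) + out_offset; // requantize
        return static_cast<uint8_t>(std::min(255, std::max(0, out)));                    // saturate to u8
    }

    int main()
    {
        // Hypothetical quantization parameters for two tensors being concatenated.
        const float in_scale  = 0.5f;  const int in_offset  = 10;
        const float out_scale = 0.25f; const int out_offset = 0;

        for (uint8_t q : { uint8_t(10), uint8_t(20), uint8_t(200) }) {
            std::printf("%3u -> %3u\n", q, requantize(q, in_scale, in_offset, out_scale, out_offset));
        }
        return 0;
    }

With in_scale=0.5/in_offset=10 and out_scale=0.25/out_offset=0, an input of 20 (real value 5.0) maps to 20 on the output scale; when the two QuantizationInfo objects are equal the kernel keeps the plain copy path instead.
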
diff --git a/src/core/NEON/kernels/NEYOLOLayerKernel.cpp b/src/core/NEON/kernels/NEYOLOLayerKernel.cpp
index 009562b..09a4a11 100644
--- a/src/core/NEON/kernels/NEYOLOLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEYOLOLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,6 +44,7 @@
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info, int32_t num_classes)
 {
     ARM_COMPUTE_UNUSED(act_info);
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
     ARM_COMPUTE_RETURN_ERROR_ON(act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC);
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
index 9194bdd..b561659 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -38,75 +38,60 @@
 
 namespace arm_gemm {
 
-#ifdef __ARM_FEATURE_SVE
-class GemmImpl_gemm_fp16_interleaved_fp16 : public GemmImplementation<__fp16, __fp16> {
-public:
-
-    UniqueGemmCommon<__fp16, __fp16> instantiate(const GemmArgs<__fp16> &args) override {
-        return UniqueGemmCommon<__fp16, __fp16>(new GemmInterleaved<interleaved_fp16_mla_3VLx8, __fp16, __fp16>(args));
-    }
-
-    GemmImpl_gemm_fp16_interleaved_fp16() : GemmImplementation<__fp16, __fp16>(GemmMethod::GEMM_INTERLEAVED_FP16) { }
-};
-
-#elif defined(__aarch64__)
-
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) || defined(FP16_KERNELS)
-class GemmImpl_gemm_fp16_interleaved_fp16 : public GemmImplementation<__fp16, __fp16> {
-public:
+static const GemmImplementation<__fp16, __fp16> gemm_fp16_methods[] = {
+#if defined(__ARM_FEATURE_SVE)
+{
+    GemmMethod::GEMM_INTERLEAVED,
+    "interleaved_fp16_mla_3VLx8",
+    [](const GemmArgs<__fp16> &args) { return (args._Ksize > 4); },
+    [](const GemmArgs<__fp16> &args) { return true; },
+    [](const GemmArgs<__fp16> &args) { return new GemmInterleaved<interleaved_fp16_mla_3VLx8, __fp16, __fp16>(args); }
+},
+#endif
+#if defined(__aarch64__) && (defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) || defined(FP16_KERNELS))
+{
+    GemmMethod::GEMM_INTERLEAVED,
+    "hgemm_24x8",
+    [](const GemmArgs<__fp16> &args) {
 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    bool is_supported(const GemmArgs<__fp16> &args) override {
         return args._ci->has_fp16();
-    }
-#endif
-
-    UniqueGemmCommon<__fp16, __fp16> instantiate(const GemmArgs<__fp16> &args) override {
-        return UniqueGemmCommon<__fp16, __fp16>(new GemmInterleaved<hgemm_24x8, __fp16, __fp16>(args));
-    }
-
-    GemmImpl_gemm_fp16_interleaved_fp16() : GemmImplementation<__fp16, __fp16>(GemmMethod::GEMM_INTERLEAVED_FP16) { }
-};
-#endif
-
-#endif // __aarch64__
-
-class GemmImpl_gemm_fp16_interleaved : public GemmImplementation<__fp16, __fp16> {
-public:
-    UniqueGemmCommon<__fp16, __fp16> instantiate(const GemmArgs<__fp16> &args) override {
-#ifdef __aarch64__
-        return UniqueGemmCommon<__fp16, __fp16>(new GemmInterleaved<sgemm_12x8, __fp16, __fp16>(args));
-#elif defined(__arm__)
-        return UniqueGemmCommon<__fp16, __fp16>(new GemmInterleaved<sgemm_8x6, __fp16, __fp16>(args));
 #else
-# error Unknown Architecture
+        return true;
 #endif
-    }
-
-    GemmImpl_gemm_fp16_interleaved() : GemmImplementation<__fp16, __fp16>(GemmMethod::GEMM_INTERLEAVED) { }
-};
-
-#if defined(__aarch64__) && (defined(__ARM_FEATURE_VECTOR_ARITHMETIC) || defined(FP16_KERNELS) || defined(__ARM_FEATURE_SVE))
-static GemmImpl_gemm_fp16_interleaved_fp16 gemm_fp16_interleaved_fp16_impl{};
+    },
+    [](const GemmArgs<__fp16> &args) { return true; },
+    [](const GemmArgs<__fp16> &args) { return new GemmInterleaved<hgemm_24x8, __fp16, __fp16>(args); }
+},
 #endif
-static GemmImpl_gemm_fp16_interleaved gemm_fp16_interleaved_impl{};
-
-static std::vector<GemmImplementation<__fp16, __fp16> *> gemm_fp16_methods = {
-#if defined(__aarch64__) && (defined(__ARM_FEATURE_VECTOR_ARITHMETIC) || defined(FP16_KERNELS) || defined(__ARM_FEATURE_SVE))
-    &gemm_fp16_interleaved_fp16_impl,
+#if defined(__arm__)
+{
+    GemmMethod::GEMM_INTERLEAVED,
+    "sgemm_8x6",
+    [](const GemmArgs<__fp16> &args) { return true; },
+    [](const GemmArgs<__fp16> &args) { return true; },
+    [](const GemmArgs<__fp16> &args) { return new GemmInterleaved<sgemm_8x6, __fp16, __fp16>(args); }
+},
 #endif
-    &gemm_fp16_interleaved_impl
+{
+    GemmMethod::DEFAULT,
+    "",
+    nullptr,
+    nullptr,
+    nullptr,
+}
 };
 
 template<>
-std::vector<GemmImplementation<__fp16, __fp16> *> &gemm_implementation_list<__fp16, __fp16>() {
+const GemmImplementation<__fp16, __fp16> *gemm_implementation_list<__fp16, __fp16>() {
     return gemm_fp16_methods;
 }
 
 /* Explicitly instantiate the external functions for these types. */
-template UniqueGemmCommon<__fp16, __fp16> gemm<__fp16, __fp16>(GemmArgs<__fp16> &args, GemmConfig *cfg);
-template GemmMethod get_gemm_method<__fp16, __fp16>(GemmArgs<__fp16> &args);
-template bool method_is_compatible<__fp16, __fp16>(GemmMethod method, GemmArgs<__fp16> &args);
+template UniqueGemmCommon<__fp16, __fp16> gemm<__fp16, __fp16>(const GemmArgs<__fp16> &args);
+template KernelDescription get_gemm_method<__fp16, __fp16>(const GemmArgs<__fp16> &args);
+template bool method_is_compatible<__fp16, __fp16>(GemmMethod method, const GemmArgs<__fp16> &args);
+template std::vector<std::string> get_compatible_kernels<__fp16, __fp16> (const GemmArgs<__fp16> &args);
 
 } // namespace arm_gemm
 
-#endif // __ARM_FP16_ARGS
+#endif // __ARM_FP16_ARGS
\ No newline at end of file
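
In the hgemm_24x8 entry above, the is_supported lambda only consults args._ci->has_fp16() when the translation unit is built without __ARM_FEATURE_FP16_VECTOR_ARITHMETIC (i.e. the FP16 kernel is compiled in via FP16_KERNELS but the baseline target may lack the instructions); otherwise it unconditionally returns true. A small standalone sketch of that compile-time/runtime split, using a stand-in CPUInfo type rather than the real arm_gemm one:

    #include <cstdio>

    // Stand-in for arm_gemm's CPUInfo runtime capability query.
    struct FakeCPUInfo {
        bool fp16;
        bool has_fp16() const { return fp16; }
    };

    // Returns whether an FP16 kernel may be selected on this CPU.
    static bool fp16_kernel_supported(const FakeCPUInfo &ci)
    {
    #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        // Kernel compiled for a baseline without FP16 arithmetic:
        // only pick it if the CPU we are running on actually has FP16.
        return ci.has_fp16();
    #else
        // The whole build already assumes FP16 arithmetic, so no runtime check is needed.
        return true;
    #endif
    }

    int main()
    {
        std::printf("%d %d\n", fp16_kernel_supported(FakeCPUInfo{false}),
                               fp16_kernel_supported(FakeCPUInfo{true}));
        return 0;
    }
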
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
index 7d14971..8bc33cc 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,6 +23,7 @@
  */
 #include "arm_gemm.hpp"
 #include "gemm_common.hpp"
+#include "gemm_hybrid.hpp"
 #include "gemm_implementation.hpp"
 #include "gemm_interleaved.hpp"
 #include "gemm_native.hpp"
@@ -30,112 +31,140 @@
 #include "gemv_native_transposed.hpp"
 #include "gemv_pretransposed.hpp"
 
-#include "kernels/a64_sgemm_12x8.hpp"
 #include "kernels/a32_sgemm_8x6.hpp"
-#include "kernels/a64_sgemv_trans.hpp"
-#include "kernels/a64_sgemv_pretransposed.hpp"
+#include "kernels/a64_sgemm_12x8.hpp"
 #include "kernels/a64_sgemm_native_16x4.hpp"
+#include "kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp"
+#include "kernels/a64_sgemv_pretransposed.hpp"
+#include "kernels/a64_sgemv_trans.hpp"
 
+#include "kernels/sve_hybrid_fp32_mla_4VLx4.hpp"
 #include "kernels/sve_interleaved_fp32_mla_3VLx8.hpp"
+#include "kernels/sve_native_fp32_mla_4VLx4.hpp"
+#include "kernels/sve_smallK_fp32_mla_1VLx4.hpp"
+#include "kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp"
 
 namespace arm_gemm {
 
-#if defined(__aarch64__) && !defined(__ARM_FEATURE_SVE)
-// SGEMM implementations for AArch64 without SVE
+static const GemmImplementation<float, float> gemm_fp32_methods[] =
+{
+{
+    GemmMethod::GEMV_BATCHED,
+    "gemv_batched",
+    [](const GemmArgs<float> &args) { return (args._Msize==1) && (args._nbatches>1); },
+    nullptr,
+    [](const GemmArgs<float> &args) { return new GemvBatched<float, float>(args); }
+},
+#ifdef __aarch64__
+{
+    GemmMethod::GEMV_PRETRANSPOSED,
+    "sgemv_pretransposed",
+    [](const GemmArgs<float> &args) { return (args._Msize==1 && args._alpha==1.0f && args._pretransposed_hint && args._nbatches==1); },
+    nullptr,
+    [](const GemmArgs<float> &args) { return new GemvPretransposed<sgemv_pretransposed, float, float>(args); }
+},
+{
+    GemmMethod::GEMV_NATIVE_TRANSPOSED,
+    "sgemv_trans",
+    [](const GemmArgs<float> &args) { return (args._Msize==1 && args._alpha==1.0f && !args._trA && !args._trB && args._nbatches==1); },
+    nullptr,
+    [](const GemmArgs<float> &args) { return new GemvNativeTransposed<sgemv_trans, float, float>(args); }
+},
 
-// Pretransposed GEMV
-class GemmImpl_sgemm_gemv_pretransposed : public GemmImplementation<float, float> {
-public:
-    bool is_supported(const GemmArgs<float> &args) override {
-        return (args._Msize==1 && args._alpha==1.0f && args._pretransposed_hint && args._nbatches==1);
-    }
+#ifdef __ARM_FEATURE_SVE
+// SVE smallk / native / hybrid methods
+{
+    GemmMethod::GEMM_HYBRID,
+    "smallK_hybrid_fp32_mla_1VLx4",
+    [](const GemmArgs<float> &args) { return (args._Ksize <= 24) && !args._trA && args._alpha==1.0f && args._pretransposed_hint; },
+    nullptr,
+    [](const GemmArgs<float> &args) { return new GemmHybrid<smallK_hybrid_fp32_mla_1VLx4, float, float>(args); }
+},
+{
+    GemmMethod::GEMM_HYBRID,
+    "hybrid_fp32_mla_4VLx4",
+    [](const GemmArgs<float> &args) { return (args._Ksize >= 4) && (args._alpha == 1.0f) && !args._trA && args._pretransposed_hint; },
+    [](const GemmArgs<float> &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
+    [](const GemmArgs<float> &args) { return new GemmHybrid<hybrid_fp32_mla_4VLx4, float, float>(args); }
+},
+{
+    GemmMethod::GEMM_NATIVE,
+    "smallK_fp32_mla_1VLx4",
+    [](const GemmArgs<float> &args) { return (args._Ksize <= 24) && !args._trA && !args._trB && args._alpha==1.0f; },
+    nullptr,
+    [](const GemmArgs<float> &args) { return new GemmNative<smallK_fp32_mla_1VLx4, float, float>(args); }
+},
+{
+    GemmMethod::GEMM_NATIVE,
+    "native_fp32_mla_4VLx4",
+    [](const GemmArgs<float> &args) { return (args._Ksize>4 && args._alpha==1.0f && !args._trA && !args._trB); },
+    [](const GemmArgs<float> &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
+    [](const GemmArgs<float> &args) { return new GemmNative<native_fp32_mla_4VLx4, float, float>(args); }
+},
+#endif // __ARM_FEATURE_SVE
 
-    UniqueGemmCommon<float, float> instantiate(const GemmArgs<float> &args) override {
-        return UniqueGemmCommon<float, float> (new GemvPretransposed<sgemv_pretransposed, float, float>(args._ci, args._Nsize, args._Ksize, args._nmulti, args._trB, args._beta));
-    }
+// NEON native / hybrid methods
+{
+    GemmMethod::GEMM_HYBRID,
+    "sgemm_nativeA_pretransposeB_16x4",
+    [](const GemmArgs<float> &args) { return (args._Ksize >= 4) && (args._alpha == 1.0f) && !args._trA && args._pretransposed_hint; },
+    [](const GemmArgs<float> &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
+    [](const GemmArgs<float> &args) { return new GemmHybrid<sgemm_nativeA_pretransposeB_16x4, float, float>(args); }
+},
+{
+    GemmMethod::GEMM_NATIVE,
+    "sgemm_native_16x4",
+    [](const GemmArgs<float> &args) { return (args._Ksize>4 && (args._Nsize % 16)==0 && args._alpha==1.0f && !args._trA && !args._trB); },
+    [](const GemmArgs<float> &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
+    [](const GemmArgs<float> &args) { return new GemmNative<sgemm_native_16x4, float, float>(args); }
+},
 
-    GemmImpl_sgemm_gemv_pretransposed() : GemmImplementation<float, float>(GemmMethod::GEMV_PRETRANSPOSED) { }
-};
-
-// Native GEMV
-class GemmImpl_sgemm_gemv_native_transposed : public GemmImplementation<float, float> {
-public:
-    bool is_supported(const GemmArgs<float> &args) override {
-        return (args._Msize==1 && args._alpha==1.0f && !args._trA && !args._trB && args._nbatches==1);
-    }
-
-    UniqueGemmCommon<float, float> instantiate(const GemmArgs<float> &args) override {
-        return UniqueGemmCommon<float, float> (new GemvNativeTransposed<sgemv_trans, float, float>(args._ci, args._Nsize, args._Ksize, args._nmulti, args._beta));
-    }
-
-    GemmImpl_sgemm_gemv_native_transposed() : GemmImplementation<float, float>(GemmMethod::GEMV_NATIVE_TRANSPOSED) { }
-};
-
-// Native GEMM
-class GemmImpl_sgemm_gemm_native : public GemmImplementation<float, float> {
-public:
-    bool is_supported(const GemmArgs<float> &args) override {
-        return (args._Ksize>4 && (args._Nsize % 16)==0 && args._alpha==1.0f && !args._trA && !args._trB);
-    }
-
-    bool is_recommended(const GemmArgs<float> &args) override {
-        return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8));
-    }
-
-    UniqueGemmCommon<float, float> instantiate(const GemmArgs<float> &args) override {
-        return UniqueGemmCommon<float, float> (new GemmNative<sgemm_native_16x4, float, float>(args._ci, args._Msize, args._Nsize, args._Ksize, args._nbatches, args._nmulti, args._beta));
-    }
-
-    GemmImpl_sgemm_gemm_native() : GemmImplementation<float, float>(GemmMethod::GEMM_NATIVE) { }
-};
+#ifdef __ARM_FEATURE_SVE
+{
+    GemmMethod::GEMM_INTERLEAVED,
+    "interleaved_fp32_mla_3VLx8",
+    [](const GemmArgs<float> &args) { return (args._Ksize>4); },
+    nullptr,
+    [](const GemmArgs<float> &args) { return new GemmInterleaved<interleaved_fp32_mla_3VLx8, float, float>(args); }
+},
+#endif // __ARM_FEATURE_SVE
+{
+    GemmMethod::GEMM_INTERLEAVED,
+    "sgemm_12x8",
+    nullptr,
+    nullptr,
+    [](const GemmArgs<float> &args) { return new GemmInterleaved<sgemm_12x8, float, float>(args); }
+},
 #endif // __aarch64__
 
-// Interleaved GEMM
-class GemmImpl_sgemm_gemm_interleaved : public GemmImplementation<float, float> {
-public:
-    UniqueGemmCommon<float, float> instantiate(const GemmArgs<float> &args) override {
-#ifdef __ARM_FEATURE_SVE
-        return UniqueGemmCommon<float, float> (new GemmInterleaved<interleaved_fp32_mla_3VLx8, float, float>(args));
-#elif defined(__aarch64__)
-        return UniqueGemmCommon<float, float> (new GemmInterleaved<sgemm_12x8, float, float>(args));
-#elif defined(__arm__)
-        return UniqueGemmCommon<float, float> (new GemmInterleaved<sgemm_8x6, float, float>(args));
-#else
-# error Unknown Architecture.
-#endif
-    }
-
-    GemmImpl_sgemm_gemm_interleaved() : GemmImplementation<float, float>(GemmMethod::GEMM_INTERLEAVED) { }
-};
-
-static GemmImpl_gemv_batched<float, float> gemv_batched_impl{};
-#if defined(__aarch64__) && !defined(__ARM_FEATURE_SVE)
-static GemmImpl_sgemm_gemv_pretransposed sgemm_gemv_pretransposed_impl{};
-static GemmImpl_sgemm_gemv_native_transposed sgemm_gemv_native_transposed_impl{};
-static GemmImpl_sgemm_gemm_native sgemm_gemm_native_impl{};
-#endif
-static GemmImpl_sgemm_gemm_interleaved sgemm_gemm_interleaved_impl{};
-
-/* List of implementations (order matters) */
-static std::vector<GemmImplementation<float, float> *> SGemmMethods = {
-    &gemv_batched_impl,
-#if defined(__aarch64__) && !defined(__ARM_FEATURE_SVE)
-    &sgemm_gemv_pretransposed_impl,
-    &sgemm_gemv_native_transposed_impl,
-    &sgemm_gemm_native_impl,
-#endif
-    &sgemm_gemm_interleaved_impl
+#ifdef __arm__
+{
+    GemmMethod::GEMM_INTERLEAVED,
+    "sgemm_8x6",
+    nullptr,
+    nullptr,
+    [](const GemmArgs<float> &args) { return new GemmInterleaved<sgemm_8x6, float, float>(args); }
+},
+#endif // __arm__
+{
+    GemmMethod::DEFAULT,
+    "",
+    nullptr,
+    nullptr,
+    nullptr
+}
 };
 
 /* Templated function to return this list. */
 template<>
-std::vector<GemmImplementation<float, float> *> &gemm_implementation_list<float, float>() {
-    return SGemmMethods;
+const GemmImplementation<float, float> *gemm_implementation_list<float, float>() {
+    return gemm_fp32_methods;
 }
 
 /* Explicitly instantiate the external functions for these types. */
-template UniqueGemmCommon<float, float> gemm<float, float>(GemmArgs<float> &args, GemmConfig *cfg);
-template GemmMethod get_gemm_method<float, float>(GemmArgs<float> &args);
-template bool method_is_compatible<float, float>(GemmMethod method, GemmArgs<float> &args);
+template UniqueGemmCommon<float, float> gemm<float, float>(const GemmArgs<float> &args);
+template KernelDescription get_gemm_method<float, float>(const GemmArgs<float> &args);
+template bool method_is_compatible<float, float>(GemmMethod method, const GemmArgs<float> &args);
+template std::vector<std::string> get_compatible_kernels<float, float> (const GemmArgs<float> &args);
 
-} // namespace arm_gemm
+} // namespace arm_gemm
\ No newline at end of file
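
Each entry in gemm_fp32_methods pairs a hard constraint (is_supported) with a soft heuristic (is_recommended); for example sgemm_native_16x4 requires Ksize>4, Nsize a multiple of 16, alpha==1 and untransposed operands, and is only recommended for small K/N or heavily multi-threaded small-M cases. A standalone sketch evaluating those two predicates for a hypothetical problem size (the struct here is a stand-in mirroring the GemmArgs fields used, not the real type):

    #include <cstdio>

    // Stand-in for the arm_gemm::GemmArgs fields used by the sgemm_native_16x4 entry.
    struct Args {
        unsigned int Msize, Nsize, Ksize, nmulti, maxthreads;
        bool trA, trB;
        float alpha;
    };

    static bool native_supported(const Args &a)
    {
        return a.Ksize > 4 && (a.Nsize % 16) == 0 && a.alpha == 1.0f && !a.trA && !a.trB;
    }

    static bool native_recommended(const Args &a)
    {
        return (a.Ksize <= 128 && a.Nsize <= 128) ||
               (a.nmulti > 1 && (a.Msize / a.maxthreads) < 8);
    }

    int main()
    {
        const Args a{ /*M*/ 64, /*N*/ 128, /*K*/ 64, /*nmulti*/ 1, /*maxthreads*/ 4,
                      /*trA*/ false, /*trB*/ false, /*alpha*/ 1.0f };
        std::printf("supported=%d recommended=%d\n", native_supported(a), native_recommended(a));
        return 0;
    }

For M=64, N=128, K=64 the native kernel is both supported and recommended; growing K or N past 128 keeps it supported but drops the recommendation, so (absent a pretranspose hint) selection would typically fall through to the sgemm_12x8 interleaved entry further down the list.
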
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
new file mode 100644
index 0000000..c2bd0bb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <assert.h>
+
+#include <algorithm>
+
+#include "arm_gemm.hpp"
+#include "ndrange.hpp"
+#include "utils.hpp"
+
+#include "mergeresults.hpp"
+#include "transform.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+namespace arm_gemm {
+
+// Implementation of the GemmCommon abstract class.
+template<typename strategy, typename To, typename Tr>
+class GemmHybrid : public GemmCommon<To, Tr> {
+    typedef typename strategy::operand_type Toi;
+    typedef typename strategy::result_type Tri;
+
+    /* const properties set by constructor */
+    const CPUInfo * const _ci;
+
+    const unsigned int _Msize;
+    const unsigned int _Nsize;
+    const unsigned int _Ksize;
+
+    const unsigned int _nbatches;
+    const unsigned int _nmulti;
+
+    const bool _trB;
+
+    const Tr _beta;
+
+    /* Blocking info */
+    const unsigned int _k_block;
+    const unsigned int _n_block;
+    const unsigned int _Mround;
+
+    /* Pretransposed buffer. */
+    const Toi *_B_transposed=nullptr;
+
+    const NDRange<4> _window_range;
+
+    static unsigned int compute_k_block(const GemmArgs<Tr> &args) {
+        if (args._cfg && args._cfg->inner_block_size) {
+            return args._cfg->inner_block_size;
+        }
+
+        const unsigned int L1_size = args._ci->get_L1_cache_size();
+
+        // k_block: Find out how much of the larger array can be loaded into half the cache.
+        // This should account for associative caches.
+        unsigned int k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
+
+        // Needs to be (at least a single) multiple of the K unroll level.
+        k_block /= strategy::k_unroll();
+        k_block = std::max(k_block, 1U) * strategy::k_unroll();
+
+        // Now tune to presented problem size; this is how many blocks we need.
+        unsigned int numk_blocks = iceildiv(args._Ksize, k_block);
+
+        // So divide the space equally into that many blocks.
+        k_block = iceildiv(args._Ksize, numk_blocks);
+
+        // And round UP to the K unroll level required.
+        k_block = roundup(k_block, strategy::k_unroll());
+
+        return k_block;
+    }
+
+    static unsigned int compute_n_block(const GemmArgs<Tr> &args) {
+        if (args._cfg && args._cfg->outer_block_size) {
+            return args._cfg->outer_block_size;
+        }
+
+        const unsigned int k_block = compute_k_block(args);
+        const unsigned int L2_size = args._ci->get_L2_cache_size();
+
+        // n_block: Work out how many rows (of length k_block) will fit in the L2
+        // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
+        unsigned int n_block = (((L2_size * 9) / 10) - (k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
+                               (sizeof(Toi) * k_block);
+
+        // Needs to be (at least a single) multiple of the kernel output width.
+        n_block /= strategy::out_width();
+        n_block = std::max(n_block, 1U) * strategy::out_width();
+
+        // And tune to the presented problem size.
+        unsigned int numblocks = iceildiv(args._Nsize, n_block);
+        n_block = iceildiv(args._Nsize, numblocks);
+        n_block = roundup(n_block, strategy::out_width());
+
+        return n_block;
+    }
+
+public:
+    GemmHybrid(GemmHybrid &) = delete;
+    GemmHybrid & operator= (GemmHybrid &) = delete;
+
+    /* Constructor */
+    GemmHybrid(const GemmArgs<Tr> &args)
+            : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
+              _nbatches(args._nbatches), _nmulti(args._nmulti), _trB(args._trB), _beta(args._beta),
+              _k_block(compute_k_block(args)), _n_block(compute_n_block(args)),
+              _Mround(roundup(args._Msize, strategy::out_height())),
+              _window_range(iceildiv(args._Msize, strategy::out_height()), _nbatches, iceildiv(_Nsize, _n_block), _nmulti) { }
+
+    // Interface implementation - Compulsory functions
+    unsigned int get_window_size() const override {
+        return _window_range.total_size();
+    }
+
+    // This kernel can always be dynamically scheduled.
+    bool supports_dynamic_scheduling() const override {
+        return true;
+    }
+
+    // Execute
+    void execute(unsigned int start, unsigned int end, int threadid) override {
+#ifdef CYCLE_PROFILING
+        profiler prof;
+#endif
+        strategy strat(_ci);
+
+        /* Make sure we've been set up correctly. */
+        assert(_B_transposed);
+        static_assert(std::is_same<To, Toi>::value, "gemm_native: Operand types must be the same.");
+        static_assert(std::is_same<Tr, Tri>::value, "gemm_native: Result types must be the same.");
+
+        /* For now, each work item implies all the K for a given output
+         * pixel (so we don't need to synchronize access to the output
+         * array).  So separate the loop over K blocks here.  */
+        for (unsigned int k0=0; k0<_Ksize; k0+=_k_block) {
+            unsigned int kmax   = std::min(k0 + _k_block, _Ksize);
+            unsigned int kern_k = roundup(kmax-k0, strategy::k_unroll());
+
+            auto p = _window_range.iterator(start, end);
+
+            if (p.done()) {
+                return;
+            }
+
+            do {
+                const unsigned int m_start = p.dim(0) * strategy::out_height();
+                const unsigned int m_end   = std::min(p.dim0_max() * strategy::out_height(), _Msize);
+                const unsigned int batch   = p.dim(1);
+                const unsigned int n0      = p.dim(2) * _n_block;
+                const unsigned int nmax    = std::min(n0 + _n_block, _Nsize);
+                const unsigned int multi   = p.dim(3);
+
+                const Toi *b_panel = _B_transposed +
+                                     (multi * roundup(_Nsize, strategy::out_width()) * roundup(_Ksize, strategy::k_unroll())) +
+                                     (k0 * roundup(_Nsize, strategy::out_width())) +
+                                     (n0 * kern_k);
+
+#ifdef CYCLE_PROFILING
+                auto p = prof.ScopedProfiler(PROFILE_KERNEL, (m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width()));
+#endif
+
+                strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (m_start * this->_lda) + k0, this->_lda,
+                             b_panel,
+                             this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc,
+                             (k0 == 0) ? _beta : static_cast<Tr>(1),
+                             (m_end - m_start), (nmax - n0), kern_k);
+            } while (p.next_dim1());
+        }
+    }
+
+    // Interface implementation - pretransposed
+    bool B_is_pretransposed() const override {
+        return true;
+    }
+
+    bool B_pretranspose_required() const override {
+        return (_B_transposed==nullptr);
+    }
+
+    size_t get_B_pretransposed_array_size() const override {
+        return roundup(_Nsize, strategy::out_width()) * roundup(_Ksize, strategy::k_unroll()) * _nmulti * sizeof(Toi);
+    }
+
+    using GemmCommon<To, Tr>::pretranspose_B_array;
+    void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
+        Toi *buffer = reinterpret_cast<Toi *>(in_buffer);
+        _B_transposed = buffer;
+        strategy strat(_ci);
+
+        for (unsigned int multi=0; multi<_nmulti; multi++) {
+            for (unsigned int k0=0; k0<_Ksize; k0+=_k_block) {
+                const unsigned int kmax = std::min(k0 + _k_block, _Ksize);
+                const unsigned int k_size = roundup(kmax-k0, strategy::k_unroll());
+
+                for (unsigned int x0=0; x0<_Nsize; x0+=_n_block) {
+                    const unsigned int xmax = std::min(x0+_n_block, _Nsize);
+
+                    const unsigned int size = roundup(xmax-x0, strategy::out_width()) * k_size;
+
+                    strat.transforms.PrepareB( buffer, B + (multi * B_multi_stride), ldb,
+                                               x0, xmax, k0, kmax, _trB);
+
+                    buffer += size;
+                }
+            }
+        }
+    }
+
+    void set_pretransposed_B_data(void *in_buffer) override {
+        _B_transposed = reinterpret_cast<Toi *>(in_buffer);
+    }
+};
+
+} // namespace arm_gemm
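
compute_k_block above targets half the L1 for the larger operand and then rounds to the kernel's K unroll, while compute_n_block budgets roughly 90% of the L2 minus the L1-resident panels. A self-contained sketch of the same arithmetic with illustrative numbers (out_width=16, out_height=4, k_unroll=1, 32 KB L1, 512 KB L2, FP32 operands; these constants are examples, not the parameters of any particular kernel):

    #include <algorithm>
    #include <cstdio>

    static unsigned int iceildiv(unsigned int a, unsigned int b) { return (a + b - 1) / b; }
    static unsigned int roundup(unsigned int a, unsigned int b)  { return iceildiv(a, b) * b; }

    int main()
    {
        // Illustrative strategy/cache parameters (not taken from a real kernel).
        const unsigned int out_width = 16, out_height = 4, k_unroll = 1;
        const unsigned int L1 = 32 * 1024, L2 = 512 * 1024, elem = sizeof(float);
        const unsigned int Ksize = 1000, Nsize = 3000;

        // k_block: fit the larger kernel dimension into half the L1, then balance over K.
        unsigned int k_block = (L1 / 2) / (elem * std::max(out_width, out_height));
        k_block = std::max(k_block / k_unroll, 1u) * k_unroll;
        k_block = roundup(iceildiv(Ksize, iceildiv(Ksize, k_block)), k_unroll);

        // n_block: ~90% of L2, minus the k_block-deep panels already counted against L1.
        unsigned int n_block = ((L2 * 9) / 10 - k_block * elem * (out_width + out_height)) / (elem * k_block);
        n_block = std::max(n_block / out_width, 1u) * out_width;
        n_block = roundup(iceildiv(Nsize, iceildiv(Nsize, n_block)), out_width);

        std::printf("k_block=%u n_block=%u\n", k_block, n_block);
        return 0;
    }

With these example numbers the split comes out as k_block=250 and n_block=432, i.e. K is cut into four roughly equal cache-sized slices and N into seven blocks of at most 432 columns.
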
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
index 6734e3c..bf80784 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,56 +22,53 @@
  * SOFTWARE.
  */
 
-#include "gemv_batched.hpp"
+#include <arm_gemm.hpp>
+
+#include <functional>
 
 namespace arm_gemm {
 
 template<typename Top, typename Tret>
-class GemmImplementation {
-public:
-    /* Is this implementation compatible with the args as provided? */
-    virtual bool is_supported(const GemmArgs<Tret> &args)   { return true; }
-    /* Is this implementation "recommended" for these args (heuristic)? */
-    virtual bool is_recommended(const GemmArgs<Tret> &args) { return true; }
-    /* Instantiate this method please. */
-    virtual UniqueGemmCommon<Top, Tret> instantiate(const GemmArgs<Tret> &args) = 0;
-
-    /* Indicate the "GemmMethod" for use as a selector */
-    const GemmMethod method;
-
-    virtual ~GemmImplementation() { }
-
-    GemmImplementation(GemmMethod method) : method(method) { }
-};
-
-/* "gemv_batched" implementation is type-agnostic, so template it here. */
-template<typename Top, typename Tret>
-class GemmImpl_gemv_batched : public GemmImplementation<Top, Tret> {
-public:
-    bool is_supported(const GemmArgs<Tret> &args) override {
-        return (args._Msize==1 && args._nbatches > 1);
-    }
-
-    UniqueGemmCommon<Top, Tret> instantiate(const GemmArgs<Tret> &args) override {
-        return UniqueGemmCommon<Top, Tret> (new GemvBatched<Top, Tret>(args));
-    }
-
-    GemmImpl_gemv_batched() : GemmImplementation<Top, Tret>(GemmMethod::GEMV_BATCHED) { }
+struct GemmImplementation {
+    const GemmMethod                                               method;
+    const char *                                                   name;
+    std::function<bool(const GemmArgs<Tret> &)>                    is_supported;
+    std::function<bool(const GemmArgs<Tret> &)>                    is_recommended;
+    std::function<GemmCommon<Top, Tret> *(const GemmArgs<Tret> &)> instantiate;
 };
 
 /* "Master" function implemented for each valid combination of types.
  * Returns a list of GEMM implementation descriptors for processing by the
- * other functions.  */
+ * other functions, terminated by an implementation with
+ * method==GemmMethod::DEFAULT.  */
 template<typename Top, typename Tret>
-std::vector<GemmImplementation<Top, Tret> *> &gemm_implementation_list();
+const GemmImplementation<Top, Tret> *gemm_implementation_list();
 
+/*
+ * Select a GEMM implementation for the given arguments.
+ *
+ * The logic here returns the first method on the list which supports the
+ * requested problem parameters, matches the provided filters (method and/or
+ * name string match) and recommends itself.
+ *
+ * If there is no such method, it will return the first method which
+ * supports the requested parameters and passes the filters, regardless of
+ * recommendation.
+ *
+ * If no method supports the requested parameters and passes the filters,
+ * this function returns false and doesn't touch the provided pointer
+ * reference.
+ */
 template<typename Top, typename Tret>
-GemmImplementation<Top, Tret> *find_implementation(GemmArgs<Tret> &args, GemmConfig *cfg) {
+bool find_implementation(const GemmArgs<Tret> &args, const GemmImplementation<Top, Tret> * &impl) {
     auto gemms = gemm_implementation_list<Top, Tret>();
+    const GemmConfig *cfg = args._cfg;
 
-    for(auto &&i : gemms) {
+    const GemmImplementation<Top, Tret> *saved_impl = nullptr;
+
+    for (auto i = gemms; i->method != GemmMethod::DEFAULT; i++) {
         /* Skip if this implementation doesn't support these args. */
-        if (!i->is_supported(args)) {
+        if (i->is_supported != nullptr && !i->is_supported(args)) {
             continue;
         }
 
@@ -80,52 +77,92 @@
             continue;
         }
 
-        /* If no specific method is requested, check that this method recommends itself. */
-        if ((!cfg || cfg->method == GemmMethod::DEFAULT) && !i->is_recommended(args)) {
+        /* Skip if a filter is to be applied and it doesn't match. */
+        if (cfg && cfg->filter != "" && !strstr(i->name, cfg->filter.c_str())) {
             continue;
         }
 
-        return i;
+        /* At this point, if we don't have a saved implementation, save this
+         * one.  This is so that we always return something if a filter
+         * matches, even if it doesn't recommend itself.
+         */
+        if (saved_impl == nullptr) {
+            saved_impl=i;
+        }
+
+        /* Check that this method recommends itself. */
+        if (i->is_recommended != nullptr && !i->is_recommended(args)) {
+            continue;
+        }
+
+        impl=i;
+
+        return true;
     }
 
-    return nullptr;
-}
-
-template<typename Top, typename Tret>
-UniqueGemmCommon<Top, Tret> gemm(GemmArgs<Tret> &args, GemmConfig *cfg) {
-    auto impl = find_implementation<Top, Tret>(args, cfg);
-
-    if (impl) {
-        return impl->instantiate(args);
-    }
-
-    return UniqueGemmCommon<Top, Tret>(nullptr);
-}
-
-template<typename Top, typename Tret>
-GemmMethod get_gemm_method(GemmArgs<Tret> &args) {
-    auto impl = find_implementation<Top, Tret>(args, nullptr);
-
-    if (impl) {
-        return impl->method;
-    }
-
-    /* This shouldn't happen - there should always be at least one valid implementation. */
-    return GemmMethod::DEFAULT;
-}
-
-template<typename Top, typename Tret>
-bool method_is_compatible(GemmMethod method, GemmArgs<Tret> &args) {
-    /* Determine if the method is valid by attempting to obtain an implementation specifying this method. */
-    GemmConfig cfg(method);
-
-    auto impl = find_implementation<Top, Tret>(args, &cfg);
-
-    if (impl) {
+    /* We didn't find an option matching the filters that recommended
+     * itself.  But if we found something earlier that matched the filters
+     * but wasn't recommended, return it here.  */
+    if (saved_impl != nullptr) {
+        impl = saved_impl;
         return true;
     }
 
     return false;
 }
 
-} // namespace arm_gemm
+template<typename Top, typename Tret>
+std::vector<std::string> get_compatible_kernels(const GemmArgs<Tret> &args) {
+    std::vector<std::string> res;
+
+    auto gemms = gemm_implementation_list<Top, Tret>();
+
+    for (auto i = gemms; i->method != GemmMethod::DEFAULT; i++) {
+        /* Check that this implementation supports the presented problem. */
+        if (i->is_supported != nullptr && !i->is_supported(args)) {
+            continue;
+        }
+
+        res.push_back(i->name);
+    }
+
+    return res;
+}
+
+template<typename Top, typename Tret>
+UniqueGemmCommon<Top, Tret> gemm(const GemmArgs<Tret> &args) {
+    const GemmImplementation<Top, Tret> *impl;
+
+    if (find_implementation<Top, Tret>(args, impl)) {
+        return UniqueGemmCommon<Top, Tret>(impl->instantiate(args));
+    }
+
+    return UniqueGemmCommon<Top, Tret>(nullptr);
+}
+
+template<typename Top, typename Tret>
+KernelDescription get_gemm_method(const GemmArgs<Tret> &args) {
+    const GemmImplementation<Top, Tret> *impl;
+
+    if (find_implementation<Top, Tret>(args, impl)) {
+        return KernelDescription(impl->method, impl->name);
+    }
+
+    /* This shouldn't happen - there should always be at least one valid implementation. */
+    return KernelDescription();
+}
+
+template<typename Top, typename Tret>
+bool method_is_compatible(GemmMethod method, const GemmArgs<Tret> &args) {
+    /* Determine if the method is valid by attempting to obtain an implementation specifying this method. */
+    GemmConfig       cfg(method);
+    GemmArgs<Tret>   myargs = args;
+
+    myargs._cfg = &cfg;
+
+    const GemmImplementation<Top, Tret> *impl;
+
+    return find_implementation<Top, Tret>(myargs, impl);
+}
+
+} // namespace arm_gemm
\ No newline at end of file
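
The rewritten find_implementation walks the descriptor array in order, skipping entries whose is_supported predicate fails or whose name misses the optional filter, remembering the first viable entry as a fallback, and returning early on the first one that also recommends itself. A compact standalone model of that policy (simplified descriptors, no templates; names and predicates here are invented):

    #include <cstdio>
    #include <cstring>

    // Simplified stand-in for arm_gemm's GemmImplementation descriptor (the real entries
    // carry a GemmMethod, a name, and std::function predicates over GemmArgs).
    struct Impl {
        const char *name;            // "" marks the list terminator, like GemmMethod::DEFAULT
        bool (*supported)(int K);    // nullptr: always supported
        bool (*recommended)(int K);  // nullptr: always recommended
    };

    static bool small_k(int K) { return K <= 32; }
    static bool big_k(int K)   { return K > 4; }

    static const Impl methods[] = {
        { "smallk_kernel",  small_k, nullptr },
        { "generic_kernel", big_k,   small_k },
        { "fallback",       nullptr, nullptr },
        { "",               nullptr, nullptr },
    };

    static const Impl *find_impl(int K, const char *filter)
    {
        const Impl *saved = nullptr;
        for (const Impl *i = methods; i->name[0] != '\0'; i++) {
            if (i->supported && !i->supported(K)) continue;          // hard constraint
            if (filter && !std::strstr(i->name, filter)) continue;   // optional name filter
            if (saved == nullptr) saved = i;                         // remember first viable entry
            if (i->recommended && !i->recommended(K)) continue;      // soft heuristic
            return i;                                                // supported and recommended
        }
        return saved;  // viable-but-not-recommended fallback, or nullptr if nothing matched
    }

    int main()
    {
        std::printf("%s\n", find_impl(16,  nullptr)->name);    // smallk_kernel
        std::printf("%s\n", find_impl(200, nullptr)->name);    // fallback
        std::printf("%s\n", find_impl(200, "generic")->name);  // generic_kernel (saved-entry path)
        return 0;
    }

get_compatible_kernels is the same walk without the recommendation or filter steps, collecting the name of every entry whose is_supported check passes.
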
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
index ad171a7..b4503dd 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,30 +32,33 @@
 
 namespace arm_gemm {
 
-class GemmImpl_gemm_s16_interleaved : public GemmImplementation<int16_t, int32_t> {
-public:
-    UniqueGemmCommon<int16_t, int32_t> instantiate(const GemmArgs<int32_t> &args) override {
-        return UniqueGemmCommon<int16_t, int32_t>(new GemmInterleaved<gemm_s16_12x8, int16_t, int32_t>(args));
-    }
-
-    GemmImpl_gemm_s16_interleaved() : GemmImplementation<int16_t, int32_t>(GemmMethod::GEMM_INTERLEAVED) { }
-};
-
-static GemmImpl_gemm_s16_interleaved gemm_s16_interleaved_impl{};
-
-static std::vector<GemmImplementation<int16_t, int32_t> *> gemm_s16_methods = {
-    &gemm_s16_interleaved_impl
+static const GemmImplementation<int16_t, int32_t> gemm_s16_methods[] = {
+{
+    GemmMethod::GEMM_INTERLEAVED,
+    "gemm_s16_12x8",
+    nullptr,
+    nullptr,
+    [](const GemmArgs<int32_t> &args) { return new GemmInterleaved<gemm_s16_12x8, int16_t, int32_t>(args); }
+},
+{
+    GemmMethod::DEFAULT,
+    "",
+    nullptr,
+    nullptr,
+    nullptr
+}
 };
 
 template<>
-std::vector<GemmImplementation<int16_t, int32_t> *> &gemm_implementation_list<int16_t, int32_t>() {
+const GemmImplementation<int16_t, int32_t> *gemm_implementation_list<int16_t, int32_t>() {
     return gemm_s16_methods;
 }
 
 /* Explicitly instantiate the external functions for these types. */
-template UniqueGemmCommon<int16_t, int32_t> gemm<int16_t, int32_t>(GemmArgs<int32_t> &args, GemmConfig *cfg);
-template GemmMethod get_gemm_method<int16_t, int32_t>(GemmArgs<int32_t> &args);
-template bool method_is_compatible<int16_t, int32_t>(GemmMethod method, GemmArgs<int32_t> &args);
+template UniqueGemmCommon<int16_t, int32_t> gemm<int16_t, int32_t>(const GemmArgs<int32_t> &args);
+template KernelDescription get_gemm_method<int16_t, int32_t>(const GemmArgs<int32_t> &args);
+template bool method_is_compatible<int16_t, int32_t>(GemmMethod method, const GemmArgs<int32_t> &args);
+template std::vector<std::string> get_compatible_kernels<int16_t, int32_t> (const GemmArgs<int32_t> &args);
 
 } // namespace arm_gemm
 
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
index 627d8ab..5811c2a 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,69 +25,78 @@
 
 #include "arm_gemm.hpp"
 #include "gemm_common.hpp"
+#include "gemm_hybrid.hpp"
 #include "gemm_implementation.hpp"
 #include "gemm_interleaved.hpp"
+#include "gemm_native.hpp"
 
 #include "kernels/a64_gemm_s16_12x8.hpp"
 #include "kernels/a64_gemm_s8_12x8.hpp"
 #include "kernels/a64_gemm_s8_4x4.hpp"
+#include "kernels/a64_hybrid_s8s32_dot_16x4.hpp"
 #include "kernels/sve_interleaved_s8s32_dot_3VLx8.hpp"
+#include "kernels/sve_native_s8s32_dot_4VLx4.hpp"
 
 namespace arm_gemm {
 
+static const GemmImplementation<int8_t, int32_t> gemm_s8_methods[] = {
 #ifdef __ARM_FEATURE_SVE
-class GemmImpl_gemm_s8_interleaved_dot : public GemmImplementation<int8_t, int32_t> {
-public:
-    UniqueGemmCommon<int8_t, int32_t> instantiate(const GemmArgs<int32_t> &args) override {
-        return UniqueGemmCommon<int8_t, int32_t>(new GemmInterleaved<interleaved_s8s32_dot_3VLx8, int8_t, int32_t>(args));
-    }
-
-    GemmImpl_gemm_s8_interleaved_dot() : GemmImplementation<int8_t, int32_t>(GemmMethod::GEMM_INTERLEAVED_DOT) { }
-};
-#else
-
-class GemmImpl_gemm_s8_interleaved_dot : public GemmImplementation<int8_t, int32_t> {
-public:
-    bool is_supported(const GemmArgs<int32_t> &args) override {
-        return args._ci->has_dotprod();
-    }
-
-    UniqueGemmCommon<int8_t, int32_t> instantiate(const GemmArgs<int32_t> &args) override {
-        return UniqueGemmCommon<int8_t, int32_t>(new GemmInterleaved<gemm_s8_12x8, int8_t, int32_t>(args));
-    }
-
-    GemmImpl_gemm_s8_interleaved_dot() : GemmImplementation<int8_t, int32_t>(GemmMethod::GEMM_INTERLEAVED_DOT) { }
-};
-
+{
+    GemmMethod::GEMM_NATIVE,
+    "native_s8s32_dot_4VLx4",
+    [](const GemmArgs<int32_t> &args) { return (args._Ksize>=16 && args._alpha==1 && !args._trA && !args._trB); },
+    [](const GemmArgs<int32_t> &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)); },
+    [](const GemmArgs<int32_t> &args) { return new GemmNative<native_s8s32_dot_4VLx4, int8_t, int32_t>(args); }
+},
+{
+    GemmMethod::GEMM_INTERLEAVED,
+    "interleaved_s8s32_dot_3VLx8",
+    [](const GemmArgs<int32_t> &args) { return (args._Ksize>4); },
+    nullptr,
+    [](const GemmArgs<int32_t> &args) { return new GemmInterleaved<interleaved_s8s32_dot_3VLx8, int8_t, int32_t>(args); }
+},
 #endif
-
-class GemmImpl_gemm_s8_interleaved : public GemmImplementation<int8_t, int32_t> {
-public:
-    UniqueGemmCommon<int8_t, int32_t> instantiate(const GemmArgs<int32_t> &args) override {
-        return UniqueGemmCommon<int8_t, int32_t>(new GemmInterleaved<gemm_s8_4x4, int8_t, int32_t>(args));
-    }
-
-    GemmImpl_gemm_s8_interleaved() : GemmImplementation<int8_t, int32_t>(GemmMethod::GEMM_INTERLEAVED) { }
-};
-
-static GemmImpl_gemm_s8_interleaved_dot gemm_s8_interleaved_dot_impl{};
-static GemmImpl_gemm_s8_interleaved gemm_s8_interleaved_impl{};
-
-static std::vector<GemmImplementation<int8_t, int32_t> *> gemm_s8_methods = {
-    &gemm_s8_interleaved_dot_impl,
-    &gemm_s8_interleaved_impl
+{
+    GemmMethod::GEMM_HYBRID,
+    "hybrid_s8s32_dot_16x4",
+    [](const GemmArgs<int32_t> &args) { return args._ci->has_dotprod() && args._Ksize>=16 && (args._Ksize % 16 == 0) && (args._Nsize % 16 == 0) && !args._trA && !args._trB && args._pretransposed_hint; },
+    [](const GemmArgs<int32_t> &args) { return args._Nsize<=256 && args._Ksize>128; },
+    [](const GemmArgs<int32_t> &args) { return new GemmHybrid<hybrid_s8s32_dot_16x4, int8_t, int32_t>(args); }
+},
+{
+    GemmMethod::GEMM_INTERLEAVED,
+    "gemm_s8_12x8",
+    [](const GemmArgs<int32_t> &args) { return args._ci->has_dotprod(); },
+    nullptr,
+    [](const GemmArgs<int32_t> &args) { return new GemmInterleaved<gemm_s8_12x8, int8_t, int32_t>(args); }
+},
+{
+    GemmMethod::GEMM_INTERLEAVED,
+    "gemm_s8_4x4",
+    nullptr,
+    nullptr,
+    [](const GemmArgs<int32_t> &args) { return new GemmInterleaved<gemm_s8_4x4, int8_t, int32_t>(args); }
+},
+{
+    GemmMethod::DEFAULT,
+    "",
+    nullptr,
+    nullptr,
+    nullptr
+}
 };
 
 template<>
-std::vector<GemmImplementation<int8_t, int32_t> *> &gemm_implementation_list<int8_t, int32_t>() {
+const GemmImplementation<int8_t, int32_t> *gemm_implementation_list<int8_t, int32_t>() {
     return gemm_s8_methods;
 }
 
 /* Explicitly instantiate the external functions for these types. */
-template UniqueGemmCommon<int8_t, int32_t> gemm<int8_t, int32_t>(GemmArgs<int32_t> &args, GemmConfig *cfg);
-template GemmMethod get_gemm_method<int8_t, int32_t>(GemmArgs<int32_t> &args);
-template bool method_is_compatible<int8_t, int32_t>(GemmMethod method, GemmArgs<int32_t> &args);
+template UniqueGemmCommon<int8_t, int32_t> gemm<int8_t, int32_t>(const GemmArgs<int32_t> &args);
+template KernelDescription get_gemm_method<int8_t, int32_t>(const GemmArgs<int32_t> &args);
+template bool method_is_compatible<int8_t, int32_t>(GemmMethod method, const GemmArgs<int32_t> &args);
+template std::vector<std::string> get_compatible_kernels<int8_t, int32_t> (const GemmArgs<int32_t> &args);
 
 } // namespace arm_gemm
 
-#endif // __aarch64__
+#endif // __aarch64__
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
index 0e58a4d..b83ccd3 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -318,50 +318,57 @@
 
     /* Constructor */
     GemmInterleaved(const GemmArgs<Tr> &args)
-                    : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
-                      _nbatches(args._nbatches), _nmulti(args._nmulti), _trA(args._trA), _trB(args._trB),
-                      _alpha(args._alpha), _beta(args._beta), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
-                      _pretransposed(args._pretransposed_hint) {
+            : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
+              _nbatches(args._nbatches), _nmulti(args._nmulti), _trA(args._trA), _trB(args._trB),
+              _alpha(args._alpha), _beta(args._beta), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
+              _pretransposed(args._pretransposed_hint) {
         const unsigned int L1_size = _ci->get_L1_cache_size();
         const unsigned int L2_size = _ci->get_L2_cache_size();
 
         assert(_maxthreads > 0);
 
-        // Work out blocking parameters
+        // Work out blocking parameters, or override from provided GemmConfig
+        if (args._cfg && args._cfg->inner_block_size) {
+            _k_block = args._cfg->inner_block_size;
+        } else {
+            // k_block: Find out how much of the larger array can be loaded into half the cache.
+            // This should account for associative caches.
+            _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
 
-        // k_block: Find out how much of the larger array can be loaded into half the cache.
-        // This should account for associative caches.
-        _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
+            // Needs to be (at least a single) multiple of the K unroll level.
+            _k_block /= strategy::k_unroll();
+            _k_block = std::max(_k_block, 1U) * strategy::k_unroll();
 
-        // Needs to be (at least a single) multiple of the K unroll level.
-        _k_block /= strategy::k_unroll();
-        _k_block = std::max(_k_block, 1U) * strategy::k_unroll();
+            // Now tune to presented problem size; this is how many blocks we need.
+            unsigned int num_k_blocks = iceildiv(_Ksize, _k_block);
 
-        // Now tune to presented problem size; this is how many blocks we need.
-        int num_k_blocks = iceildiv(_Ksize, _k_block);
+            // So divide the space equally into that many blocks.
+            _k_block = iceildiv(_Ksize, num_k_blocks);
 
-        // So divide the space equally into that many blocks.
-        _k_block = iceildiv(_Ksize, num_k_blocks);
+            // And round UP to the K unroll level required.
+            _k_block = iceildiv(_k_block, strategy::k_unroll());
+            _k_block *= strategy::k_unroll();
+        }
 
-        // And round UP to the K unroll level required.
-        _k_block = iceildiv(_k_block, strategy::k_unroll());
-        _k_block *= strategy::k_unroll();
+        if (args._cfg && args._cfg->outer_block_size) {
+            _x_block = args._cfg->outer_block_size;
+        } else {
+            // x_block: Work out how many rows (of length k_block) will fit in the L2
+            // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
+            _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
+                       (sizeof(Toi) * _k_block);
 
-        // x_block: Work out how many rows (of length k_block) will fit in the L2
-        // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
-        _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
-                  (sizeof(Toi) * _k_block);
+            // Needs to be (at least a single) multiple of the kernel output width.
+            _x_block /= strategy::out_width();
+            _x_block = std::max(_x_block, 1U) * strategy::out_width();
 
-        // Needs to be (at least a single) multiple of the kernel output width.
-        _x_block /= strategy::out_width();
-        _x_block = std::max(_x_block, 1U) * strategy::out_width();
+            // And tune to the presented problem size.
+            unsigned int num_x_blocks = iceildiv(_Nsize, _x_block);
+            _x_block = iceildiv(_Nsize, num_x_blocks);
 
-        // And tune to the presented problem size.
-        int num_x_blocks = iceildiv(_Nsize, _x_block);
-        _x_block = iceildiv(_Nsize, num_x_blocks);
-
-        _x_block = iceildiv(_x_block, strategy::out_width());
-        _x_block *= strategy::out_width();
+            _x_block = iceildiv(_x_block, strategy::out_width());
+            _x_block *= strategy::out_width();
+        }
 
         // Work out the rounded size of M - needed for some buffers.
         _Mround = iceildiv(_Msize, strategy::out_height());
@@ -457,8 +464,8 @@
 
         do {
             /* Figure out the size of each block. */
-            size_t x_size = (current.xmax() - current.x0());
-            size_t k_size = (current.kmax() - current.k0());
+            unsigned int x_size = (current.xmax() - current.x0());
+            unsigned int k_size = (current.kmax() - current.k0());
 
             /* Round sizes up as needed. */
             x_size = iceildiv(x_size, strategy::out_width());
@@ -473,6 +480,7 @@
         return total;
     }
 
+    using GemmCommon<To, Tr>::pretranspose_B_array;
     void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
         blockwalker current(*this);
         Toi *buffer = reinterpret_cast<Toi *>(in_buffer);
@@ -481,8 +489,8 @@
 
         do {
             /* Figure out the size of each block. */
-            size_t x_size = (current.xmax() - current.x0());
-            size_t k_size = (current.kmax() - current.k0());
+            unsigned int x_size = (current.xmax() - current.x0());
+            unsigned int k_size = (current.kmax() - current.k0());
 
             /* Round sizes up as needed. */
             x_size = iceildiv(x_size, strategy::out_width());
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
index baa1316..98516b1 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,8 +27,7 @@
 
 #include "arm_gemm.hpp"
 
-#include "mergeresults.hpp"
-#include "transform.hpp"
+#include "ndrange.hpp"
 
 #ifdef CYCLE_PROFILING
 #include "profiler.hpp"
@@ -55,35 +54,46 @@
     const unsigned int _nbatches;
     const unsigned int _nmultis;
 
-    Tr _beta;
+    const Tr _beta;
 
     const CPUInfo * const _ci;
 
-    unsigned int k_block=0;
-    unsigned int n_block=0;
+    const unsigned int _k_block;
+    const unsigned int _n_block;
 
-    unsigned int window_per_batch() const {
-        return iceildiv(_Msize, strategy::out_height());
+    const NDRange<4> _window_range;
+
+    static unsigned int compute_k_block(const GemmArgs<Tr> &args) {
+        return args._Ksize;
     }
 
-    unsigned int window_per_multi() const {
-        return window_per_batch() * _nbatches;
+    static unsigned int compute_n_block(const GemmArgs<Tr> &args) {
+        if ((args._cfg != nullptr) && args._cfg->outer_block_size > 0) {
+            return args._cfg->outer_block_size;
+        } else {
+            return args._Nsize;
+        }
     }
 
 public:
     GemmNative(GemmNative &) = delete;
     GemmNative & operator= (GemmNative &) = delete;
 
-    GemmNative(const CPUInfo *ci, const unsigned int M, const unsigned int N, const unsigned int K, const unsigned int nbatches, const unsigned int nmultis, const Tr beta) :
-        _Msize(M), _Nsize(N), _Ksize(K), _nbatches(nbatches), _nmultis(nmultis), _beta(beta), _ci(ci) {
-        /* For now don't do any blocking. TODO: figure out if we should. */
-        k_block = K;
-        n_block = N;
-    }
+    GemmNative(const GemmArgs<Tr> &args)
+            : _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
+              _nbatches(args._nbatches), _nmultis(args._nmulti),
+              _beta(args._beta), _ci(args._ci),
+              _k_block(compute_k_block(args)), _n_block(compute_n_block(args)),
+              _window_range(iceildiv(_Msize, strategy::out_height()), _nbatches, iceildiv(_Nsize, _n_block), _nmultis) { }
 
     // Window is amount per multi multiplied by total number of multis.
     unsigned int get_window_size() const override {
-        return window_per_multi() * _nmultis;
+        return _window_range.total_size();
+    }
+
+    // Native GEMMs can always be dynamically scheduled (whether requested or not)
+    bool supports_dynamic_scheduling() const override {
+        return true;
     }
 
     // Actually execute the GEMM.
@@ -96,40 +106,30 @@
         static_assert(std::is_same<To, Toi>::value, "gemm_native: Operand types must be the same.");
         static_assert(std::is_same<Tr, Tri>::value, "gemm_native: Result types must be the same.");
 
-        /* Compute starting point based on 'start' */
-        unsigned int multi     = start / window_per_multi();
-        unsigned int multi_pos = start % window_per_multi();
+        auto p = _window_range.iterator(start, end);
 
-        unsigned int batch     = multi_pos / window_per_batch();
-        unsigned int batch_pos = multi_pos % window_per_batch();
+        if (p.done()) {
+            return;
+        }
 
-        unsigned int y0        = batch_pos * strategy::out_height();
+        do {
+            unsigned int y0    = p.dim(0) * strategy::out_height();
+            unsigned int ymax  = std::min(p.dim0_max() * strategy::out_height(), _Msize);
+            unsigned int batch = p.dim(1);
+            unsigned int n0    = p.dim(2) * _n_block;
+            unsigned int nmax  = std::min(n0 + _n_block, _Nsize);
+            unsigned int multi = p.dim(3);
 
-        for (unsigned int pos=start; pos<end; pos++) {
-            const unsigned int ymax = std::min(y0 + strategy::out_height(), _Msize);
 #ifdef CYCLE_PROFILING
-            auto p = prof.ScopedProfiler(PROFILE_KERNEL, (ymax-y0) * _Nsize * _Ksize);
+            auto p = prof.ScopedProfiler(PROFILE_KERNEL, (ymax-y0) * (nmax - n0) * _Ksize);
 #endif
 
             strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (y0 * this->_lda), this->_lda,
-                         this->_Bptr + (multi * this->_B_multi_stride), this->_ldb,
-                         this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (y0 * this->_ldc), this->_ldc,
-                         _beta, (ymax-y0), _Nsize, _Ksize);
-
-            /* Advance to next item */
-            y0 += strategy::out_height();
-
-            /* Check for batch/multi overflow */
-            if (y0 >= _Msize) {
-                y0=0;
-                batch++;
-                if (batch == _nbatches) {
-                    batch=0;
-                    multi++;
-                }
-            }
-        }
+                         this->_Bptr + (multi * this->_B_multi_stride) + n0, this->_ldb,
+                         this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (y0 * this->_ldc) + n0, this->_ldc,
+                         _beta, (ymax-y0), (nmax - n0), _Ksize);
+        } while (p.next_dim1());
     }
 };
 
-} // namespace arm_gemm
+} // namespace arm_gemm
\ No newline at end of file
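
GemmNative (like GemmHybrid) now expresses its work as a 4-D NDRange over (M tiles, batches, N blocks, multis) and decomposes each worker's [start, end) slice back into coordinates. A minimal sketch of that index-to-coordinate decomposition; the real NDRange iterator also tracks dim0_max so a worker can take several M tiles at once, this only shows the mixed-radix split with example sizes:

    #include <cstdio>

    // Decompose a linear work index into 4 coordinates, innermost dimension first,
    // mirroring a walk over (M tiles, batches, N blocks, multis).
    static void decompose(unsigned int idx, const unsigned int sizes[4], unsigned int coords[4])
    {
        for (int d = 0; d < 4; d++) {
            coords[d] = idx % sizes[d];
            idx /= sizes[d];
        }
    }

    int main()
    {
        // Example: 10 M-tiles, 2 batches, 3 N-blocks, 1 multi -> 60 work items total.
        const unsigned int sizes[4] = { 10, 2, 3, 1 };

        for (unsigned int idx : { 0u, 17u, 59u }) {
            unsigned int c[4];
            decompose(idx, sizes, c);
            std::printf("idx=%2u -> m_tile=%u batch=%u n_block=%u multi=%u\n",
                        idx, c[0], c[1], c[2], c[3]);
        }
        return 0;
    }
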
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
index feea482..6bcbca9 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,31 +32,34 @@
 
 namespace arm_gemm {
 
-class GemmImpl_gemm_u16_interleaved : public GemmImplementation<uint16_t, uint32_t> {
-public:
-    UniqueGemmCommon<uint16_t, uint32_t> instantiate(const GemmArgs<uint32_t> &args) override {
-        return UniqueGemmCommon<uint16_t, uint32_t>(new GemmInterleaved<gemm_u16_12x8, uint16_t, uint32_t>(args));
-    }
-
-    GemmImpl_gemm_u16_interleaved() : GemmImplementation<uint16_t, uint32_t>(GemmMethod::GEMM_INTERLEAVED) { }
-};
-
-static GemmImpl_gemm_u16_interleaved gemm_u16_interleaved_impl{};
-
-static std::vector<GemmImplementation<uint16_t, uint32_t> *> gemm_u16_methods = {
-    &gemm_u16_interleaved_impl
+static const GemmImplementation<uint16_t, uint32_t> gemm_u16_methods[] = {
+{
+    GemmMethod::GEMM_INTERLEAVED,
+    "gemm_u16_12x8",
+    nullptr,
+    nullptr,
+    [](const GemmArgs<uint32_t> &args) { return new GemmInterleaved<gemm_u16_12x8, uint16_t, uint32_t>(args); }
+},
+{
+    GemmMethod::DEFAULT,
+    "",
+    nullptr,
+    nullptr,
+    nullptr
+}
 };
 
 template<>
-std::vector<GemmImplementation<uint16_t, uint32_t> *> &gemm_implementation_list<uint16_t, uint32_t>() {
+const GemmImplementation<uint16_t, uint32_t> *gemm_implementation_list<uint16_t, uint32_t>() {
     return gemm_u16_methods;
 }
 
 /* Explicitly instantiate the external functions for these types. */
-template UniqueGemmCommon<uint16_t, uint32_t> gemm<uint16_t, uint32_t>(GemmArgs<uint32_t> &args, GemmConfig *cfg);
-template GemmMethod get_gemm_method<uint16_t, uint32_t>(GemmArgs<uint32_t> &args);
-template bool method_is_compatible<uint16_t, uint32_t>(GemmMethod method, GemmArgs<uint32_t> &args);
+template UniqueGemmCommon<uint16_t, uint32_t> gemm<uint16_t, uint32_t>(const GemmArgs<uint32_t> &args);
+template KernelDescription get_gemm_method<uint16_t, uint32_t>(const GemmArgs<uint32_t> &args);
+template bool method_is_compatible<uint16_t, uint32_t>(GemmMethod method, const GemmArgs<uint32_t> &args);
+template std::vector<std::string> get_compatible_kernels<uint16_t, uint32_t> (const GemmArgs<uint32_t> &args);
 
 } // namespace arm_gemm
 
-#endif // __aarch64__
+#endif // __aarch64__
\ No newline at end of file
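
Editor's note: the per-type method lists are now plain arrays of aggregate-initialised GemmImplementation descriptors (method, name, optional is_supported predicate, optional is_recommended predicate, instantiation lambda), terminated by a GemmMethod::DEFAULT entry, instead of one class per method. The real selection logic lives in gemm_implementation.hpp, outside this excerpt; the sketch below only illustrates one plausible way such a sentinel-terminated table could be walked, using simplified stand-in types.

    #include <cstdio>

    // Simplified stand-ins: only the fields exercised by this sketch are modelled.
    struct Args { unsigned int Ksize; bool has_dotprod; };

    struct Entry {
        const char *name;                           // kernel name reported to callers
        bool      (*is_supported)(const Args &);    // nullptr means "always supported"
        bool      (*is_recommended)(const Args &);  // nullptr means "no preference"
        bool        is_sentinel;                    // models the GemmMethod::DEFAULT terminator
    };

    static bool dot_supported(const Args &a)   { return a.has_dotprod; }
    static bool dot_recommended(const Args &a) { return a.Ksize > 128; }

    static const Entry table[] = {
        { "dot_kernel",     dot_supported, dot_recommended, false },
        { "generic_kernel", nullptr,       nullptr,         false },
        { "",               nullptr,       nullptr,         true  },
    };

    // One plausible policy: first accept only entries that are supported and not
    // advised against, then fall back to anything supported; the sentinel ends the scan.
    static const Entry *pick(const Entry *tbl, const Args &args) {
        for (int pass = 0; pass < 2; pass++) {
            for (const Entry *e = tbl; !e->is_sentinel; e++) {
                if (e->is_supported && !e->is_supported(args))                  continue;
                if (pass == 0 && e->is_recommended && !e->is_recommended(args)) continue;
                return e;
            }
        }
        return nullptr;
    }

    int main() {
        Args small{ 64, true };
        std::printf("K=64  -> %s\n", pick(table, small)->name);   // generic_kernel (dot not recommended)

        Args large{ 512, true };
        std::printf("K=512 -> %s\n", pick(table, large)->name);   // dot_kernel
        return 0;
    }
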
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
index b7c1bab..b95ca80 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,64 +27,75 @@
 #include "gemm_common.hpp"
 #include "gemm_implementation.hpp"
 #include "gemm_interleaved.hpp"
+#include "gemm_hybrid.hpp"
+#include "gemm_native.hpp"
 
 #include "kernels/a64_gemm_u16_12x8.hpp"
 #include "kernels/a64_gemm_u8_12x8.hpp"
 #include "kernels/a64_gemm_u8_4x4.hpp"
+#include "kernels/a64_hybrid_u8u32_dot_16x4.hpp"
 #include "kernels/sve_interleaved_u8u32_dot_3VLx8.hpp"
+#include "kernels/sve_native_u8u32_dot_4VLx4.hpp"
 
 namespace arm_gemm {
 
+static const GemmImplementation<uint8_t, uint32_t> gemm_u8_methods[] = {
 #ifdef __ARM_FEATURE_SVE
-class GemmImpl_gemm_u8_interleaved_dot : public GemmImplementation<uint8_t, uint32_t> {
-public:
-    UniqueGemmCommon<uint8_t, uint32_t> instantiate(const GemmArgs<uint32_t> &args) override {
-        return UniqueGemmCommon<uint8_t, uint32_t>(new GemmInterleaved<interleaved_u8u32_dot_3VLx8, uint8_t, uint32_t>(args));
-    }
-
-    GemmImpl_gemm_u8_interleaved_dot() : GemmImplementation<uint8_t, uint32_t>(GemmMethod::GEMM_INTERLEAVED_DOT) { }
-};
-#else
-class GemmImpl_gemm_u8_interleaved_dot : public GemmImplementation<uint8_t, uint32_t> {
-public:
-    bool is_supported(const GemmArgs<uint32_t> &args) override {
-        return args._ci->has_dotprod();
-    }
-
-    UniqueGemmCommon<uint8_t, uint32_t> instantiate(const GemmArgs<uint32_t> &args) override {
-        return UniqueGemmCommon<uint8_t, uint32_t>(new GemmInterleaved<gemm_u8_12x8, uint8_t, uint32_t>(args));
-    }
-
-    GemmImpl_gemm_u8_interleaved_dot() : GemmImplementation<uint8_t, uint32_t>(GemmMethod::GEMM_INTERLEAVED_DOT) { }
-};
+{
+    GemmMethod::GEMM_NATIVE,
+    "native_u8u32_dot_4VLx4",
+    [](const GemmArgs<uint32_t> &args) { return (args._Ksize>=16 && args._alpha==1 && !args._trA && !args._trB); },
+    [](const GemmArgs<uint32_t> &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)); },
+    [](const GemmArgs<uint32_t> &args) { return new GemmNative<native_u8u32_dot_4VLx4, uint8_t, uint32_t>(args); }
+},
+{
+    GemmMethod::GEMM_INTERLEAVED,
+    "interleaved_u8u32_dot_3VLx8",
+    [](const GemmArgs<uint32_t> &args) { return (args._Ksize>4); },
+    nullptr,
+    [](const GemmArgs<uint32_t> &args) { return new GemmInterleaved<interleaved_u8u32_dot_3VLx8, uint8_t, uint32_t>(args); }
+},
 #endif
-
-class GemmImpl_gemm_u8_interleaved : public GemmImplementation<uint8_t, uint32_t> {
-public:
-    UniqueGemmCommon<uint8_t, uint32_t> instantiate(const GemmArgs<uint32_t> &args) override {
-        return UniqueGemmCommon<uint8_t, uint32_t>(new GemmInterleaved<gemm_u8_4x4, uint8_t, uint32_t>(args));
-    }
-
-    GemmImpl_gemm_u8_interleaved() : GemmImplementation<uint8_t, uint32_t>(GemmMethod::GEMM_INTERLEAVED) { }
-};
-
-static GemmImpl_gemm_u8_interleaved_dot gemm_u8_interleaved_dot_impl{};
-static GemmImpl_gemm_u8_interleaved gemm_u8_interleaved_impl{};
-
-static std::vector<GemmImplementation<uint8_t, uint32_t> *> gemm_u8_methods = {
-    &gemm_u8_interleaved_dot_impl,
-    &gemm_u8_interleaved_impl
+{
+    GemmMethod::GEMM_HYBRID,
+    "hybrid_u8u32_dot_16x4",
+    [](const GemmArgs<uint32_t> &args) { return args._ci->has_dotprod() && args._Ksize>=16 && (args._Ksize % 16 == 0) && (args._Nsize % 16 == 0) && !args._trA && !args._trB && args._pretransposed_hint; },
+    [](const GemmArgs<uint32_t> &args) { return args._Nsize<=256 && args._Ksize>128; },
+    [](const GemmArgs<uint32_t> &args) { return new GemmHybrid<hybrid_u8u32_dot_16x4, uint8_t, uint32_t>(args); }
+},
+{
+    GemmMethod::GEMM_INTERLEAVED,
+    "gemm_u8_12x8",
+    [](const GemmArgs<uint32_t> &args) { return args._ci->has_dotprod(); },
+    nullptr,
+    [](const GemmArgs<uint32_t> &args) { return new GemmInterleaved<gemm_u8_12x8, uint8_t, uint32_t>(args); }
+},
+{
+    GemmMethod::GEMM_INTERLEAVED,
+    "gemm_u8_4x4",
+    nullptr,
+    nullptr,
+    [](const GemmArgs<uint32_t> &args) { return new GemmInterleaved<gemm_u8_4x4, uint8_t, uint32_t>(args); }
+},
+{
+    GemmMethod::DEFAULT,
+    "",
+    nullptr,
+    nullptr,
+    nullptr
+}
 };
 
 template<>
-std::vector<GemmImplementation<uint8_t, uint32_t> *> &gemm_implementation_list<uint8_t, uint32_t>() {
+const GemmImplementation<uint8_t, uint32_t> *gemm_implementation_list<uint8_t, uint32_t>() {
     return gemm_u8_methods;
 }
 
 /* Explicitly instantiate the external functions for these types. */
-template UniqueGemmCommon<uint8_t, uint32_t> gemm<uint8_t, uint32_t>(GemmArgs<uint32_t> &args, GemmConfig *cfg);
-template GemmMethod get_gemm_method<uint8_t, uint32_t>(GemmArgs<uint32_t> &args);
-template bool method_is_compatible<uint8_t, uint32_t>(GemmMethod method, GemmArgs<uint32_t> &args);
+template UniqueGemmCommon<uint8_t, uint32_t> gemm<uint8_t, uint32_t>(const GemmArgs<uint32_t> &args);
+template KernelDescription get_gemm_method<uint8_t, uint32_t>(const GemmArgs<uint32_t> &args);
+template bool method_is_compatible<uint8_t, uint32_t>(GemmMethod method, const GemmArgs<uint32_t> &args);
+template std::vector<std::string> get_compatible_kernels<uint8_t, uint32_t> (const GemmArgs<uint32_t> &args);
 
 } // namespace arm_gemm
 
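Editor's note: the conditions attached to the new GEMM_HYBRID entry above are fairly specific: dot-product support, K at least 16 and a multiple of 16, N a multiple of 16, no transposed operands, and the pretransposed hint; it is only preferred when N <= 256 and K > 128. Below is a worked check of those same expressions for a hypothetical shape, with GemmArgs replaced by a flat stand-in struct.

    #include <cstdio>

    // Flat stand-in for the GemmArgs fields the hybrid entry inspects; in the real
    // table the CPU feature comes from args._ci->has_dotprod().
    struct FakeArgs {
        unsigned int Nsize, Ksize;
        bool trA, trB, pretransposed_hint, has_dotprod;
    };

    int main() {
        FakeArgs args{ 128, 256, false, false, true, true };   // hypothetical N = 128, K = 256

        const bool supported = args.has_dotprod &&
                               args.Ksize >= 16 && (args.Ksize % 16 == 0) &&
                               (args.Nsize % 16 == 0) &&
                               !args.trA && !args.trB && args.pretransposed_hint;

        const bool recommended = (args.Nsize <= 256) && (args.Ksize > 128);

        // Both hold for N = 128, K = 256, so GEMM_HYBRID would be considered first;
        // with K = 100 the multiple-of-16 test fails and selection falls through to
        // the interleaved dot-product kernel (gemm_u8_12x8).
        std::printf("supported=%d recommended=%d\n", supported, recommended);
        return 0;
    }
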
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
index d65971e..32d668f 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -41,9 +41,10 @@
         GemmArgs<Tr> newargs = args;
         newargs._Msize = args._nbatches;
         newargs._nbatches = 1;
-        _subgemm = gemm<To,Tr>(newargs, nullptr);
+        _subgemm = gemm<To,Tr>(newargs);
     }
 
+    using GemmCommon<To, Tr>::set_arrays;
     void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride,
                     const To *B, const int ldb, const int B_multi_stride,
                           Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride) override {
@@ -85,6 +86,7 @@
         return _subgemm->get_B_pretransposed_array_size();
     }
 
+    using GemmCommon<To, Tr>::pretranspose_B_array;
     void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) override {
         _subgemm->pretranspose_B_array(buffer, B, ldb, B_multi_stride);
     }
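
Editor's note: the two added using-declarations are the usual remedy for C++ name hiding: once a derived class declares any overload of set_arrays or pretranspose_B_array, every other base-class overload with that name becomes invisible unless re-imported. A minimal standalone illustration (names here are illustrative only, not taken from GemmCommon):

    #include <cstdio>

    struct Base {
        virtual ~Base() = default;
        virtual void set(int x)        { std::printf("Base::set(int) %d\n", x); }
        void         set(int x, int y) { std::printf("Base::set(int,int) %d %d\n", x, y); }
    };

    struct Derived : Base {
        using Base::set;          // without this line, d.set(1, 2) below would not compile:
                                  // Derived::set(int) hides every Base overload named 'set'
        void set(int x) override  { std::printf("Derived::set(int) %d\n", x); }
    };

    int main() {
        Derived d;
        d.set(1);      // the override
        d.set(1, 2);   // the inherited two-argument overload, visible thanks to using Base::set
        return 0;
    }
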
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
index 241c5fe..5ebc634 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -64,15 +64,16 @@
     GemvNativeTransposed(GemvNativeTransposed &) = delete;
     GemvNativeTransposed & operator= (GemvNativeTransposed &) = delete;
 
-    GemvNativeTransposed(const CPUInfo *ci, const unsigned int N, const unsigned int K, const unsigned int nmultis, const Tr beta) : _Nsize(N), _Ksize(K), _nmultis(nmultis), _beta(beta), _ci(ci) {
+    GemvNativeTransposed(const GemmArgs<Tr> &args)
+            : _Nsize(args._Nsize), _Ksize(args._Ksize), _nmultis(args._nmulti), _beta(args._beta), _ci(args._ci) {
         /* For now don't do any blocking. TODO: figure out if we should. */
-        m_block = K;
-        n_block = N;
+        m_block = _Ksize;
+        n_block = _Nsize;
     }
 
     // Window is number of out_width blocks times number of multis.
     unsigned int get_window_size() const override {
-        return iceildiv(_Nsize, strategy::out_width) * _nmultis;
+        return iceildiv(_Nsize, strategy::out_width()) * _nmultis;
     }
 
     // Actually execute the GEMV.
@@ -82,12 +83,12 @@
 #endif
         strategy strat(_ci);
 
-        const unsigned int window_per_multi = iceildiv(_Nsize, strategy::out_width);
+        const unsigned int window_per_multi = iceildiv(_Nsize, strategy::out_width());
         const unsigned int multi_0   = start / window_per_multi;
         const unsigned int multi_end = end   / window_per_multi;
 
-        const unsigned int n_0   = (start - (multi_0 * window_per_multi)) * strategy::out_width;
-        const unsigned int n_max = (end - (multi_end * window_per_multi)) * strategy::out_width;
+        const unsigned int n_0   = (start - (multi_0 * window_per_multi)) * strategy::out_width();
+        const unsigned int n_max = (end - (multi_end * window_per_multi)) * strategy::out_width();
 
         static_assert(std::is_same<To, Toi>::value, "gemv_transposed: Operand types must be the same.");
         static_assert(std::is_same<Tr, Tri>::value, "gemv_transposed: Result types must be the same.");
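
Editor's note: with out_width now an accessor function, the window arithmetic above is unchanged in spirit: a flat [start, end) range is split into a multi index plus a column-block offset inside that multi. A short numeric sketch of that decomposition under assumed values (out_width() = 8, N = 20, 3 multis), with the results checked in the comments:

    #include <cstdio>

    // Integer ceiling division, in the spirit of iceildiv as used above.
    static unsigned int ceil_div(unsigned int a, unsigned int b) { return (a + b - 1) / b; }

    int main() {
        const unsigned int out_width = 8, N = 20, nmultis = 3;

        const unsigned int window_per_multi = ceil_div(N, out_width);     // 3 column blocks per multi
        const unsigned int window_size      = window_per_multi * nmultis; // 9, as get_window_size() returns

        // Decompose a sub-range [start, end) of that window the same way execute() does.
        const unsigned int start = 4, end = 7;
        const unsigned int multi_0   = start / window_per_multi;                         // 1
        const unsigned int multi_end = end   / window_per_multi;                         // 2
        const unsigned int n_0   = (start - (multi_0   * window_per_multi)) * out_width; // 8
        const unsigned int n_max = (end   - (multi_end * window_per_multi)) * out_width; // 8

        std::printf("size=%u multi_0=%u multi_end=%u n_0=%u n_max=%u\n",
                    window_size, multi_0, multi_end, n_0, n_max);
        return 0;
    }
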
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
index e53ddb2..f7beb0a 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -68,17 +68,26 @@
     GemvPretransposed(GemvPretransposed &) = delete;
     GemvPretransposed & operator= (GemvPretransposed &) = delete;
 
-    GemvPretransposed(const CPUInfo *ci, const unsigned int N, const unsigned int K, const unsigned int nmultis, const bool trB, const Tr beta) :
-        _Nsize(N), _Ksize(K), _nmultis(nmultis), _trB(trB), _beta(beta), _ci(ci),
-        _buffer_per_multi(_Ksize * iceildiv(_Nsize, strategy::A_interleave) * strategy::A_interleave) {
+    GemvPretransposed(const GemmArgs<Tr> &args)
+            : _Nsize(args._Nsize), _Ksize(args._Ksize), _nmultis(args._nmulti), _trB(args._trB), _beta(args._beta), _ci(args._ci),
+              _buffer_per_multi(_Ksize * iceildiv(_Nsize, strategy::A_interleave()) * strategy::A_interleave()) {
         /* For now don't do any blocking. TODO: figure out if we should. */
-        m_block = K;
-        n_block = N;
+        if (args._cfg && args._cfg->inner_block_size) {
+            m_block = args._cfg->inner_block_size;
+        } else {
+            m_block = _Ksize;
+        }
+
+        if (args._cfg && args._cfg->outer_block_size) {
+            n_block = args._cfg->outer_block_size;
+        } else {
+            n_block = _Nsize;
+        }
     }
 
     // Window is number of out_width blocks, times number of multis.
     unsigned int get_window_size() const override {
-        return iceildiv(_Nsize, strategy::out_width) * _nmultis;
+        return iceildiv(_Nsize, strategy::out_width()) * _nmultis;
     }
 
     // Actually execute the GEMV.
@@ -89,13 +98,13 @@
         strategy strat(_ci);
 
         /* Break the window values down into multis of interest... */
-        const unsigned int window_per_multi = iceildiv(_Nsize, strategy::out_width);
+        const unsigned int window_per_multi = iceildiv(_Nsize, strategy::out_width());
         const unsigned int multi_0    = start / window_per_multi;
         const unsigned int multi_end  = end   / window_per_multi;
 
         /* ... and figure out where we start and end in the first and last multi. */
-        const unsigned int n_0   = (start - (multi_0 * window_per_multi)) * strategy::out_width;
-        const unsigned int n_max = (end - (multi_end * window_per_multi)) * strategy::out_width;
+        const unsigned int n_0   = (start - (multi_0 * window_per_multi)) * strategy::out_width();
+        const unsigned int n_max = (end - (multi_end * window_per_multi)) * strategy::out_width();
 
         static_assert(std::is_same<Tr, Tri>::value, "GemvPretransposed: Result types must be the same.");
 
@@ -115,8 +124,8 @@
                     auto p = prof.ScopedProfiler(PROFILE_KERNEL, (mmax-m0) * (nmax-n));
 #endif
                     /* This assumes that the underlying call was a GEMM with M=1; for the N=1 case we would have to pick up this->_Bptr below instead */
-                    strat.kernel(_A_pretransposed + (multi * _buffer_per_multi) + (n * _Ksize) + (m0 * strategy::A_interleave),
-                                 (_Ksize * strategy::A_interleave),
+                    strat.kernel(_A_pretransposed + (multi * _buffer_per_multi) + (n * _Ksize) + (m0 * strategy::A_interleave()),
+                                 (_Ksize * strategy::A_interleave()),
                                  this->_Aptr + (multi * this->_A_multi_stride) + m0,
                                  this->_Cptr + (multi * this->_C_multi_stride) + n,
                                  _beta, (mmax-m0), (nmax-n));
@@ -139,6 +148,7 @@
         return _buffer_per_multi * _nmultis * sizeof(To);
     }
 
+    using GemmCommon<To, Tr>::pretranspose_B_array;
     void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) override {
         Toi *A_buffer = reinterpret_cast<Toi *>(buffer);
 
@@ -146,10 +156,10 @@
             /* Reverse sense here as we are dealing with B rather than A.  So if
              * strategy::A_transpose is false and _trB is false, we still
              * transpose.  */
-            if (_trB ^ strategy::A_transpose) {
-                Transform<strategy::A_interleave, strategy::A_block, false>(A_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _Nsize, 0, _Ksize);
+            if (_trB ^ strategy::A_transpose()) {
+                Transform<strategy::A_interleave(), strategy::A_block(), false>(A_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _Nsize, 0, _Ksize);
             } else {
-                Transform<strategy::A_interleave, strategy::A_block, true>(A_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _Nsize, 0, _Ksize);
+                Transform<strategy::A_interleave(), strategy::A_block(), true>(A_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _Nsize, 0, _Ksize);
             }
         }
 
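Editor's note: strategy::A_interleave() and strategy::A_block() are used above as template arguments to Transform<...>, which only compiles if those accessors can be evaluated at compile time (constexpr functions); out_width(), by contrast, is only called at run time here. A minimal illustration with a made-up Strategy and a stub in place of Transform:

    #include <cstdio>

    // Made-up strategy with compile-time evaluable blocking accessors.
    struct Strategy {
        static constexpr unsigned int A_interleave() { return 32; }
        static constexpr unsigned int A_block()      { return 1; }
    };

    // Stub standing in for the Transform<interleave, block, transpose> template above.
    template <unsigned int Interleave, unsigned int Block, bool Transpose>
    static void transform_stub() {
        std::printf("interleave=%u block=%u transpose=%d\n", Interleave, Block, (int)Transpose);
    }

    int main() {
        // The calls in pretranspose_B_array() have this shape: the accessor results
        // are template arguments, so they must be constant expressions.
        transform_stub<Strategy::A_interleave(), Strategy::A_block(), true>();
        return 0;
    }
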
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp
index 06e6245..2349722 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,15 +50,15 @@
     typedef void (*kern_type)(const float *, const float *, float *, int, int, int);
 
     /* Kernel blocking parameters */
-    static int out_width() {
+    static unsigned int out_width() {
         return 8;
     }
 
-    static int out_height() {
+    static unsigned int out_height() {
         return 6;
     }
 
-    static int k_unroll() {
+    static unsigned int k_unroll() {
         return 1;
     }
 
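Editor's note: this header is the first of several below where the blocking accessors switch from int to unsigned int; the same change repeats verbatim for the other kernels in this patch. One plausible motivation is type consistency with the unsigned size and window arithmetic these values feed: template helpers in the iceildiv style deduce a single type from both arguments, so a signed accessor forces casts at every call site. Sketch (ceil_div here is a stand-in, not the real iceildiv):

    #include <cstdio>

    // Stand-in for a helper in the spirit of iceildiv: one type T deduced from both arguments.
    template <typename T>
    static T ceil_div(T a, T b) { return (a + b - 1) / b; }

    static unsigned int out_width() { return 12; }   // new-style unsigned accessor
    // static int out_width_old()   { return 12; }   // old-style signed accessor

    int main() {
        const unsigned int Nsize = 100;

        // Deduces T = unsigned int without any cast:
        const unsigned int blocks = ceil_div(Nsize, out_width());   // 9
        std::printf("blocks=%u\n", blocks);

        // With the old signed accessor the same call would not deduce a single T:
        //   ceil_div(Nsize, out_width_old());   // error: conflicting types for parameter 'T'
        return 0;
    }
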
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp
index 95a2bc2..2fcb587 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -48,15 +48,15 @@
     typedef void (*kern_type)(const int16_t *, const int16_t *, int32_t *, int, int, int);
 
     /* Kernel blocking parameters */
-    static int out_width() {
+    static unsigned int out_width() {
         return 12;
     }
 
-    static int out_height() {
+    static unsigned int out_height() {
         return 8;
     }
 
-    static int k_unroll() {
+    static unsigned int k_unroll() {
         return 1;
     }
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp
index fdc0200..cc205dc 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,15 +43,15 @@
     typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int);
 
     /* Kernel blocking parameters */
-    static int out_width() {
+    static unsigned int out_width() {
         return 12;
     }
 
-    static int out_height() {
+    static unsigned int out_height() {
         return 8;
     }
 
-    static int k_unroll() {
+    static unsigned int k_unroll() {
         return 4;
     }
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
index be7ead9..71c666a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,15 +42,15 @@
     typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int);
 
     /* Kernel blocking parameters */
-    static int out_width() {
+    static unsigned int out_width() {
         return 4;
     }
 
-    static int out_height() {
+    static unsigned int out_height() {
         return 4;
     }
 
-    static int k_unroll() {
+    static unsigned int k_unroll() {
         return 16;
     }
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp
index d2692ba..3d5c92c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -48,15 +48,15 @@
     typedef void (*kern_type)(const uint16_t *, const uint16_t *, uint32_t *, int, int, int);
 
     /* Kernel blocking parameters */
-    static int out_width() {
+    static unsigned int out_width() {
         return 12;
     }
 
-    static int out_height() {
+    static unsigned int out_height() {
         return 8;
     }
 
-    static int k_unroll() {
+    static unsigned int k_unroll() {
         return 1;
     }
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp
index a252abf..9032ba6 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -53,15 +53,15 @@
     static const bool B_transpose = true;
 
     /* Kernel blocking parameters */
-    static int out_width() {
+    static unsigned int out_width() {
         return 12;
     }
 
-    static int out_height() {
+    static unsigned int out_height() {
         return 8;
     }
 
-    static int k_unroll() {
+    static unsigned int k_unroll() {
         return 4;
     }
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
index 2da3ecd..fda7657 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,15 +50,15 @@
     static const bool B_transpose = true;
 
     /* Kernel blocking parameters */
-    static int out_width() {
+    static unsigned int out_width() {
         return 4;
     }
 
-    static int out_height() {
+    static unsigned int out_height() {
         return 4;
     }
 
-    static int k_unroll() {
+    static unsigned int k_unroll() {
         return 16;
     }
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp
index 911a4eb..5b850b7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -47,15 +47,15 @@
     typedef void (*kern_type)(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
 
     /* Kernel blocking parameters */
-    static int out_width() {
+    static unsigned int out_width() {
         return 24;
     }
 
-    static int out_height() {
+    static unsigned int out_height() {
         return 8;
     }
 
-    static int k_unroll() {
+    static unsigned int k_unroll() {
         return 1;
     }
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp
index 418a375..4ad38cb 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,9 +32,9 @@
 // Kernel implementation.
 //
 // Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
-// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order.
+// Assume that "Bpanel" points to a chunk of B blocks (each size 24xK) in read-order.
 // Assume that "Cpanel" points to a chunk of C output blocks (each size
-// 12x8), the chunks being arranged in a row major fashion.
+// 24x8), the chunks being arranged in a row major fashion.
 //
 // Note that the intent of this is that either ablocks or bblocks will be 1
 // - this construction allows the output loop to proceed in either order.
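Editor's note: the corrected comment matches the 24x8 output tile of this half-precision kernel (the 12-wide figures belonged to the 12x8 kernels). For reference, the per-block element counts implied by the corrected layout, worked out for a hypothetical K:

    #include <cstdio>

    int main() {
        // Element counts implied by the corrected comment, for a hypothetical K = 64:
        const unsigned int out_width = 24, out_height = 8, K = 64;

        const unsigned int a_block = out_height * K;          // 8xK  A block:  512 values
        const unsigned int b_block = out_width  * K;          // 24xK B block: 1536 values
        const unsigned int c_chunk = out_width  * out_height; // 24x8 C chunk:  192 values

        std::printf("A=%u B=%u C=%u\n", a_block, b_block, c_chunk);
        return 0;
    }
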
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp
new file mode 100644
index 0000000..c8934df
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <cstdint>
+#include "../std_transforms_fixed.hpp"
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void a64_hybrid_s8s32_dot_16x4(const int8_t *, int, const int8_t *, int32_t *, int, int32_t, int, int, int);
+void a64_hybrid_s8s32_dot_16x4_a55(const int8_t *, int, const int8_t *, int32_t *, int, int32_t, int, int, int);
+
+class hybrid_s8s32_dot_16x4
+{
+public:
+    typedef int8_t operand_type;
+    typedef int32_t result_type;
+
+    typedef void (*kern_type)(const int8_t *, int, const int8_t *, int32_t *, int, int32_t, int, int, int);
+
+    /* Kernel blocking parameters */
+    static unsigned int out_height()
+    {
+        return 4;
+    }
+
+    static unsigned int out_width()
+    {
+        return 16;
+    }
+
+    static unsigned int k_unroll()
+    {
+        return 4;
+    }
+
+    StdTransformsFixed<operand_type, result_type, 4, 16, 4> transforms = {};
+
+    // Default to the generic kernel
+    kern_type kernel=a64_hybrid_s8s32_dot_16x4;
+
+    hybrid_s8s32_dot_16x4(const CPUInfo *ci)
+    {
+        if (ci->get_cpu_model() == CPUModel::A55r1) {
+            kernel = a64_hybrid_s8s32_dot_16x4_a55;
+        }
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
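
Editor's note: the new strategy header advertises a 16-wide by 4-high output tile with k_unroll() = 4 and swaps in the A55-tuned kernel when the CPU model matches. The kernel prologue in the next file derives its loop counts from K with the expressions sketched below; the numbers in the comments are worked out for a hypothetical K = 100 and match the formulas used there for K_stride, loops and regs.

    #include <cstdio>

    int main() {
        // Loop-count derivation mirroring the prologue of a64_hybrid_s8s32_dot_16x4_a55,
        // evaluated for a hypothetical K = 100.
        int K = 100;

        const int  K_stride = ((K + 3) / 4) * 4;     // K rounded up to k_unroll() = 4 -> 100
        const long loops    = ((K + 16) / 32) - 1;   // main-loop iterations           -> 2
        const int  K_tail   = K - (int)(loops * 32); // K left after the main loop     -> 36
        const long regs     = (K_tail / 16) - 1;     // remaining 16-element blocks    -> 1

        std::printf("K_stride=%d loops=%ld K_tail=%d regs=%ld\n", K_stride, loops, K_tail, regs);
        return 0;
    }
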
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp
new file mode 100644
index 0000000..48bf842
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp
@@ -0,0 +1,2271 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <algorithm>
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void a64_hybrid_s8s32_dot_16x4_a55(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int32_t beta, int M, int N, int K) {
+    const long beta0 = (beta == 0);
+    const int K_stride = ((K + 3) / 4) * 4;
+    const long loops_count = ((K + 16) / 32) - 1;
+    K -= loops_count * 32;
+    const long regs_count = (K / 16) - 1;
+
+    for (int y=0; y<M; y+=4) {
+        const int8_t * const a_ptr0_base = A + (y * lda);
+        const unsigned long ldab = lda * sizeof(int8_t);
+
+        int32_t *c_ptr0 = C + (y * ldc);
+        const unsigned long ldcb = ldc * sizeof(int32_t);
+
+        for (int x0=0; x0<N; x0+=16ul) {
+            const long width = std::min((unsigned long)N-x0, 16ul);
+            const int32_t *betaptr = &beta;
+            long loops = loops_count;
+            long regs = regs_count;
+            const int8_t *a_ptr0 = a_ptr0_base;
+            const int8_t *b_ptr0 = B + (K_stride * x0);
+
+            switch(M-y) {
+                case 1:
+                    __asm __volatile (
+                        "temploadreg0 .req X0\n"
+                        "temploadreg1 .req X1\n"
+                        "temploadreg2 .req X2\n"
+                        "temploadreg3 .req X3\n"
+                        "cbz %[beta0], 1f\n"
+                        "movi v16.4s, #0\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "movi v17.4s, #0\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "movi v18.4s, #0\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "movi v19.4s, #0\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1r {v15.4s}, [%[betaptr]]\n"
+                        "ldr q16, [%[c_ptr0]]\n"
+                        "ldr q17, [%[c_ptr0], #0x10]\n"
+                        "ldr q18, [%[c_ptr0], #0x20]\n"
+                        "ldr q19, [%[c_ptr0], #0x30]\n"
+                        "mul v16.4s, v16.4s, v15.4s\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "mul v17.4s, v17.4s, v15.4s\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "mul v18.4s, v18.4s, v15.4s\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "mul v19.4s, v19.4s, v15.4s\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "ldr d0, [%[a_ptr0], #-0x10]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                        "ins v0.d[1], temploadreg0\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "cbz %[regs], 4f\n"
+                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "ldr d0, [%[a_ptr0], #0x10]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                        "ins v0.d[1], temploadreg0\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                        "5:\n"
+                        "str q16, [%[c_ptr0]]\n"
+                        "str q17, [%[c_ptr0], #0x10]\n"
+                        "str q18, [%[c_ptr0], #0x20]\n"
+                        "str q19, [%[c_ptr0], #0x30]\n"
+                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                        ".unreq temploadreg0\n"
+                        ".unreq temploadreg1\n"
+                        ".unreq temploadreg2\n"
+                        ".unreq temploadreg3\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
+                    );
+                    break;
+                case 2:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "c_ptr1 .req X1\n"
+                        "temploadreg0 .req X2\n"
+                        "temploadreg1 .req X3\n"
+                        "temploadreg2 .req X4\n"
+                        "temploadreg3 .req X5\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "cbz %[beta0], 1f\n"
+                        "movi v16.4s, #0\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "movi v17.4s, #0\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "movi v18.4s, #0\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "movi v19.4s, #0\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "movi v20.4s, #0\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "movi v21.4s, #0\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "movi v22.4s, #0\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "movi v23.4s, #0\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1r {v15.4s}, [%[betaptr]]\n"
+                        "ldr q16, [%[c_ptr0]]\n"
+                        "ldr q17, [%[c_ptr0], #0x10]\n"
+                        "ldr q18, [%[c_ptr0], #0x20]\n"
+                        "ldr q19, [%[c_ptr0], #0x30]\n"
+                        "mul v16.4s, v16.4s, v15.4s\n"
+                        "ldr q20, [c_ptr1]\n"
+                        "mul v17.4s, v17.4s, v15.4s\n"
+                        "ldr q21, [c_ptr1, #0x10]\n"
+                        "mul v18.4s, v18.4s, v15.4s\n"
+                        "ldr q22, [c_ptr1, #0x20]\n"
+                        "mul v19.4s, v19.4s, v15.4s\n"
+                        "ldr q23, [c_ptr1, #0x30]\n"
+                        "mul v20.4s, v20.4s, v15.4s\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "mul v21.4s, v21.4s, v15.4s\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "mul v22.4s, v22.4s, v15.4s\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "mul v23.4s, v23.4s, v15.4s\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                        "ldr d5, [a_ptr1]\n"
+                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                        "ins v5.d[1], temploadreg1\n"
+                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+                        "ldr d0, [%[a_ptr0], #-0x10]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+                        ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+                        "ldr d1, [a_ptr1, #-0x10]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                        "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+                        ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "ins v0.d[1], temploadreg0\n"
+                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "ins v1.d[1], temploadreg1\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "prfm PSTL1KEEP, [c_ptr1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "cbz %[regs], 4f\n"
+                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr d5, [a_ptr1]\n"
+                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                        "ins v5.d[1], temploadreg1\n"
+                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+                        "ldr d0, [%[a_ptr0], #0x10]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
+                        ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+                        "ldr d1, [a_ptr1, #0x10]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                        "ldr temploadreg1, [a_ptr1, #0x18]\n"
+                        ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "ins v0.d[1], temploadreg0\n"
+                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "ins v1.d[1], temploadreg1\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                        ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                        ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                        ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                        ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr d5, [a_ptr1]\n"
+                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                        "ins v5.d[1], temploadreg1\n"
+                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                        "5:\n"
+                        "str q16, [%[c_ptr0]]\n"
+                        "str q17, [%[c_ptr0], #0x10]\n"
+                        "str q18, [%[c_ptr0], #0x20]\n"
+                        "str q19, [%[c_ptr0], #0x30]\n"
+                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                        "str q20, [c_ptr1]\n"
+                        "str q21, [c_ptr1, #0x10]\n"
+                        "str q22, [c_ptr1, #0x20]\n"
+                        "str q23, [c_ptr1, #0x30]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq temploadreg0\n"
+                        ".unreq temploadreg1\n"
+                        ".unreq temploadreg2\n"
+                        ".unreq temploadreg3\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+                    );
+                    break;
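+                    // Editorial note: case 3 below mirrors the two-row kernel above, adding a third
+                    // input/output row (a_ptr2 / c_ptr2) and accumulators v24-v27. The structure is the
+                    // same: label 1 is the beta-scaling path for existing C values, label 3 the unrolled
+                    // sdot inner loop (emitted as .word encodings), label 4 the short-K tail, label 5 the store.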
+                case 3:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "c_ptr1 .req X2\n"
+                        "c_ptr2 .req X3\n"
+                        "temploadreg0 .req X4\n"
+                        "temploadreg1 .req X5\n"
+                        "temploadreg2 .req X6\n"
+                        "temploadreg3 .req X7\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "cbz %[beta0], 1f\n"
+                        "movi v16.4s, #0\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "movi v17.4s, #0\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "movi v18.4s, #0\n"
+                        "ldr q2, [a_ptr2]\n"
+                        "movi v19.4s, #0\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "movi v20.4s, #0\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "movi v21.4s, #0\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "movi v22.4s, #0\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "movi v23.4s, #0\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "movi v24.4s, #0\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "movi v25.4s, #0\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "movi v26.4s, #0\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "movi v27.4s, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1r {v15.4s}, [%[betaptr]]\n"
+                        "ldr q16, [%[c_ptr0]]\n"
+                        "ldr q17, [%[c_ptr0], #0x10]\n"
+                        "ldr q18, [%[c_ptr0], #0x20]\n"
+                        "ldr q19, [%[c_ptr0], #0x30]\n"
+                        "mul v16.4s, v16.4s, v15.4s\n"
+                        "ldr q20, [c_ptr1]\n"
+                        "mul v17.4s, v17.4s, v15.4s\n"
+                        "ldr q21, [c_ptr1, #0x10]\n"
+                        "mul v18.4s, v18.4s, v15.4s\n"
+                        "ldr q22, [c_ptr1, #0x20]\n"
+                        "mul v19.4s, v19.4s, v15.4s\n"
+                        "ldr q23, [c_ptr1, #0x30]\n"
+                        "mul v20.4s, v20.4s, v15.4s\n"
+                        "ldr q24, [c_ptr2]\n"
+                        "mul v21.4s, v21.4s, v15.4s\n"
+                        "ldr q25, [c_ptr2, #0x10]\n"
+                        "mul v22.4s, v22.4s, v15.4s\n"
+                        "ldr q26, [c_ptr2, #0x20]\n"
+                        "mul v23.4s, v23.4s, v15.4s\n"
+                        "ldr q27, [c_ptr2, #0x30]\n"
+                        "mul v24.4s, v24.4s, v15.4s\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "mul v25.4s, v25.4s, v15.4s\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "mul v26.4s, v26.4s, v15.4s\n"
+                        "ldr q2, [a_ptr2]\n"
+                        "mul v27.4s, v27.4s, v15.4s\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                        "ldr d5, [a_ptr1]\n"
+                        ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                        "ldr d6, [a_ptr2]\n"
+                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
+                        ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                        "ins v5.d[1], temploadreg1\n"
+                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                        "ins v6.d[1], temploadreg2\n"
+                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+                        "ldr d0, [%[a_ptr0], #-0x10]\n"
+                        ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+                        ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+                        "ldr d1, [a_ptr1, #-0x10]\n"
+                        ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                        "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+                        ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                        "ins v0.d[1], temploadreg0\n"
+                        ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                        "ins v1.d[1], temploadreg1\n"
+                        ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                        "ldr d2, [a_ptr2, #-0x10]\n"
+                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                        "ldr temploadreg2, [a_ptr2, #-0x8]\n"
+                        ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "ins v2.d[1], temploadreg2\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "prfm PSTL1KEEP, [c_ptr1]\n"
+                        "prfm PSTL1KEEP, [c_ptr2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "cbz %[regs], 4f\n"
+                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                        "ldr d5, [a_ptr1]\n"
+                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                        "ldr d6, [a_ptr2]\n"
+                        ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
+                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                        "ins v5.d[1], temploadreg1\n"
+                        ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                        "ins v6.d[1], temploadreg2\n"
+                        ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                        "ldr d0, [%[a_ptr0], #0x10]\n"
+                        ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
+                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                        "ldr d1, [a_ptr1, #0x10]\n"
+                        ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+                        "ldr temploadreg1, [a_ptr1, #0x18]\n"
+                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                        "ldr d2, [a_ptr2, #0x10]\n"
+                        ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+                        "ldr temploadreg2, [a_ptr2, #0x18]\n"
+                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+                        "ins v0.d[1], temploadreg0\n"
+                        ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+                        "ins v1.d[1], temploadreg1\n"
+                        ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "ins v2.d[1], temploadreg2\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                        ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+                        ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                        ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+                        ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                        ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+                        ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                        ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+                        ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                        "ldr d5, [a_ptr1]\n"
+                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                        "ldr d6, [a_ptr2]\n"
+                        ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
+                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                        "ins v5.d[1], temploadreg1\n"
+                        ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                        "ins v6.d[1], temploadreg2\n"
+                        ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                        ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                        ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                        ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                        ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                        ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+                        "5:\n"
+                        "str q16, [%[c_ptr0]]\n"
+                        "str q17, [%[c_ptr0], #0x10]\n"
+                        "str q18, [%[c_ptr0], #0x20]\n"
+                        "str q19, [%[c_ptr0], #0x30]\n"
+                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                        "str q20, [c_ptr1]\n"
+                        "str q21, [c_ptr1, #0x10]\n"
+                        "str q22, [c_ptr1, #0x20]\n"
+                        "str q23, [c_ptr1, #0x30]\n"
+                        "str q24, [c_ptr2]\n"
+                        "str q25, [c_ptr2, #0x10]\n"
+                        "str q26, [c_ptr2, #0x20]\n"
+                        "str q27, [c_ptr2, #0x30]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        ".unreq temploadreg0\n"
+                        ".unreq temploadreg1\n"
+                        ".unreq temploadreg2\n"
+                        ".unreq temploadreg3\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
+                    );
+                    break;
+                default:
+                case 4:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "a_ptr3 .req X2\n"
+                        "c_ptr1 .req X3\n"
+                        "c_ptr2 .req X4\n"
+                        "c_ptr3 .req X5\n"
+                        "temploadreg0 .req X6\n"
+                        "temploadreg1 .req X7\n"
+                        "temploadreg2 .req X8\n"
+                        "temploadreg3 .req X9\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "add a_ptr3, a_ptr2, %[lda]\n"
+                        "add c_ptr3, c_ptr2, %[ldc]\n"
+                        "cbz %[beta0], 1f\n"
+                        "movi v16.4s, #0\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "movi v17.4s, #0\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "movi v18.4s, #0\n"
+                        "ldr q2, [a_ptr2]\n"
+                        "movi v19.4s, #0\n"
+                        "ldr q3, [a_ptr3]\n"
+                        "movi v20.4s, #0\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "movi v21.4s, #0\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "movi v22.4s, #0\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "movi v23.4s, #0\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "movi v24.4s, #0\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "movi v25.4s, #0\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "movi v26.4s, #0\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "movi v27.4s, #0\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "movi v28.4s, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "movi v29.4s, #0\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "movi v30.4s, #0\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "movi v31.4s, #0\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1r {v15.4s}, [%[betaptr]]\n"
+                        "ldr q16, [%[c_ptr0]]\n"
+                        "ldr q17, [%[c_ptr0], #0x10]\n"
+                        "ldr q18, [%[c_ptr0], #0x20]\n"
+                        "ldr q19, [%[c_ptr0], #0x30]\n"
+                        "mul v16.4s, v16.4s, v15.4s\n"
+                        "ldr q20, [c_ptr1]\n"
+                        "mul v17.4s, v17.4s, v15.4s\n"
+                        "ldr q21, [c_ptr1, #0x10]\n"
+                        "mul v18.4s, v18.4s, v15.4s\n"
+                        "ldr q22, [c_ptr1, #0x20]\n"
+                        "mul v19.4s, v19.4s, v15.4s\n"
+                        "ldr q23, [c_ptr1, #0x30]\n"
+                        "mul v20.4s, v20.4s, v15.4s\n"
+                        "ldr q24, [c_ptr2]\n"
+                        "mul v21.4s, v21.4s, v15.4s\n"
+                        "ldr q25, [c_ptr2, #0x10]\n"
+                        "mul v22.4s, v22.4s, v15.4s\n"
+                        "ldr q26, [c_ptr2, #0x20]\n"
+                        "mul v23.4s, v23.4s, v15.4s\n"
+                        "ldr q27, [c_ptr2, #0x30]\n"
+                        "mul v24.4s, v24.4s, v15.4s\n"
+                        "ldr q28, [c_ptr3]\n"
+                        "mul v25.4s, v25.4s, v15.4s\n"
+                        "ldr q29, [c_ptr3, #0x10]\n"
+                        "mul v26.4s, v26.4s, v15.4s\n"
+                        "ldr q30, [c_ptr3, #0x20]\n"
+                        "mul v27.4s, v27.4s, v15.4s\n"
+                        "ldr q31, [c_ptr3, #0x30]\n"
+                        "mul v28.4s, v28.4s, v15.4s\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "mul v29.4s, v29.4s, v15.4s\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "mul v30.4s, v30.4s, v15.4s\n"
+                        "ldr q2, [a_ptr2]\n"
+                        "mul v31.4s, v31.4s, v15.4s\n"
+                        "ldr q3, [a_ptr3]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr d5, [a_ptr1]\n"
+                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                        ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                        "ldr d6, [a_ptr2]\n"
+                        ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
+                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                        "ldr d7, [a_ptr3]\n"
+                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                        "ldr temploadreg3, [a_ptr3, #0x8]\n"
+                        ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+                        "ins v5.d[1], temploadreg1\n"
+                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+                        "ins v6.d[1], temploadreg2\n"
+                        ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                        "ins v7.d[1], temploadreg3\n"
+                        ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+                        "ldr d0, [%[a_ptr0], #-0x10]\n"
+                        ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+                        "ins v0.d[1], temploadreg0\n"
+                        ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+                        "ldr d1, [a_ptr1, #-0x10]\n"
+                        ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
+                        "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+                        "ins v1.d[1], temploadreg1\n"
+                        ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+                        "ldr d2, [a_ptr2, #-0x10]\n"
+                        ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
+                        "ldr temploadreg2, [a_ptr2, #-0x8]\n"
+                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+                        "ins v2.d[1], temploadreg2\n"
+                        ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+                        "add a_ptr3, a_ptr3, #0x20\n"
+                        ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+                        "ldr d3, [a_ptr3, #-0x10]\n"
+                        ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
+                        "ldr temploadreg3, [a_ptr3, #-0x8]\n"
+                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                        ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+                        "ins v3.d[1], temploadreg3\n"
+                        ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+                        ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "prfm PSTL1KEEP, [c_ptr1]\n"
+                        "prfm PSTL1KEEP, [c_ptr2]\n"
+                        "prfm PSTL1KEEP, [c_ptr3]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "cbz %[regs], 4f\n"
+                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                        "ldr d5, [a_ptr1]\n"
+                        ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr d6, [a_ptr2]\n"
+                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
+                        ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                        "ldr d7, [a_ptr3]\n"
+                        ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+                        "ldr temploadreg3, [a_ptr3, #0x8]\n"
+                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                        "ins v5.d[1], temploadreg1\n"
+                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+                        "ins v6.d[1], temploadreg2\n"
+                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+                        "ins v7.d[1], temploadreg3\n"
+                        ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
+                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                        ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                        "ldr d0, [%[a_ptr0], #0x10]\n"
+                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                        "ldr d1, [a_ptr1, #0x10]\n"
+                        ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+                        "ldr temploadreg1, [a_ptr1, #0x18]\n"
+                        ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                        "ldr d2, [a_ptr2, #0x10]\n"
+                        ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+                        "ldr temploadreg2, [a_ptr2, #0x18]\n"
+                        ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
+                        "ldr d3, [a_ptr3, #0x10]\n"
+                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+                        "ldr temploadreg3, [a_ptr3, #0x18]\n"
+                        ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
+                        "ins v0.d[1], temploadreg0\n"
+                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+                        "ins v1.d[1], temploadreg1\n"
+                        ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+                        "ins v2.d[1], temploadreg2\n"
+                        ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                        "ins v3.d[1], temploadreg3\n"
+                        ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+                        ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
+                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                        ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+                        ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+                        ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
+                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                        ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+                        ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+                        ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
+                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                        ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+                        ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+                        ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
+                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                        ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+                        ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+                        ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
+                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                        ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+                        ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+                        ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
+                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                        ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+                        ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+                        ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
+                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                        ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+                        ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+                        ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                        "ldr d5, [a_ptr1]\n"
+                        ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr d6, [a_ptr2]\n"
+                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
+                        ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                        "ldr d7, [a_ptr3]\n"
+                        ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+                        "ldr temploadreg3, [a_ptr3, #0x8]\n"
+                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                        "ins v5.d[1], temploadreg1\n"
+                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+                        "ins v6.d[1], temploadreg2\n"
+                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+                        "ins v7.d[1], temploadreg3\n"
+                        ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                        ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+                        ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                        ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+                        ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                        ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+                        ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                        ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+                        ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                        ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+                        ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                        ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+                        ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                        ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+                        ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+                        "5:\n"
+                        "str q16, [%[c_ptr0]]\n"
+                        "str q17, [%[c_ptr0], #0x10]\n"
+                        "str q18, [%[c_ptr0], #0x20]\n"
+                        "str q19, [%[c_ptr0], #0x30]\n"
+                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                        "str q20, [c_ptr1]\n"
+                        "str q21, [c_ptr1, #0x10]\n"
+                        "str q22, [c_ptr1, #0x20]\n"
+                        "str q23, [c_ptr1, #0x30]\n"
+                        "str q24, [c_ptr2]\n"
+                        "str q25, [c_ptr2, #0x10]\n"
+                        "str q26, [c_ptr2, #0x20]\n"
+                        "str q27, [c_ptr2, #0x30]\n"
+                        "str q28, [c_ptr3]\n"
+                        "str q29, [c_ptr3, #0x10]\n"
+                        "str q30, [c_ptr3, #0x20]\n"
+                        "str q31, [c_ptr3, #0x30]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq a_ptr3\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        ".unreq c_ptr3\n"
+                        ".unreq temploadreg0\n"
+                        ".unreq temploadreg1\n"
+                        ".unreq temploadreg2\n"
+                        ".unreq temploadreg3\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
+                    );
+                    break;
+            }
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp
new file mode 100644
index 0000000..0179139
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp
@@ -0,0 +1,1605 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <algorithm>
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void a64_hybrid_s8s32_dot_16x4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int32_t beta, int M, int N, int K) {
+    const long beta0 = (beta == 0);
+    const int K_stride = ((K + 3) / 4) * 4;
+    const long loops_count = ((K + 16) / 32) - 1;
+    K -= loops_count * 32;
+    const long regs_count = (K / 16) - 1;
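+    // Hedged reading of the K blocking above: loops_count is the number of main-loop
+    // iterations (32 K-values each); the last 32 or 16 K-values are left for the
+    // epilogue, selected by regs_count. K_stride rounds K up to a multiple of 4 to
+    // match the 4-way sdot accumulation.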
+
+    for (int y=0; y<M; y+=4) {
+        const int8_t * const a_ptr0_base = A + (y * lda);
+        const unsigned long ldab = lda * sizeof(int8_t);
+
+        int32_t *c_ptr0 = C + (y * ldc);
+        const unsigned long ldcb = ldc * sizeof(int32_t);
+
+        for (int x0=0; x0<N; x0+=16ul) {
+            const long width = std::min((unsigned long)N-x0, 16ul);
+            const int32_t *betaptr = &beta;
+            long loops = loops_count;
+            long regs = regs_count;
+            const int8_t *a_ptr0 = a_ptr0_base;
+            const int8_t *b_ptr0 = B + (K_stride * x0);
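+            // B appears to be pre-packed panel-major: each 16-column panel holds
+            // K_stride values per column contiguously, so the panel starting at
+            // column x0 begins at B + K_stride * x0 and is read sequentially below.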
+
+            switch(M-y) {
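+                // One specialised inline-assembly micro-kernel per count of remaining
+                // rows; the full 4-row kernel is presumably the default case later in
+                // the file, as in the companion variants.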
+                case 1:
+                    __asm __volatile (
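+                        // Accumulator setup: when beta == 0 ([beta0] is non-zero) the
+                        // fall-through zeroes v16-v19; otherwise label 1 loads the
+                        // existing C block and scales it by beta.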
+                        "cbz %[beta0], 1f\n"
+                        "movi v16.4s, #0\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "movi v17.4s, #0\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "movi v18.4s, #0\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "movi v19.4s, #0\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1r {v15.4s}, [%[betaptr]]\n"
+                        "ldr q16, [%[c_ptr0]]\n"
+                        "ldr q17, [%[c_ptr0], #0x10]\n"
+                        "ldr q18, [%[c_ptr0], #0x20]\n"
+                        "ldr q19, [%[c_ptr0], #0x30]\n"
+                        "mul v16.4s, v16.4s, v15.4s\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "mul v17.4s, v17.4s, v15.4s\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "mul v18.4s, v18.4s, v15.4s\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "mul v19.4s, v19.4s, v15.4s\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                        "ldr q0, [%[a_ptr0], #-0x10]\n"
+                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                        "cbz %[regs], 4f\n"
+                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                        "ldr q0, [%[a_ptr0], #0x10]\n"
+                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                        "5:\n"
+                        "str q16, [%[c_ptr0]]\n"
+                        "str q17, [%[c_ptr0], #0x10]\n"
+                        "str q18, [%[c_ptr0], #0x20]\n"
+                        "str q19, [%[c_ptr0], #0x30]\n"
+                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
+                    );
+                    break;
+                case 2:
+                    __asm __volatile (
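+                        // The 2-row variant binds X0/X1 to symbolic names for the second
+                        // row's A and C pointers via .req (released with .unreq at the
+                        // end), hence x0/x1 in the clobber list.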
+                        "a_ptr1 .req X0\n"
+                        "c_ptr1 .req X1\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "cbz %[beta0], 1f\n"
+                        "movi v16.4s, #0\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "movi v17.4s, #0\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "movi v18.4s, #0\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "movi v19.4s, #0\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "movi v20.4s, #0\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "movi v21.4s, #0\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "movi v22.4s, #0\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "movi v23.4s, #0\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1r {v15.4s}, [%[betaptr]]\n"
+                        "ldr q16, [%[c_ptr0]]\n"
+                        "ldr q17, [%[c_ptr0], #0x10]\n"
+                        "ldr q18, [%[c_ptr0], #0x20]\n"
+                        "ldr q19, [%[c_ptr0], #0x30]\n"
+                        "mul v16.4s, v16.4s, v15.4s\n"
+                        "ldr q20, [c_ptr1]\n"
+                        "mul v17.4s, v17.4s, v15.4s\n"
+                        "ldr q21, [c_ptr1, #0x10]\n"
+                        "mul v18.4s, v18.4s, v15.4s\n"
+                        "ldr q22, [c_ptr1, #0x20]\n"
+                        "mul v19.4s, v19.4s, v15.4s\n"
+                        "ldr q23, [c_ptr1, #0x30]\n"
+                        "mul v20.4s, v20.4s, v15.4s\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "mul v21.4s, v21.4s, v15.4s\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "mul v22.4s, v22.4s, v15.4s\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "mul v23.4s, v23.4s, v15.4s\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr q5, [a_ptr1]\n"
+                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                        "ldr q0, [%[a_ptr0], #-0x10]\n"
+                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                        "ldr q1, [a_ptr1, #-0x10]\n"
+                        ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                        ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                        ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                        ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                        ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                        ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                        ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                        ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                        ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                        ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                        ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                        ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                        ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                        ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                        ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                        "prfm PSTL1KEEP, [c_ptr1]\n"
+                        "cbz %[regs], 4f\n"
+                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                        "ldr q5, [a_ptr1]\n"
+                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                        "ldr q0, [%[a_ptr0], #0x10]\n"
+                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                        "ldr q1, [a_ptr1, #0x10]\n"
+                        ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                        ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                        ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                        ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                        ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                        ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                        ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                        ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                        ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                        ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                        ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                        ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                        ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                        ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                        ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                        ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                        "ldr q5, [a_ptr1]\n"
+                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                        "5:\n"
+                        "str q16, [%[c_ptr0]]\n"
+                        "str q17, [%[c_ptr0], #0x10]\n"
+                        "str q18, [%[c_ptr0], #0x20]\n"
+                        "str q19, [%[c_ptr0], #0x30]\n"
+                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                        "str q20, [c_ptr1]\n"
+                        "str q21, [c_ptr1, #0x10]\n"
+                        "str q22, [c_ptr1, #0x20]\n"
+                        "str q23, [c_ptr1, #0x30]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq c_ptr1\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
+                    );
+                    break;
+                case 3:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "c_ptr1 .req X2\n"
+                        "c_ptr2 .req X3\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "cbz %[beta0], 1f\n"
+                        "movi v16.4s, #0\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "movi v17.4s, #0\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "movi v18.4s, #0\n"
+                        "ldr q2, [a_ptr2]\n"
+                        "movi v19.4s, #0\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "movi v20.4s, #0\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "movi v21.4s, #0\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "movi v22.4s, #0\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "movi v23.4s, #0\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "movi v24.4s, #0\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "movi v25.4s, #0\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "movi v26.4s, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "movi v27.4s, #0\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1r {v15.4s}, [%[betaptr]]\n"
+                        "ldr q16, [%[c_ptr0]]\n"
+                        "ldr q17, [%[c_ptr0], #0x10]\n"
+                        "ldr q18, [%[c_ptr0], #0x20]\n"
+                        "ldr q19, [%[c_ptr0], #0x30]\n"
+                        "mul v16.4s, v16.4s, v15.4s\n"
+                        "ldr q20, [c_ptr1]\n"
+                        "mul v17.4s, v17.4s, v15.4s\n"
+                        "ldr q21, [c_ptr1, #0x10]\n"
+                        "mul v18.4s, v18.4s, v15.4s\n"
+                        "ldr q22, [c_ptr1, #0x20]\n"
+                        "mul v19.4s, v19.4s, v15.4s\n"
+                        "ldr q23, [c_ptr1, #0x30]\n"
+                        "mul v20.4s, v20.4s, v15.4s\n"
+                        "ldr q24, [c_ptr2]\n"
+                        "mul v21.4s, v21.4s, v15.4s\n"
+                        "ldr q25, [c_ptr2, #0x10]\n"
+                        "mul v22.4s, v22.4s, v15.4s\n"
+                        "ldr q26, [c_ptr2, #0x20]\n"
+                        "mul v23.4s, v23.4s, v15.4s\n"
+                        "ldr q27, [c_ptr2, #0x30]\n"
+                        "mul v24.4s, v24.4s, v15.4s\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "mul v25.4s, v25.4s, v15.4s\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "mul v26.4s, v26.4s, v15.4s\n"
+                        "ldr q2, [a_ptr2]\n"
+                        "mul v27.4s, v27.4s, v15.4s\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                        "ldr q5, [a_ptr1]\n"
+                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr q6, [a_ptr2]\n"
+                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                        ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                        ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                        ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                        ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                        ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                        ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                        ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                        ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                        ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                        ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                        ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                        ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                        "ldr q0, [%[a_ptr0], #-0x10]\n"
+                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                        "ldr q1, [a_ptr1, #-0x10]\n"
+                        ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                        "ldr q2, [a_ptr2, #-0x10]\n"
+                        ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+                        ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                        ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+                        ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                        ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+                        ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                        ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+                        ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                        ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+                        ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                        ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+                        ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                        ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+                        ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                        ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+                        ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+                        ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                        ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+                        ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                        ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+                        ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                        ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+                        ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                        ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+                        ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                        ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+                        ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                        ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+                        ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                        ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+                        ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                        "prfm PSTL1KEEP, [c_ptr1]\n"
+                        "prfm PSTL1KEEP, [c_ptr2]\n"
+                        "cbz %[regs], 4f\n"
+                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                        "ldr q5, [a_ptr1]\n"
+                        ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                        "ldr q6, [a_ptr2]\n"
+                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                        ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                        ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                        ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                        ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                        ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                        ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                        ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                        ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                        ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                        ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                        ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                        ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                        ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                        ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                        "ldr q0, [%[a_ptr0], #0x10]\n"
+                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                        "ldr q1, [a_ptr1, #0x10]\n"
+                        ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                        "ldr q2, [a_ptr2, #0x10]\n"
+                        ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+                        ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                        ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+                        ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                        ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+                        ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                        ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+                        ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                        ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+                        ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                        ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+                        ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                        ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+                        ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                        ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+                        ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                        ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+                        ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                        ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+                        ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                        ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+                        ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                        ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+                        ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                        ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+                        ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                        ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+                        ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                        ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+                        ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                        ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+                        ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                        "ldr q5, [a_ptr1]\n"
+                        ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                        "ldr q6, [a_ptr2]\n"
+                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                        ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                        ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                        ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                        ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                        ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                        ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                        ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                        ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                        ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                        ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                        ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                        ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                        ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                        ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                        ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+                        "5:\n"
+                        "str q16, [%[c_ptr0]]\n"
+                        "str q17, [%[c_ptr0], #0x10]\n"
+                        "str q18, [%[c_ptr0], #0x20]\n"
+                        "str q19, [%[c_ptr0], #0x30]\n"
+                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                        "str q20, [c_ptr1]\n"
+                        "str q21, [c_ptr1, #0x10]\n"
+                        "str q22, [c_ptr1, #0x20]\n"
+                        "str q23, [c_ptr1, #0x30]\n"
+                        "str q24, [c_ptr2]\n"
+                        "str q25, [c_ptr2, #0x10]\n"
+                        "str q26, [c_ptr2, #0x20]\n"
+                        "str q27, [c_ptr2, #0x30]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
+                    );
+                    break;
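+                // Default: at least four rows remain, so use the full four-row kernel.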
+                default:
+                case 4:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "a_ptr3 .req X2\n"
+                        "c_ptr1 .req X3\n"
+                        "c_ptr2 .req X4\n"
+                        "c_ptr3 .req X5\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "add a_ptr3, a_ptr2, %[lda]\n"
+                        "add c_ptr3, c_ptr2, %[ldc]\n"
+                        "cbz %[beta0], 1f\n"
+                        "movi v16.4s, #0\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "movi v17.4s, #0\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "movi v18.4s, #0\n"
+                        "ldr q2, [a_ptr2]\n"
+                        "movi v19.4s, #0\n"
+                        "ldr q3, [a_ptr3]\n"
+                        "movi v20.4s, #0\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "movi v21.4s, #0\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "movi v22.4s, #0\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "movi v23.4s, #0\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "movi v24.4s, #0\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "movi v25.4s, #0\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "movi v26.4s, #0\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "movi v27.4s, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "movi v28.4s, #0\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "movi v29.4s, #0\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "movi v30.4s, #0\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "movi v31.4s, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1r {v15.4s}, [%[betaptr]]\n"
+                        "ldr q16, [%[c_ptr0]]\n"
+                        "ldr q17, [%[c_ptr0], #0x10]\n"
+                        "ldr q18, [%[c_ptr0], #0x20]\n"
+                        "ldr q19, [%[c_ptr0], #0x30]\n"
+                        "mul v16.4s, v16.4s, v15.4s\n"
+                        "ldr q20, [c_ptr1]\n"
+                        "mul v17.4s, v17.4s, v15.4s\n"
+                        "ldr q21, [c_ptr1, #0x10]\n"
+                        "mul v18.4s, v18.4s, v15.4s\n"
+                        "ldr q22, [c_ptr1, #0x20]\n"
+                        "mul v19.4s, v19.4s, v15.4s\n"
+                        "ldr q23, [c_ptr1, #0x30]\n"
+                        "mul v20.4s, v20.4s, v15.4s\n"
+                        "ldr q24, [c_ptr2]\n"
+                        "mul v21.4s, v21.4s, v15.4s\n"
+                        "ldr q25, [c_ptr2, #0x10]\n"
+                        "mul v22.4s, v22.4s, v15.4s\n"
+                        "ldr q26, [c_ptr2, #0x20]\n"
+                        "mul v23.4s, v23.4s, v15.4s\n"
+                        "ldr q27, [c_ptr2, #0x30]\n"
+                        "mul v24.4s, v24.4s, v15.4s\n"
+                        "ldr q28, [c_ptr3]\n"
+                        "mul v25.4s, v25.4s, v15.4s\n"
+                        "ldr q29, [c_ptr3, #0x10]\n"
+                        "mul v26.4s, v26.4s, v15.4s\n"
+                        "ldr q30, [c_ptr3, #0x20]\n"
+                        "mul v27.4s, v27.4s, v15.4s\n"
+                        "ldr q31, [c_ptr3, #0x30]\n"
+                        "mul v28.4s, v28.4s, v15.4s\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "mul v29.4s, v29.4s, v15.4s\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "mul v30.4s, v30.4s, v15.4s\n"
+                        "ldr q2, [a_ptr2]\n"
+                        "mul v31.4s, v31.4s, v15.4s\n"
+                        "ldr q3, [a_ptr3]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                        "ldr q5, [a_ptr1]\n"
+                        ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+                        "ldr q6, [a_ptr2]\n"
+                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr q7, [a_ptr3]\n"
+                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                        "add a_ptr3, a_ptr3, #0x20\n"
+                        ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                        ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                        "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+                        ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+                        ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                        ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+                        ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                        ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+                        ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                        ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+                        ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                        ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+                        ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                        ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+                        ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                        ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+                        ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                        ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+                        ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                        ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+                        ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                        ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+                        ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                        ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+                        ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                        "ldr q0, [%[a_ptr0], #-0x10]\n"
+                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                        "ldr q1, [a_ptr1, #-0x10]\n"
+                        ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+                        "ldr q2, [a_ptr2, #-0x10]\n"
+                        ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                        "ldr q3, [a_ptr3, #-0x10]\n"
+                        ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+                        ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+                        ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                        ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+                        ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+                        ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                        ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+                        ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+                        ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                        ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+                        ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+                        ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                        ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+                        ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+                        ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                        ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+                        ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+                        ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                        ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+                        ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+                        ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                        ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+                        ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+                        ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+                        ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+                        ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                        ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+                        ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+                        ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                        ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+                        ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+                        ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                        ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+                        ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+                        ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                        ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+                        ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+                        ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                        ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+                        ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+                        ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                        ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+                        ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+                        ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                        ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+                        ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+                        ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                        "prfm PSTL1KEEP, [c_ptr1]\n"
+                        "prfm PSTL1KEEP, [c_ptr2]\n"
+                        "prfm PSTL1KEEP, [c_ptr3]\n"
+                        "cbz %[regs], 4f\n"
+                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                        "ldr q5, [a_ptr1]\n"
+                        ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                        "ldr q6, [a_ptr2]\n"
+                        ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+                        "ldr q7, [a_ptr3]\n"
+                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                        ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                        ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                        ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                        ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                        ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                        ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                        ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+                        ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                        ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+                        ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                        ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+                        ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                        ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+                        ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                        ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+                        ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                        ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+                        ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                        ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+                        ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                        ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+                        ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                        ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+                        ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                        ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+                        ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                        ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+                        ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                        "ldr q0, [%[a_ptr0], #0x10]\n"
+                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                        "ldr q1, [a_ptr1, #0x10]\n"
+                        ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+                        "ldr q2, [a_ptr2, #0x10]\n"
+                        ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                        "ldr q3, [a_ptr3, #0x10]\n"
+                        ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+                        ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+                        ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                        ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+                        ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+                        ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                        ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+                        ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+                        ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                        ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+                        ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+                        ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                        ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+                        ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+                        ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                        ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+                        ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+                        ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                        ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+                        ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+                        ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                        ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+                        ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+                        ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                        ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+                        ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+                        ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
+                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                        ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+                        ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+                        ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
+                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                        ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+                        ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+                        ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
+                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                        ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+                        ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+                        ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
+                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                        ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+                        ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+                        ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
+                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                        ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+                        ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+                        ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
+                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                        ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+                        ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+                        ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
+                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                        ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+                        ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+                        ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                        "ldr q5, [a_ptr1]\n"
+                        ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                        "ldr q6, [a_ptr2]\n"
+                        ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+                        "ldr q7, [a_ptr3]\n"
+                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                        ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                        ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                        ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                        ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                        ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                        ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                        ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+                        ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                        ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+                        ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                        ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+                        ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                        ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+                        ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                        ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+                        ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                        ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+                        ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                        ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+                        ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                        ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+                        ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                        ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+                        ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                        ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+                        ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                        ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+                        ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                        ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+                        ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+                        "5:\n"
+                        "str q16, [%[c_ptr0]]\n"
+                        "str q17, [%[c_ptr0], #0x10]\n"
+                        "str q18, [%[c_ptr0], #0x20]\n"
+                        "str q19, [%[c_ptr0], #0x30]\n"
+                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                        "str q20, [c_ptr1]\n"
+                        "str q21, [c_ptr1, #0x10]\n"
+                        "str q22, [c_ptr1, #0x20]\n"
+                        "str q23, [c_ptr1, #0x30]\n"
+                        "str q24, [c_ptr2]\n"
+                        "str q25, [c_ptr2, #0x10]\n"
+                        "str q26, [c_ptr2, #0x20]\n"
+                        "str q27, [c_ptr2, #0x30]\n"
+                        "str q28, [c_ptr3]\n"
+                        "str q29, [c_ptr3, #0x10]\n"
+                        "str q30, [c_ptr3, #0x20]\n"
+                        "str q31, [c_ptr3, #0x30]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq a_ptr3\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        ".unreq c_ptr3\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+                    );
+                    break;
+            }
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp
new file mode 100644
index 0000000..7fb9b5c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <cstdint>
+#include "../std_transforms_fixed.hpp"
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void a64_hybrid_u8u32_dot_16x4(const uint8_t *, int, const uint8_t *, uint32_t *, int, uint32_t, int, int, int);
+void a64_hybrid_u8u32_dot_16x4_a55(const uint8_t *, int, const uint8_t *, uint32_t *, int, uint32_t, int, int, int);
+
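+// Hybrid GEMM kernel taking uint8_t operands and accumulating into uint32_t,
+// using dot-product instructions on 16-wide x 4-high blocks of output.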
+class hybrid_u8u32_dot_16x4
+{
+public:
+    typedef uint8_t operand_type;
+    typedef uint32_t result_type;
+
+    typedef void (*kern_type)(const uint8_t *, int, const uint8_t *, uint32_t *, int, uint32_t, int, int, int);
+
+    /* Kernel blocking parameters */
+    static unsigned int out_height()
+    {
+        return 4;
+    }
+
+    static unsigned int out_width()
+    {
+        return 16;
+    }
+
+    static unsigned int k_unroll()
+    {
+        return 4;
+    }
+
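+    // Use the standard fixed-format transforms for this 4x16, k-unroll-4 block shape.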
+    StdTransformsFixed<operand_type, result_type, 4, 16, 4> transforms = {};
+
+    // Default to the generic kernel
+    kern_type kernel = a64_hybrid_u8u32_dot_16x4;
+
+    hybrid_u8u32_dot_16x4(const CPUInfo *ci)
+    {
+        if (ci->get_cpu_model() == CPUModel::A55r1) {
+            kernel = a64_hybrid_u8u32_dot_16x4_a55;
+        }
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp
new file mode 100644
index 0000000..230ecdc
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp
@@ -0,0 +1,2271 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <algorithm>
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void a64_hybrid_u8u32_dot_16x4_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, uint32_t beta, int M, int N, int K) {
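+    // K_stride rounds K up to the 4-byte dot-product (udot) unroll used for packed B.
+    // loops_count/regs_count split K into 32-byte main-loop iterations plus a
+    // 16-byte remainder path; see the numbered labels in the assembly below.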
+    const long beta0 = (beta == 0u);
+    const int K_stride = ((K + 3) / 4) * 4;
+    const long loops_count = ((K + 16) / 32) - 1;
+    K -= loops_count * 32;
+    const long regs_count = (K / 16) - 1;
+
+    for (int y=0; y<M; y+=4) {
+        const uint8_t * const a_ptr0_base = A + (y * lda);
+        const unsigned long ldab = lda * sizeof(uint8_t);
+
+        uint32_t *c_ptr0 = C + (y * ldc);
+        const unsigned long ldcb = ldc * sizeof(uint32_t);
+
+        for (int x0=0; x0<N; x0+=16ul) {
+            const long width = std::min((unsigned long)N-x0, 16ul);
+            const uint32_t *betaptr = &beta;
+            long loops = loops_count;
+            long regs = regs_count;
+            const uint8_t *a_ptr0 = a_ptr0_base;
+            const uint8_t *b_ptr0 = B + (K_stride * x0);
+
+            switch(M-y) {
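+                // Dispatch on the number of output rows left in this block (M - y):
+                // each case is a fully unrolled variant sized to that row count,
+                // accumulating four q-registers of results per row (v16-v19 for
+                // row 0, v20-v23 for row 1, and so on).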
+                case 1:
+                    __asm __volatile (
+                        "temploadreg0 .req X0\n"
+                        "temploadreg1 .req X1\n"
+                        "temploadreg2 .req X2\n"
+                        "temploadreg3 .req X3\n"
+                        "cbz %[beta0], 1f\n"
+                        "movi v16.4s, #0\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "movi v17.4s, #0\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "movi v18.4s, #0\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "movi v19.4s, #0\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1r {v15.4s}, [%[betaptr]]\n"
+                        "ldr q16, [%[c_ptr0]]\n"
+                        "ldr q17, [%[c_ptr0], #0x10]\n"
+                        "ldr q18, [%[c_ptr0], #0x20]\n"
+                        "ldr q19, [%[c_ptr0], #0x30]\n"
+                        "mul v16.4s, v16.4s, v15.4s\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "mul v17.4s, v17.4s, v15.4s\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "mul v18.4s, v18.4s, v15.4s\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "mul v19.4s, v19.4s, v15.4s\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "ldr d0, [%[a_ptr0], #-0x10]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                        "ins v0.d[1], temploadreg0\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "cbz %[regs], 4f\n"
+                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "ldr d0, [%[a_ptr0], #0x10]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                        "ins v0.d[1], temploadreg0\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                        "5:\n"
+                        "str q16, [%[c_ptr0]]\n"
+                        "str q17, [%[c_ptr0], #0x10]\n"
+                        "str q18, [%[c_ptr0], #0x20]\n"
+                        "str q19, [%[c_ptr0], #0x30]\n"
+                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                        ".unreq temploadreg0\n"
+                        ".unreq temploadreg1\n"
+                        ".unreq temploadreg2\n"
+                        ".unreq temploadreg3\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
+                    );
+                    break;
+                case 2:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "c_ptr1 .req X1\n"
+                        "temploadreg0 .req X2\n"
+                        "temploadreg1 .req X3\n"
+                        "temploadreg2 .req X4\n"
+                        "temploadreg3 .req X5\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "cbz %[beta0], 1f\n"
+                        "movi v16.4s, #0\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "movi v17.4s, #0\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "movi v18.4s, #0\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "movi v19.4s, #0\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "movi v20.4s, #0\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "movi v21.4s, #0\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "movi v22.4s, #0\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "movi v23.4s, #0\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1r {v15.4s}, [%[betaptr]]\n"
+                        "ldr q16, [%[c_ptr0]]\n"
+                        "ldr q17, [%[c_ptr0], #0x10]\n"
+                        "ldr q18, [%[c_ptr0], #0x20]\n"
+                        "ldr q19, [%[c_ptr0], #0x30]\n"
+                        "mul v16.4s, v16.4s, v15.4s\n"
+                        "ldr q20, [c_ptr1]\n"
+                        "mul v17.4s, v17.4s, v15.4s\n"
+                        "ldr q21, [c_ptr1, #0x10]\n"
+                        "mul v18.4s, v18.4s, v15.4s\n"
+                        "ldr q22, [c_ptr1, #0x20]\n"
+                        "mul v19.4s, v19.4s, v15.4s\n"
+                        "ldr q23, [c_ptr1, #0x30]\n"
+                        "mul v20.4s, v20.4s, v15.4s\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "mul v21.4s, v21.4s, v15.4s\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "mul v22.4s, v22.4s, v15.4s\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "mul v23.4s, v23.4s, v15.4s\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                        "ldr d5, [a_ptr1]\n"
+                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                        "ins v5.d[1], temploadreg1\n"
+                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+                        "ldr d0, [%[a_ptr0], #-0x10]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+                        ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+                        "ldr d1, [a_ptr1, #-0x10]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                        "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+                        ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "ins v0.d[1], temploadreg0\n"
+                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "ins v1.d[1], temploadreg1\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "prfm PSTL1KEEP, [c_ptr1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "cbz %[regs], 4f\n"
+                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr d5, [a_ptr1]\n"
+                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                        "ins v5.d[1], temploadreg1\n"
+                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+                        "ldr d0, [%[a_ptr0], #0x10]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
+                        ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+                        "ldr d1, [a_ptr1, #0x10]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                        "ldr temploadreg1, [a_ptr1, #0x18]\n"
+                        ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "ins v0.d[1], temploadreg0\n"
+                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "ins v1.d[1], temploadreg1\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                        ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                        ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                        ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                        ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr d5, [a_ptr1]\n"
+                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                        "ins v5.d[1], temploadreg1\n"
+                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                        "5:\n"
+                        "str q16, [%[c_ptr0]]\n"
+                        "str q17, [%[c_ptr0], #0x10]\n"
+                        "str q18, [%[c_ptr0], #0x20]\n"
+                        "str q19, [%[c_ptr0], #0x30]\n"
+                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                        "str q20, [c_ptr1]\n"
+                        "str q21, [c_ptr1, #0x10]\n"
+                        "str q22, [c_ptr1, #0x20]\n"
+                        "str q23, [c_ptr1, #0x30]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq temploadreg0\n"
+                        ".unreq temploadreg1\n"
+                        ".unreq temploadreg2\n"
+                        ".unreq temploadreg3\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+                    );
+                    break;
+                case 3:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "c_ptr1 .req X2\n"
+                        "c_ptr2 .req X3\n"
+                        "temploadreg0 .req X4\n"
+                        "temploadreg1 .req X5\n"
+                        "temploadreg2 .req X6\n"
+                        "temploadreg3 .req X7\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "cbz %[beta0], 1f\n"
+                        "movi v16.4s, #0\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "movi v17.4s, #0\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "movi v18.4s, #0\n"
+                        "ldr q2, [a_ptr2]\n"
+                        "movi v19.4s, #0\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "movi v20.4s, #0\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "movi v21.4s, #0\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "movi v22.4s, #0\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "movi v23.4s, #0\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "movi v24.4s, #0\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "movi v25.4s, #0\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "movi v26.4s, #0\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "movi v27.4s, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1r {v15.4s}, [%[betaptr]]\n"
+                        "ldr q16, [%[c_ptr0]]\n"
+                        "ldr q17, [%[c_ptr0], #0x10]\n"
+                        "ldr q18, [%[c_ptr0], #0x20]\n"
+                        "ldr q19, [%[c_ptr0], #0x30]\n"
+                        "mul v16.4s, v16.4s, v15.4s\n"
+                        "ldr q20, [c_ptr1]\n"
+                        "mul v17.4s, v17.4s, v15.4s\n"
+                        "ldr q21, [c_ptr1, #0x10]\n"
+                        "mul v18.4s, v18.4s, v15.4s\n"
+                        "ldr q22, [c_ptr1, #0x20]\n"
+                        "mul v19.4s, v19.4s, v15.4s\n"
+                        "ldr q23, [c_ptr1, #0x30]\n"
+                        "mul v20.4s, v20.4s, v15.4s\n"
+                        "ldr q24, [c_ptr2]\n"
+                        "mul v21.4s, v21.4s, v15.4s\n"
+                        "ldr q25, [c_ptr2, #0x10]\n"
+                        "mul v22.4s, v22.4s, v15.4s\n"
+                        "ldr q26, [c_ptr2, #0x20]\n"
+                        "mul v23.4s, v23.4s, v15.4s\n"
+                        "ldr q27, [c_ptr2, #0x30]\n"
+                        "mul v24.4s, v24.4s, v15.4s\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "mul v25.4s, v25.4s, v15.4s\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "mul v26.4s, v26.4s, v15.4s\n"
+                        "ldr q2, [a_ptr2]\n"
+                        "mul v27.4s, v27.4s, v15.4s\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                        "ldr d5, [a_ptr1]\n"
+                        ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                        "ldr d6, [a_ptr2]\n"
+                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
+                        ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                        "ins v5.d[1], temploadreg1\n"
+                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                        "ins v6.d[1], temploadreg2\n"
+                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+                        "ldr d0, [%[a_ptr0], #-0x10]\n"
+                        ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+                        ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+                        "ldr d1, [a_ptr1, #-0x10]\n"
+                        ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                        "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+                        ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                        "ins v0.d[1], temploadreg0\n"
+                        ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                        "ins v1.d[1], temploadreg1\n"
+                        ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                        "ldr d2, [a_ptr2, #-0x10]\n"
+                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                        "ldr temploadreg2, [a_ptr2, #-0x8]\n"
+                        ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "ins v2.d[1], temploadreg2\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "prfm PSTL1KEEP, [c_ptr1]\n"
+                        "prfm PSTL1KEEP, [c_ptr2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "cbz %[regs], 4f\n"
+                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                        "ldr d5, [a_ptr1]\n"
+                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                        "ldr d6, [a_ptr2]\n"
+                        ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
+                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                        "ins v5.d[1], temploadreg1\n"
+                        ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                        "ins v6.d[1], temploadreg2\n"
+                        ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                        "ldr d0, [%[a_ptr0], #0x10]\n"
+                        ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
+                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                        "ldr d1, [a_ptr1, #0x10]\n"
+                        ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+                        "ldr temploadreg1, [a_ptr1, #0x18]\n"
+                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                        "ldr d2, [a_ptr2, #0x10]\n"
+                        ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+                        "ldr temploadreg2, [a_ptr2, #0x18]\n"
+                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+                        "ins v0.d[1], temploadreg0\n"
+                        ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+                        "ins v1.d[1], temploadreg1\n"
+                        ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "ins v2.d[1], temploadreg2\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                        ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+                        ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                        ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+                        ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                        ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+                        ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                        ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+                        ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                        "ldr d5, [a_ptr1]\n"
+                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                        "ldr d6, [a_ptr2]\n"
+                        ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
+                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                        "ins v5.d[1], temploadreg1\n"
+                        ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                        "ins v6.d[1], temploadreg2\n"
+                        ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                        ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                        ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                        ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                        ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                        ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+                        "5:\n"
+                        "str q16, [%[c_ptr0]]\n"
+                        "str q17, [%[c_ptr0], #0x10]\n"
+                        "str q18, [%[c_ptr0], #0x20]\n"
+                        "str q19, [%[c_ptr0], #0x30]\n"
+                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                        "str q20, [c_ptr1]\n"
+                        "str q21, [c_ptr1, #0x10]\n"
+                        "str q22, [c_ptr1, #0x20]\n"
+                        "str q23, [c_ptr1, #0x30]\n"
+                        "str q24, [c_ptr2]\n"
+                        "str q25, [c_ptr2, #0x10]\n"
+                        "str q26, [c_ptr2, #0x20]\n"
+                        "str q27, [c_ptr2, #0x30]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        ".unreq temploadreg0\n"
+                        ".unreq temploadreg1\n"
+                        ".unreq temploadreg2\n"
+                        ".unreq temploadreg3\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
+                    );
+                    break;
+                default:
+                case 4:
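+                    // Default/full case: four M-rows in flight (a_ptr1-a_ptr3, c_ptr1-c_ptr3); accumulators v16-v31 hold the 4x16 output tile.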
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "a_ptr3 .req X2\n"
+                        "c_ptr1 .req X3\n"
+                        "c_ptr2 .req X4\n"
+                        "c_ptr3 .req X5\n"
+                        "temploadreg0 .req X6\n"
+                        "temploadreg1 .req X7\n"
+                        "temploadreg2 .req X8\n"
+                        "temploadreg3 .req X9\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "add a_ptr3, a_ptr2, %[lda]\n"
+                        "add c_ptr3, c_ptr2, %[ldc]\n"
+                        "cbz %[beta0], 1f\n"
+                        "movi v16.4s, #0\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "movi v17.4s, #0\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "movi v18.4s, #0\n"
+                        "ldr q2, [a_ptr2]\n"
+                        "movi v19.4s, #0\n"
+                        "ldr q3, [a_ptr3]\n"
+                        "movi v20.4s, #0\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "movi v21.4s, #0\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "movi v22.4s, #0\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "movi v23.4s, #0\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "movi v24.4s, #0\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "movi v25.4s, #0\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "movi v26.4s, #0\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "movi v27.4s, #0\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "movi v28.4s, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "movi v29.4s, #0\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "movi v30.4s, #0\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "movi v31.4s, #0\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1r {v15.4s}, [%[betaptr]]\n"
+                        "ldr q16, [%[c_ptr0]]\n"
+                        "ldr q17, [%[c_ptr0], #0x10]\n"
+                        "ldr q18, [%[c_ptr0], #0x20]\n"
+                        "ldr q19, [%[c_ptr0], #0x30]\n"
+                        "mul v16.4s, v16.4s, v15.4s\n"
+                        "ldr q20, [c_ptr1]\n"
+                        "mul v17.4s, v17.4s, v15.4s\n"
+                        "ldr q21, [c_ptr1, #0x10]\n"
+                        "mul v18.4s, v18.4s, v15.4s\n"
+                        "ldr q22, [c_ptr1, #0x20]\n"
+                        "mul v19.4s, v19.4s, v15.4s\n"
+                        "ldr q23, [c_ptr1, #0x30]\n"
+                        "mul v20.4s, v20.4s, v15.4s\n"
+                        "ldr q24, [c_ptr2]\n"
+                        "mul v21.4s, v21.4s, v15.4s\n"
+                        "ldr q25, [c_ptr2, #0x10]\n"
+                        "mul v22.4s, v22.4s, v15.4s\n"
+                        "ldr q26, [c_ptr2, #0x20]\n"
+                        "mul v23.4s, v23.4s, v15.4s\n"
+                        "ldr q27, [c_ptr2, #0x30]\n"
+                        "mul v24.4s, v24.4s, v15.4s\n"
+                        "ldr q28, [c_ptr3]\n"
+                        "mul v25.4s, v25.4s, v15.4s\n"
+                        "ldr q29, [c_ptr3, #0x10]\n"
+                        "mul v26.4s, v26.4s, v15.4s\n"
+                        "ldr q30, [c_ptr3, #0x20]\n"
+                        "mul v27.4s, v27.4s, v15.4s\n"
+                        "ldr q31, [c_ptr3, #0x30]\n"
+                        "mul v28.4s, v28.4s, v15.4s\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "mul v29.4s, v29.4s, v15.4s\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "mul v30.4s, v30.4s, v15.4s\n"
+                        "ldr q2, [a_ptr2]\n"
+                        "mul v31.4s, v31.4s, v15.4s\n"
+                        "ldr q3, [a_ptr3]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr d5, [a_ptr1]\n"
+                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                        ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                        "ldr d6, [a_ptr2]\n"
+                        ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
+                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                        "ldr d7, [a_ptr3]\n"
+                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                        "ldr temploadreg3, [a_ptr3, #0x8]\n"
+                        ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+                        "ins v5.d[1], temploadreg1\n"
+                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+                        "ins v6.d[1], temploadreg2\n"
+                        ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                        "ins v7.d[1], temploadreg3\n"
+                        ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+                        "ldr d0, [%[a_ptr0], #-0x10]\n"
+                        ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+                        "ins v0.d[1], temploadreg0\n"
+                        ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+                        "ldr d1, [a_ptr1, #-0x10]\n"
+                        ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
+                        "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+                        "ins v1.d[1], temploadreg1\n"
+                        ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+                        "ldr d2, [a_ptr2, #-0x10]\n"
+                        ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
+                        "ldr temploadreg2, [a_ptr2, #-0x8]\n"
+                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+                        "ins v2.d[1], temploadreg2\n"
+                        ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+                        "add a_ptr3, a_ptr3, #0x20\n"
+                        ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+                        "ldr d3, [a_ptr3, #-0x10]\n"
+                        ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
+                        "ldr temploadreg3, [a_ptr3, #-0x8]\n"
+                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                        ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+                        "ins v3.d[1], temploadreg3\n"
+                        ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+                        ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "prfm PSTL1KEEP, [c_ptr1]\n"
+                        "prfm PSTL1KEEP, [c_ptr2]\n"
+                        "prfm PSTL1KEEP, [c_ptr3]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "cbz %[regs], 4f\n"
+                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                        "ldr d5, [a_ptr1]\n"
+                        ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr d6, [a_ptr2]\n"
+                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
+                        ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                        "ldr d7, [a_ptr3]\n"
+                        ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+                        "ldr temploadreg3, [a_ptr3, #0x8]\n"
+                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                        "ins v5.d[1], temploadreg1\n"
+                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+                        "ins v6.d[1], temploadreg2\n"
+                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+                        "ins v7.d[1], temploadreg3\n"
+                        ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
+                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                        ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                        "ldr d0, [%[a_ptr0], #0x10]\n"
+                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                        "ldr d1, [a_ptr1, #0x10]\n"
+                        ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+                        "ldr temploadreg1, [a_ptr1, #0x18]\n"
+                        ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                        "ldr d2, [a_ptr2, #0x10]\n"
+                        ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+                        "ldr temploadreg2, [a_ptr2, #0x18]\n"
+                        ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
+                        "ldr d3, [a_ptr3, #0x10]\n"
+                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+                        "ldr temploadreg3, [a_ptr3, #0x18]\n"
+                        ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
+                        "ins v0.d[1], temploadreg0\n"
+                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+                        "ins v1.d[1], temploadreg1\n"
+                        ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+                        "ins v2.d[1], temploadreg2\n"
+                        ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                        "ins v3.d[1], temploadreg3\n"
+                        ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+                        ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
+                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                        ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+                        ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+                        ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
+                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                        ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+                        ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+                        ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
+                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                        ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+                        ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+                        ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
+                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                        ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+                        ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+                        ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
+                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                        ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+                        ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+                        ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
+                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                        ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+                        ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+                        ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
+                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                        ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+                        ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+                        ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                        "ldr d5, [a_ptr1]\n"
+                        ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr d6, [a_ptr2]\n"
+                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
+                        ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                        "ldr d7, [a_ptr3]\n"
+                        ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+                        "ldr temploadreg3, [a_ptr3, #0x8]\n"
+                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                        "ins v5.d[1], temploadreg1\n"
+                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+                        "ins v6.d[1], temploadreg2\n"
+                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+                        "ins v7.d[1], temploadreg3\n"
+                        ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                        ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+                        ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                        ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+                        ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                        ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+                        ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                        ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+                        ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                        ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+                        ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                        ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+                        ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                        ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+                        ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+                        "5:\n"
+                        "str q16, [%[c_ptr0]]\n"
+                        "str q17, [%[c_ptr0], #0x10]\n"
+                        "str q18, [%[c_ptr0], #0x20]\n"
+                        "str q19, [%[c_ptr0], #0x30]\n"
+                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                        "str q20, [c_ptr1]\n"
+                        "str q21, [c_ptr1, #0x10]\n"
+                        "str q22, [c_ptr1, #0x20]\n"
+                        "str q23, [c_ptr1, #0x30]\n"
+                        "str q24, [c_ptr2]\n"
+                        "str q25, [c_ptr2, #0x10]\n"
+                        "str q26, [c_ptr2, #0x20]\n"
+                        "str q27, [c_ptr2, #0x30]\n"
+                        "str q28, [c_ptr3]\n"
+                        "str q29, [c_ptr3, #0x10]\n"
+                        "str q30, [c_ptr3, #0x20]\n"
+                        "str q31, [c_ptr3, #0x30]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq a_ptr3\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        ".unreq c_ptr3\n"
+                        ".unreq temploadreg0\n"
+                        ".unreq temploadreg1\n"
+                        ".unreq temploadreg2\n"
+                        ".unreq temploadreg3\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
+                    );
+                    break;
+            }
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
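(Editor's note, not part of the patch: the hybrid u8u32 dot-product kernels in the file above and in the new file below are hand-scheduled implementations of the same basic operation, C = beta * C + A * B, with uint8_t inputs accumulated into uint32_t outputs, processed in blocks of 4 rows of A and 16 columns of B per inner-assembly call. The scalar sketch below is only an illustration of that computation under simplified assumptions; the name reference_u8u32_gemm is hypothetical, and it ignores the B-panel packing/K_stride blocking that the real kernels rely on.)

    #include <cstdint>

    // Hypothetical scalar reference for the u8 -> u32 hybrid dot-product GEMM:
    // computes C = beta * C + A * B, using plain row-major A, B and C.
    // The assembly kernels produce the same values, but read B from a
    // pre-packed 16-column panel layout and unroll K in chunks of 16/32.
    static void reference_u8u32_gemm(const uint8_t *A, int lda,
                                     const uint8_t *B, int ldb,
                                     uint32_t *C, int ldc,
                                     uint32_t beta, int M, int N, int K) {
        for (int y = 0; y < M; y++) {
            for (int x = 0; x < N; x++) {
                // beta == 0 means "overwrite C"; otherwise scale the existing value,
                // mirroring the movi-vs-mul paths at the top of each assembly block.
                uint32_t acc = (beta == 0u) ? 0u : beta * C[y * ldc + x];
                for (int k = 0; k < K; k++) {
                    acc += static_cast<uint32_t>(A[y * lda + k]) *
                           static_cast<uint32_t>(B[k * ldb + x]);
                }
                C[y * ldc + x] = acc;
            }
        }
    }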
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp
new file mode 100644
index 0000000..dbef029
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp
@@ -0,0 +1,1605 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <algorithm>
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void a64_hybrid_u8u32_dot_16x4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, uint32_t beta, int M, int N, int K) {
+    const long beta0 = (beta == 0u);
+    const int K_stride = ((K + 3) / 4) * 4;
+    const long loops_count = ((K + 16) / 32) - 1;
+    K -= loops_count * 32;
+    const long regs_count = (K / 16) - 1;
+
+    for (int y=0; y<M; y+=4) {
+        const uint8_t * const a_ptr0_base = A + (y * lda);
+        const unsigned long ldab = lda * sizeof(uint8_t);
+
+        uint32_t *c_ptr0 = C + (y * ldc);
+        const unsigned long ldcb = ldc * sizeof(uint32_t);
+
+        for (int x0=0; x0<N; x0+=16ul) {
+            const long width = std::min((unsigned long)N-x0, 16ul);
+            const uint32_t *betaptr = &beta;
+            long loops = loops_count;
+            long regs = regs_count;
+            const uint8_t *a_ptr0 = a_ptr0_base;
+            const uint8_t *b_ptr0 = B + (K_stride * x0);
+
+            switch(M-y) {
+                case 1:
+                    __asm __volatile (
+                        "cbz %[beta0], 1f\n"
+                        "movi v16.4s, #0\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "movi v17.4s, #0\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "movi v18.4s, #0\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "movi v19.4s, #0\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1r {v15.4s}, [%[betaptr]]\n"
+                        "ldr q16, [%[c_ptr0]]\n"
+                        "ldr q17, [%[c_ptr0], #0x10]\n"
+                        "ldr q18, [%[c_ptr0], #0x20]\n"
+                        "ldr q19, [%[c_ptr0], #0x30]\n"
+                        "mul v16.4s, v16.4s, v15.4s\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "mul v17.4s, v17.4s, v15.4s\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "mul v18.4s, v18.4s, v15.4s\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "mul v19.4s, v19.4s, v15.4s\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                        "ldr q0, [%[a_ptr0], #-0x10]\n"
+                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                        "cbz %[regs], 4f\n"
+                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                        "ldr q0, [%[a_ptr0], #0x10]\n"
+                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                        "5:\n"
+                        "str q16, [%[c_ptr0]]\n"
+                        "str q17, [%[c_ptr0], #0x10]\n"
+                        "str q18, [%[c_ptr0], #0x20]\n"
+                        "str q19, [%[c_ptr0], #0x30]\n"
+                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
+                    );
+                    break;
+                case 2:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "c_ptr1 .req X1\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "cbz %[beta0], 1f\n"
+                        "movi v16.4s, #0\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "movi v17.4s, #0\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "movi v18.4s, #0\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "movi v19.4s, #0\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "movi v20.4s, #0\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "movi v21.4s, #0\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "movi v22.4s, #0\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "movi v23.4s, #0\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1r {v15.4s}, [%[betaptr]]\n"
+                        "ldr q16, [%[c_ptr0]]\n"
+                        "ldr q17, [%[c_ptr0], #0x10]\n"
+                        "ldr q18, [%[c_ptr0], #0x20]\n"
+                        "ldr q19, [%[c_ptr0], #0x30]\n"
+                        "mul v16.4s, v16.4s, v15.4s\n"
+                        "ldr q20, [c_ptr1]\n"
+                        "mul v17.4s, v17.4s, v15.4s\n"
+                        "ldr q21, [c_ptr1, #0x10]\n"
+                        "mul v18.4s, v18.4s, v15.4s\n"
+                        "ldr q22, [c_ptr1, #0x20]\n"
+                        "mul v19.4s, v19.4s, v15.4s\n"
+                        "ldr q23, [c_ptr1, #0x30]\n"
+                        "mul v20.4s, v20.4s, v15.4s\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "mul v21.4s, v21.4s, v15.4s\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "mul v22.4s, v22.4s, v15.4s\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "mul v23.4s, v23.4s, v15.4s\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr q5, [a_ptr1]\n"
+                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                        "ldr q0, [%[a_ptr0], #-0x10]\n"
+                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                        "ldr q1, [a_ptr1, #-0x10]\n"
+                        ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                        ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                        ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                        ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                        ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                        ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                        ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                        ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                        ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                        ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                        ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                        ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                        ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                        ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                        ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                        "prfm PSTL1KEEP, [c_ptr1]\n"
+                        "cbz %[regs], 4f\n"
+                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                        "ldr q5, [a_ptr1]\n"
+                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                        "ldr q0, [%[a_ptr0], #0x10]\n"
+                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                        "ldr q1, [a_ptr1, #0x10]\n"
+                        ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                        ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                        ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                        ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                        ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                        ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                        ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                        ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                        ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                        ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                        ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                        ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                        ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                        ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                        ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                        ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                        "ldr q5, [a_ptr1]\n"
+                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                        "5:\n"
+                        "str q16, [%[c_ptr0]]\n"
+                        "str q17, [%[c_ptr0], #0x10]\n"
+                        "str q18, [%[c_ptr0], #0x20]\n"
+                        "str q19, [%[c_ptr0], #0x30]\n"
+                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                        "str q20, [c_ptr1]\n"
+                        "str q21, [c_ptr1, #0x10]\n"
+                        "str q22, [c_ptr1, #0x20]\n"
+                        "str q23, [c_ptr1, #0x30]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq c_ptr1\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
+                    );
+                    break;
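+                // Three-row variant: adds a_ptr2/c_ptr2 alongside a_ptr1/c_ptr1 and a third set of
+                // accumulators (v24-v27). The inner loop is the same udot-based block as the two-row
+                // case above: each udot accumulates a 4-byte unsigned dot product of one A lane
+                // against 16 bytes of B into a .4s accumulator.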
+                case 3:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "c_ptr1 .req X2\n"
+                        "c_ptr2 .req X3\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "cbz %[beta0], 1f\n"
+                        "movi v16.4s, #0\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "movi v17.4s, #0\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "movi v18.4s, #0\n"
+                        "ldr q2, [a_ptr2]\n"
+                        "movi v19.4s, #0\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "movi v20.4s, #0\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "movi v21.4s, #0\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "movi v22.4s, #0\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "movi v23.4s, #0\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "movi v24.4s, #0\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "movi v25.4s, #0\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "movi v26.4s, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "movi v27.4s, #0\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1r {v15.4s}, [%[betaptr]]\n"
+                        "ldr q16, [%[c_ptr0]]\n"
+                        "ldr q17, [%[c_ptr0], #0x10]\n"
+                        "ldr q18, [%[c_ptr0], #0x20]\n"
+                        "ldr q19, [%[c_ptr0], #0x30]\n"
+                        "mul v16.4s, v16.4s, v15.4s\n"
+                        "ldr q20, [c_ptr1]\n"
+                        "mul v17.4s, v17.4s, v15.4s\n"
+                        "ldr q21, [c_ptr1, #0x10]\n"
+                        "mul v18.4s, v18.4s, v15.4s\n"
+                        "ldr q22, [c_ptr1, #0x20]\n"
+                        "mul v19.4s, v19.4s, v15.4s\n"
+                        "ldr q23, [c_ptr1, #0x30]\n"
+                        "mul v20.4s, v20.4s, v15.4s\n"
+                        "ldr q24, [c_ptr2]\n"
+                        "mul v21.4s, v21.4s, v15.4s\n"
+                        "ldr q25, [c_ptr2, #0x10]\n"
+                        "mul v22.4s, v22.4s, v15.4s\n"
+                        "ldr q26, [c_ptr2, #0x20]\n"
+                        "mul v23.4s, v23.4s, v15.4s\n"
+                        "ldr q27, [c_ptr2, #0x30]\n"
+                        "mul v24.4s, v24.4s, v15.4s\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "mul v25.4s, v25.4s, v15.4s\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "mul v26.4s, v26.4s, v15.4s\n"
+                        "ldr q2, [a_ptr2]\n"
+                        "mul v27.4s, v27.4s, v15.4s\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                        "ldr q5, [a_ptr1]\n"
+                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr q6, [a_ptr2]\n"
+                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                        ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                        ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                        ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                        ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                        ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                        ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                        ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                        ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                        ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                        ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                        ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                        ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                        "ldr q0, [%[a_ptr0], #-0x10]\n"
+                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                        "ldr q1, [a_ptr1, #-0x10]\n"
+                        ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                        "ldr q2, [a_ptr2, #-0x10]\n"
+                        ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+                        ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                        ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+                        ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                        ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+                        ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                        ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+                        ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                        ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+                        ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                        ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+                        ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                        ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+                        ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                        ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+                        ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+                        ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                        ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+                        ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                        ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+                        ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                        ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+                        ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                        ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+                        ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                        ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+                        ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                        ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+                        ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                        ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+                        ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                        "prfm PSTL1KEEP, [c_ptr1]\n"
+                        "prfm PSTL1KEEP, [c_ptr2]\n"
+                        "cbz %[regs], 4f\n"
+                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                        "ldr q5, [a_ptr1]\n"
+                        ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                        "ldr q6, [a_ptr2]\n"
+                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                        ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                        ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                        ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                        ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                        ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                        ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                        ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                        ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                        ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                        ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                        ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                        ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                        ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                        ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                        "ldr q0, [%[a_ptr0], #0x10]\n"
+                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                        "ldr q1, [a_ptr1, #0x10]\n"
+                        ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                        "ldr q2, [a_ptr2, #0x10]\n"
+                        ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+                        ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                        ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+                        ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                        ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+                        ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                        ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+                        ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                        ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+                        ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                        ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+                        ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                        ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+                        ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                        ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+                        ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                        ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+                        ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                        ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+                        ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                        ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+                        ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                        ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+                        ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                        ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+                        ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                        ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+                        ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                        ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+                        ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                        ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+                        ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                        "ldr q5, [a_ptr1]\n"
+                        ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                        "ldr q6, [a_ptr2]\n"
+                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                        ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                        ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                        ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                        ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                        ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                        ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                        ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                        ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                        ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                        ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                        ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                        ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                        ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                        ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                        ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+                        "5:\n"
+                        "str q16, [%[c_ptr0]]\n"
+                        "str q17, [%[c_ptr0], #0x10]\n"
+                        "str q18, [%[c_ptr0], #0x20]\n"
+                        "str q19, [%[c_ptr0], #0x30]\n"
+                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                        "str q20, [c_ptr1]\n"
+                        "str q21, [c_ptr1, #0x10]\n"
+                        "str q22, [c_ptr1, #0x20]\n"
+                        "str q23, [c_ptr1, #0x30]\n"
+                        "str q24, [c_ptr2]\n"
+                        "str q25, [c_ptr2, #0x10]\n"
+                        "str q26, [c_ptr2, #0x20]\n"
+                        "str q27, [c_ptr2, #0x30]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
+                    );
+                    break;
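+                // Four-row (default) variant: row pointers a_ptr1-a_ptr3/c_ptr1-c_ptr3 and the full
+                // accumulator set v16-v31 (four registers per output row); otherwise the structure
+                // mirrors the smaller cases (beta scaling at label 1, main loop at 3, tail at 2/4,
+                // stores at 5).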
+                default:
+                case 4:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "a_ptr3 .req X2\n"
+                        "c_ptr1 .req X3\n"
+                        "c_ptr2 .req X4\n"
+                        "c_ptr3 .req X5\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "add a_ptr3, a_ptr2, %[lda]\n"
+                        "add c_ptr3, c_ptr2, %[ldc]\n"
+                        "cbz %[beta0], 1f\n"
+                        "movi v16.4s, #0\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "movi v17.4s, #0\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "movi v18.4s, #0\n"
+                        "ldr q2, [a_ptr2]\n"
+                        "movi v19.4s, #0\n"
+                        "ldr q3, [a_ptr3]\n"
+                        "movi v20.4s, #0\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "movi v21.4s, #0\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "movi v22.4s, #0\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "movi v23.4s, #0\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "movi v24.4s, #0\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "movi v25.4s, #0\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "movi v26.4s, #0\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "movi v27.4s, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "movi v28.4s, #0\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "movi v29.4s, #0\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "movi v30.4s, #0\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "movi v31.4s, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1r {v15.4s}, [%[betaptr]]\n"
+                        "ldr q16, [%[c_ptr0]]\n"
+                        "ldr q17, [%[c_ptr0], #0x10]\n"
+                        "ldr q18, [%[c_ptr0], #0x20]\n"
+                        "ldr q19, [%[c_ptr0], #0x30]\n"
+                        "mul v16.4s, v16.4s, v15.4s\n"
+                        "ldr q20, [c_ptr1]\n"
+                        "mul v17.4s, v17.4s, v15.4s\n"
+                        "ldr q21, [c_ptr1, #0x10]\n"
+                        "mul v18.4s, v18.4s, v15.4s\n"
+                        "ldr q22, [c_ptr1, #0x20]\n"
+                        "mul v19.4s, v19.4s, v15.4s\n"
+                        "ldr q23, [c_ptr1, #0x30]\n"
+                        "mul v20.4s, v20.4s, v15.4s\n"
+                        "ldr q24, [c_ptr2]\n"
+                        "mul v21.4s, v21.4s, v15.4s\n"
+                        "ldr q25, [c_ptr2, #0x10]\n"
+                        "mul v22.4s, v22.4s, v15.4s\n"
+                        "ldr q26, [c_ptr2, #0x20]\n"
+                        "mul v23.4s, v23.4s, v15.4s\n"
+                        "ldr q27, [c_ptr2, #0x30]\n"
+                        "mul v24.4s, v24.4s, v15.4s\n"
+                        "ldr q28, [c_ptr3]\n"
+                        "mul v25.4s, v25.4s, v15.4s\n"
+                        "ldr q29, [c_ptr3, #0x10]\n"
+                        "mul v26.4s, v26.4s, v15.4s\n"
+                        "ldr q30, [c_ptr3, #0x20]\n"
+                        "mul v27.4s, v27.4s, v15.4s\n"
+                        "ldr q31, [c_ptr3, #0x30]\n"
+                        "mul v28.4s, v28.4s, v15.4s\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "mul v29.4s, v29.4s, v15.4s\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "mul v30.4s, v30.4s, v15.4s\n"
+                        "ldr q2, [a_ptr2]\n"
+                        "mul v31.4s, v31.4s, v15.4s\n"
+                        "ldr q3, [a_ptr3]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                        "ldr q5, [a_ptr1]\n"
+                        ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+                        "ldr q6, [a_ptr2]\n"
+                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr q7, [a_ptr3]\n"
+                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                        "add a_ptr3, a_ptr3, #0x20\n"
+                        ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                        ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                        "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+                        ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+                        ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                        ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+                        ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                        ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+                        ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                        ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+                        ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                        ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+                        ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                        ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+                        ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                        ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+                        ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                        ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+                        ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                        ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+                        ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                        ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+                        ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                        ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+                        ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                        "ldr q0, [%[a_ptr0], #-0x10]\n"
+                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                        "ldr q1, [a_ptr1, #-0x10]\n"
+                        ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+                        "ldr q2, [a_ptr2, #-0x10]\n"
+                        ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                        "ldr q3, [a_ptr3, #-0x10]\n"
+                        ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+                        ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+                        ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                        ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+                        ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+                        ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                        ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+                        ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+                        ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                        ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+                        ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+                        ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                        ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+                        ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+                        ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                        ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+                        ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+                        ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                        ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+                        ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+                        ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                        ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+                        ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+                        ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+                        ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+                        ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                        ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+                        ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+                        ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                        ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+                        ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+                        ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                        ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+                        ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+                        ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                        ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+                        ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+                        ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                        ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+                        ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+                        ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                        ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+                        ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+                        ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                        ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+                        ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+                        ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                        "prfm PSTL1KEEP, [c_ptr1]\n"
+                        "prfm PSTL1KEEP, [c_ptr2]\n"
+                        "prfm PSTL1KEEP, [c_ptr3]\n"
+                        "cbz %[regs], 4f\n"
+                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                        "ldr q5, [a_ptr1]\n"
+                        ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                        "ldr q6, [a_ptr2]\n"
+                        ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+                        "ldr q7, [a_ptr3]\n"
+                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                        ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                        ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                        ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                        ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                        ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                        ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                        ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+                        ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                        ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+                        ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                        ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+                        ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                        ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+                        ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                        ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+                        ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                        ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+                        ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                        ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+                        ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                        ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+                        ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                        ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+                        ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                        ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+                        ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                        ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+                        ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                        "ldr q0, [%[a_ptr0], #0x10]\n"
+                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                        "ldr q1, [a_ptr1, #0x10]\n"
+                        ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+                        "ldr q2, [a_ptr2, #0x10]\n"
+                        ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                        "ldr q3, [a_ptr3, #0x10]\n"
+                        ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+                        ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+                        ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                        ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+                        ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+                        ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                        ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+                        ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+                        ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                        ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+                        ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+                        ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                        ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+                        ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+                        ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                        ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+                        ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+                        ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                        ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+                        ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+                        ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                        ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+                        ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+                        ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                        ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+                        ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+                        ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
+                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                        ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+                        ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+                        ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
+                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                        ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+                        ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+                        ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
+                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                        ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+                        ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+                        ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
+                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                        ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+                        ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+                        ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
+                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                        ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+                        ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+                        ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
+                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                        ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+                        ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+                        ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
+                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                        ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+                        ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+                        ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                        "ldr q5, [a_ptr1]\n"
+                        ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                        "ldr q6, [a_ptr2]\n"
+                        ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+                        "ldr q7, [a_ptr3]\n"
+                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                        ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                        ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                        ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                        ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                        ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                        ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                        ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+                        ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                        ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+                        ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                        ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+                        ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                        ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+                        ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                        ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+                        ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                        ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+                        ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                        ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+                        ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                        ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+                        ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                        ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+                        ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                        ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+                        ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                        ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+                        ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                        ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+                        ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+                        "5:\n"
+                        "str q16, [%[c_ptr0]]\n"
+                        "str q17, [%[c_ptr0], #0x10]\n"
+                        "str q18, [%[c_ptr0], #0x20]\n"
+                        "str q19, [%[c_ptr0], #0x30]\n"
+                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                        "str q20, [c_ptr1]\n"
+                        "str q21, [c_ptr1, #0x10]\n"
+                        "str q22, [c_ptr1, #0x20]\n"
+                        "str q23, [c_ptr1, #0x30]\n"
+                        "str q24, [c_ptr2]\n"
+                        "str q25, [c_ptr2, #0x10]\n"
+                        "str q26, [c_ptr2, #0x20]\n"
+                        "str q27, [c_ptr2, #0x30]\n"
+                        "str q28, [c_ptr3]\n"
+                        "str q29, [c_ptr3, #0x10]\n"
+                        "str q30, [c_ptr3, #0x20]\n"
+                        "str q31, [c_ptr3, #0x30]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq a_ptr3\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        ".unreq c_ptr3\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+                    );
+                    break;
+            }
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
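
The `.word` directives in the kernel above hand-encode the AArch64 UDOT instruction from the dot-product extension, which keeps the file assembling even with assemblers that do not accept `+dotprod`; the trailing comment on each line shows the decoded instruction. A single `udot v16.4s, v8.16b, v0.4b[0]` accumulates, into each 32-bit lane of v16, the dot product of the corresponding four unsigned bytes of v8 with the four bytes selected by lane 0 of v0. A minimal sketch of that one step using ACLE intrinsics (it assumes a compiler exposing __ARM_FEATURE_DOTPROD; the function and parameter names here are illustrative, not part of the library):

    // Illustrative only: what one ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]"
    // step computes, expressed with the ACLE dot-product intrinsic.
    #include <arm_neon.h>

    static inline uint32x4_t udot_step(uint32x4_t acc,       // v16: four running 32-bit sums
                                       uint8x16_t b_block,   // v8 : 16 bytes of pretransposed B
                                       uint8x16_t a_block)   // v0 : 16 bytes of A (4 groups of 4)
    {
        // Lane 0 selects the first group of 4 A bytes; each 4-byte group of
        // b_block is dotted with it and the sum is added into the matching
        // lane of acc.
        return vdotq_laneq_u32(acc, b_block, a_block, 0);
    }
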
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp
index 10d1069..3c0395a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -51,15 +51,15 @@
     typedef void (*kern_type)(const float *, const float *, float *, int, int, int);
 
     /* Kernel blocking parameters */
-    static int out_width() {
+    static unsigned int out_width() {
         return 12;
     }
 
-    static int out_height() {
+    static unsigned int out_height() {
         return 8;
     }
 
-    static int k_unroll() {
+    static unsigned int k_unroll() {
         return 1;
     }
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp
new file mode 100644
index 0000000..95e3712
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+namespace arm_gemm {
+
+// Actual kernel implementations
+void a64_sgemm_nativeA_pretransposeB_16x4(const float *, int, const float *, float *, int, float, unsigned int, unsigned int, unsigned int);
+
+// Native A/Pretranspose B SGEMM "strategy" class.
+//
+// This describes the characteristics of a family of kernels, in terms of
+// the required interleave properties and the output block size.
+//
+// All kernels in the family must share these characteristics.  The actual
+// kernel to be used can be chosen at runtime, based on the CPUInfo
+// structure.
+class sgemm_nativeA_pretransposeB_16x4 {
+public:
+    typedef float operand_type;
+    typedef float result_type;
+
+    typedef void (*kern_type)(const float *, int, const float *, float *, int, float, unsigned int, unsigned int, unsigned int);
+
+    /* Desired data layout for B buffer (used for pretranspose) */
+    static const int  B_interleave = 16;
+    static const int  B_block = 1;
+    static const bool B_transpose = true;
+
+    /* Kernel blocking parameters */
+    static unsigned int out_width() {
+        return 16;
+    }
+
+    static unsigned int out_height() {
+        return 4;
+    }
+
+    static unsigned int k_unroll() {
+        return 1;
+    }
+
+    StdTransformsFixed<operand_type, result_type, 4, 16> transforms = {};
+
+    // Default to the generic kernel
+    kern_type kernel=a64_sgemm_nativeA_pretransposeB_16x4;
+
+    sgemm_nativeA_pretransposeB_16x4(const CPUInfo *ci) {
+
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
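
The strategy class above is metadata for the framework rather than executable code: out_width() and out_height() say each kernel call produces a 16x4 output tile, k_unroll() says K needs no extra rounding, and B_interleave/B_block/B_transpose describe the layout the pretransposed B panel must be laid out in, with `kernel` defaulting to the generic implementation. A rough sketch of the panel sizing those parameters imply (an illustrative assumption only; the real size/transform logic lives in the arm_gemm transforms, and round_up and the function name are made up here):

    // Sketch: approximate float count of a pretransposed B panel for this
    // strategy, rounding N up to B_interleave (16) and K up to k_unroll() (1).
    #include <cstddef>

    static inline unsigned int round_up(unsigned int v, unsigned int multiple) {
        return ((v + multiple - 1) / multiple) * multiple;
    }

    static size_t pretransposed_B_floats(unsigned int N, unsigned int K) {
        const unsigned int B_interleave = 16;  // from the strategy class above
        const unsigned int k_unroll     = 1;   // no extra K rounding for this kernel
        return static_cast<size_t>(round_up(N, B_interleave)) * round_up(K, k_unroll);
    }
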
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4/generic.cpp
new file mode 100644
index 0000000..b2516f8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4/generic.cpp
@@ -0,0 +1,970 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstring>
+
+#include <arm_neon.h>
+
+namespace arm_gemm {
+
+void a64_sgemm_nativeA_pretransposeB_16x4(const float *A, int lda, const float *B_panel, float *C, int ldc, float beta, unsigned int numrows, unsigned int numcols, unsigned int K) {
+    const bool         oddk    = ((K % 8) >= 4);
+    const bool         beta0   = (beta == 0.0f);
+    const unsigned int oddones = (K % 4);
+
+    /* Use some small temporary arrays to cope with "ragged" M/N sizes.
+     *
+     * "dummy_A_buf" is used to avoid overreading the A input for ragged M,
+     * and also for output if N is not ragged.
+     *
+     * Since the B input is pretransposed it will be padded as needed, so no
+     * need to worry about overreading that.
+     *
+     * "C_buf" is used to avoid overreading or overwriting the output for
+     * ragged N cases.
+     */
+    float dummy_A_buf[16];
+    float C_buf[64];
+
+    std::memset(dummy_A_buf, 0, sizeof(dummy_A_buf));
+    std::memset(C_buf, 0, sizeof(C_buf));
+
+    for (unsigned int y=0; y<numrows; y+=4) {
+        const float *b_ptr = B_panel;
+        const unsigned int active_rows = std::min(numrows - y, 4U);
+
+        /* Increment values to be used to advance A pointers - these get set
+         * to zero when the corresponding row isn't being used due to ragged
+         * M, so it will just read the dummy buffer repeatedly.  Values are
+         * in bytes (8x sizeof(float)).  */
+        const unsigned long a_incr1 = (active_rows > 1) ? 32 : 0;
+        const unsigned long a_incr2 = (active_rows > 2) ? 32 : 0;
+        const unsigned long a_incr3 = (active_rows > 3) ? 32 : 0;
+
+        /* Starting points for A pointers on this loop */
+        const float * const a_ptr0_base = A + (y * lda);
+        const float * const a_ptr1_base = (active_rows > 1) ? (a_ptr0_base + lda) : dummy_A_buf;
+        const float * const a_ptr2_base = (active_rows > 2) ? (a_ptr1_base + lda) : dummy_A_buf;
+        const float * const a_ptr3_base = (active_rows > 3) ? (a_ptr2_base + lda) : dummy_A_buf;
+
+        /* Starting points for C pointers on this loop */
+        float *c_ptr0 = C + (y * ldc);
+        float *c_ptr1 = (active_rows > 1) ? (c_ptr0 + ldc) : dummy_A_buf;
+        float *c_ptr2 = (active_rows > 2) ? (c_ptr1 + ldc) : dummy_A_buf;
+        float *c_ptr3 = (active_rows > 3) ? (c_ptr2 + ldc) : dummy_A_buf;
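
This pointer setup implements the ragged-M scheme described in the comment at the top of the function: a row beyond `active_rows` gets an increment of zero and A/C pointers parked on the zeroed dummy_A_buf, so the assembly below runs the same instruction stream for full and partial row tiles and the surplus rows only ever read zeros and store into scratch. A minimal scalar analogue of the idea (names and the single-row form are my own, and it assumes K is a multiple of 8 for brevity):

    // Illustrative scalar analogue of the ragged-M handling (not the kernel):
    // an inactive row reads the zeroed buffer and advances by zero, so the hot
    // loop needs no per-row bounds checks.
    static float dot_row_or_dummy(const float *real_row, bool row_active,
                                  const float *b_col, unsigned int K) {
        static const float dummy[8] = {};                  // stands in for dummy_A_buf
        const float   *row  = row_active ? real_row : dummy;
        const unsigned incr = row_active ? 8u : 0u;        // floats advanced per 8-wide step
        float acc = 0.0f;
        for (unsigned int k = 0; k < K; k += 8) {
            for (unsigned int j = 0; j < 8; ++j) {
                acc += row[j] * b_col[k + j];              // contributes zeros when inactive
            }
            row += incr;                                   // stays parked on 'dummy' when inactive
        }
        return acc;
    }
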
+
+        for (unsigned int x0=0; x0<numcols; x0+=16) {
+            const unsigned int active_cols = std::min(numcols - x0, 16U);
+            const bool use_result_buf = (active_cols < 16);
+
+            /* Reset the A pointers for this loop. */
+            const float *a_ptr0 = a_ptr0_base;
+            const float *a_ptr1 = a_ptr1_base;
+            const float *a_ptr2 = a_ptr2_base;
+            const float *a_ptr3 = a_ptr3_base;
+
+            /* Override C pointers if the result buffer is in use. */
+            if (use_result_buf) {
+                c_ptr0 = C_buf;
+                c_ptr1 = C_buf + 16;
+                c_ptr2 = C_buf + 32;
+                c_ptr3 = C_buf + 48;
+
+                /* If beta is non-zero, prepopulate the result buffer */
+                if (!beta0) {
+                    for (unsigned int row=0; row<active_rows; row++) {
+                        for (unsigned int col=0; col<active_cols; col++) {
+                            C_buf[row * 16 + col] = C[((y + row) * ldc) + (x0 + col)];
+                        }
+                    }
+                }
+            }
+
+            unsigned int loops = ((K+4)/8) - 1;
+            unsigned int odds = oddones;
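
These control values split K across the paths in the assembly that follows: `loops` complete 8-deep iterations, one detached final block of 8 (or only 4 when `oddk` is set), and `odds` single leftover K steps handled after the main blocks. Equivalently K = 8*loops + (oddk ? 4 : 8) + oddones; for example K = 19 gives loops = 1, oddk false, oddones = 3 (8 + 8 + 3). A small self-check of that identity (the expression for `loops` assumes K >= 4):

    // Sanity sketch of how K is split across the asm paths below (K >= 4).
    #include <cassert>

    static void check_k_split(unsigned int K) {
        const bool         oddk    = ((K % 8) >= 4);
        const unsigned int oddones = (K % 4);
        const unsigned int loops   = ((K + 4) / 8) - 1;
        assert(K == 8 * loops + (oddk ? 4u : 8u) + oddones);
    }
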
+
+            __asm __volatile (
+            "a0   .req v0\n"
+            "a1   .req v1\n"
+            "a2   .req v2\n"
+            "a3   .req v3\n"
+            "a0a  .req v4\n"
+            "a1a  .req v5\n"
+            "a2a  .req v6\n"
+            "a3a  .req v7\n"
+            "bb0  .req v8\n"
+            "bb1  .req v9\n"
+            "bb2  .req v10\n"
+            "bb3  .req v11\n"
+            "b0a  .req v12\n"
+            "b1a  .req v13\n"
+            "b2a  .req v14\n"
+            "b3a  .req v15\n"
+
+            "a0q  .req q0\n"
+            "a1q  .req q1\n"
+            "a2q  .req q2\n"
+            "a3q  .req q3\n"
+            "a0aq .req q4\n"
+            "a1aq .req q5\n"
+            "a2aq .req q6\n"
+            "a3aq .req q7\n"
+            "b0q  .req q8\n"
+            "b1q  .req q9\n"
+            "b2q  .req q10\n"
+            "b3q  .req q11\n"
+            "b0aq .req q12\n"
+            "b1aq .req q13\n"
+            "b2aq .req q14\n"
+            "b3aq .req q15\n"
+
+            "movi	v16.4s, #0x0\n"
+            "ldr	a0q, [%[a_ptr0]]\n"
+            "movi	v17.4s, #0x0\n"
+            "ldr	b0q, [%[b_ptr]]\n"
+            "movi	v18.4s, #0x0\n"
+            "ldr	b1q, [%[b_ptr], #16]\n"
+            "movi	v19.4s, #0x0\n"
+            "ldr	b2q, [%[b_ptr], #32]\n"
+            "movi	v20.4s, #0x0\n"
+            "ldr	b3q, [%[b_ptr], #48]\n"
+            "add	%[b_ptr], %[b_ptr], #64\n"
+            "movi	v21.4s, #0x0\n"
+            "ldr	a1q, [%[a_ptr1]]\n"
+            "movi	v22.4s, #0x0\n"
+            "ldr	a2q, [%[a_ptr2]]\n"
+            "movi	v23.4s, #0x0\n"
+            "ldr	a3q, [%[a_ptr3]]\n"
+            "movi	v24.4s, #0x0\n"
+            "ldr	b0aq, [%[b_ptr]]\n"
+            "movi	v25.4s, #0x0\n"
+            "ldr	b1aq, [%[b_ptr], #16]\n"
+            "movi	v26.4s, #0x0\n"
+            "ldr	b2aq, [%[b_ptr], #32]\n"
+            "cbz	%w[beta0], 5f\n"
+            "movi	v27.4s, #0x0\n"
+            ASM_PREFETCH("[%[b_ptr], #0x40]")
+            "movi	v28.4s, #0x0\n"
+            ASM_PREFETCH("[%[b_ptr], #0x80]")
+            "movi	v29.4s, #0x0\n"
+            ASM_PREFETCH("[%[b_ptr], #0xC0]")
+            "movi	v30.4s, #0x0\n"
+            ASM_PREFETCH("[%[b_ptr], #0x100]")
+            "movi	v31.4s, #0x0\n"
+            ASM_PREFETCH("[%[b_ptr], #0x140]")
+            ASM_PREFETCH("[%[b_ptr], #0x180]")
+            ASM_PREFETCH("[%[b_ptr], #0x1C0]")
+            ASM_PREFETCH("[%[b_ptr], #0x200]")
+
+            // Skip if no complete loops.
+            "cbz	%w[loops], 4f\n"
+            "b	1f\n"
+
+            // If beta is non-zero, need to load and multiply by beta
+            "5:\n"
+            "ld1r	{v4.4s}, [%[betaptr]]\n"
+            "ldr	q16, [%[c_ptr0]]\n"
+            "ldr	q17, [%[c_ptr0], #16]\n"
+            "ldr	q18, [%[c_ptr0], #32]\n"
+            "ldr	q19, [%[c_ptr0], #48]\n"
+
+            "ldr	q20, [%[c_ptr1]]\n"
+            "fmul	v16.4s, v16.4s, v4.4s\n"
+            "ldr	q21, [%[c_ptr1], #16]\n"
+            "fmul	v17.4s, v17.4s, v4.4s\n"
+            "ldr	q22, [%[c_ptr1], #32]\n"
+            "fmul	v18.4s, v18.4s, v4.4s\n"
+            "ldr	q23, [%[c_ptr1], #48]\n"
+            "fmul	v19.4s, v19.4s, v4.4s\n"
+
+            "ldr	q24, [%[c_ptr2]]\n"
+            "fmul	v20.4s, v20.4s, v4.4s\n"
+            "ldr	q25, [%[c_ptr2], #16]\n"
+            "fmul	v21.4s, v21.4s, v4.4s\n"
+            "ldr	q26, [%[c_ptr2], #32]\n"
+            "fmul	v22.4s, v22.4s, v4.4s\n"
+            "ldr	q27, [%[c_ptr2], #48]\n"
+            "fmul	v23.4s, v23.4s, v4.4s\n"
+
+            "ldr	q28, [%[c_ptr3]]\n"
+            "fmul	v24.4s, v24.4s, v4.4s\n"
+            ASM_PREFETCH("[%[b_ptr], #0x40]")
+            "ldr	q29, [%[c_ptr3], #16]\n"
+            "fmul	v25.4s, v25.4s, v4.4s\n"
+            ASM_PREFETCH("[%[b_ptr], #0x80]")
+            "ldr	q30, [%[c_ptr3], #32]\n"
+            "fmul	v26.4s, v26.4s, v4.4s\n"
+            ASM_PREFETCH("[%[b_ptr], #0xC0]")
+            "ldr	q31, [%[c_ptr3], #48]\n"
+            "fmul	v27.4s, v27.4s, v4.4s\n"
+            ASM_PREFETCH("[%[b_ptr], #0x100]")
+
+            "fmul	v28.4s, v28.4s, v4.4s\n"
+            ASM_PREFETCH("[%[b_ptr], #0x140]")
+            "fmul	v29.4s, v29.4s, v4.4s\n"
+            ASM_PREFETCH("[%[b_ptr], #0x180]")
+            "fmul	v30.4s, v30.4s, v4.4s\n"
+            ASM_PREFETCH("[%[b_ptr], #0x1C0]")
+            "fmul	v31.4s, v31.4s, v4.4s\n"
+            ASM_PREFETCH("[%[b_ptr], #0x200]")
+
+            "cbz	%w[loops], 4f\n"
+
+            "1:\n"
+            // Unroll 0
+            "fmla	v16.4s, bb0.4s, a0.s[0]\n"
+            ASM_PREFETCH("[%[b_ptr], #0x240]")
+            "fmla	v20.4s, bb0.4s, a1.s[0]\n"
+            "ldr	b3aq, [%[b_ptr], #48]\n"
+            "fmla	v24.4s, bb0.4s, a2.s[0]\n"
+            "fmla	v28.4s, bb0.4s, a3.s[0]\n"
+            "ldr	b0q, [%[b_ptr], #64]\n"
+
+            "fmla	v17.4s, bb1.4s, a0.s[0]\n"
+            "fmla	v21.4s, bb1.4s, a1.s[0]\n"
+            "ldr	a0aq, [%[a_ptr0], #16]\n"
+            "fmla	v25.4s, bb1.4s, a2.s[0]\n"
+            "fmla	v29.4s, bb1.4s, a3.s[0]\n"
+            "ldr	b1q, [%[b_ptr], #80]\n"
+
+            "fmla	v18.4s, bb2.4s, a0.s[0]\n"
+            "fmla	v22.4s, bb2.4s, a1.s[0]\n"
+            "ldr	a1aq, [%[a_ptr1], #16]\n"
+            "fmla	v26.4s, bb2.4s, a2.s[0]\n"
+            "fmla	v30.4s, bb2.4s, a3.s[0]\n"
+            "ldr	b2q, [%[b_ptr], #96]\n"
+
+            "fmla	v19.4s, bb3.4s, a0.s[0]\n"
+            "fmla	v23.4s, bb3.4s, a1.s[0]\n"
+            "ldr	a2aq, [%[a_ptr2], #16]\n"
+            "fmla	v27.4s, bb3.4s, a2.s[0]\n"
+            "fmla	v31.4s, bb3.4s, a3.s[0]\n"
+            "ldr	b3q, [%[b_ptr], #112]\n"
+
+            // Unroll 1
+            "fmla	v16.4s, b0a.4s, a0.s[1]\n"
+            ASM_PREFETCH("[%[b_ptr], #0x280]")
+            "fmla	v20.4s, b0a.4s, a1.s[1]\n"
+            "ldr	a3aq, [%[a_ptr3], #16]\n"
+            "fmla	v24.4s, b0a.4s, a2.s[1]\n"
+            "fmla	v28.4s, b0a.4s, a3.s[1]\n"
+            "ldr	b0aq, [%[b_ptr], #128]\n"
+
+            "fmla	v17.4s, b1a.4s, a0.s[1]\n"
+            "fmla	v21.4s, b1a.4s, a1.s[1]\n"
+            "subs	%w[loops], %w[loops], #1\n"
+            "fmla	v25.4s, b1a.4s, a2.s[1]\n"
+            "fmla	v29.4s, b1a.4s, a3.s[1]\n"
+            "ldr	b1aq, [%[b_ptr], #144]\n"
+
+            "fmla	v18.4s, b2a.4s, a0.s[1]\n"
+            "fmla	v22.4s, b2a.4s, a1.s[1]\n"
+            "fmla	v26.4s, b2a.4s, a2.s[1]\n"
+            "fmla	v30.4s, b2a.4s, a3.s[1]\n"
+            "ldr	b2aq, [%[b_ptr], #160]\n"
+
+            "fmla	v19.4s, b3a.4s, a0.s[1]\n"
+            "fmla	v23.4s, b3a.4s, a1.s[1]\n"
+            "fmla	v27.4s, b3a.4s, a2.s[1]\n"
+            "fmla	v31.4s, b3a.4s, a3.s[1]\n"
+            "ldr	b3aq, [%[b_ptr], #176]\n"
+
+            // Unroll 2
+            "fmla	v16.4s, bb0.4s, a0.s[2]\n"
+            ASM_PREFETCH("[%[b_ptr], #0x2C0]")
+            "fmla	v20.4s, bb0.4s, a1.s[2]\n"
+            "fmla	v24.4s, bb0.4s, a2.s[2]\n"
+            "fmla	v28.4s, bb0.4s, a3.s[2]\n"
+            "ldr	b0q, [%[b_ptr], #192]\n"
+
+            "fmla	v17.4s, bb1.4s, a0.s[2]\n"
+            "add	%[a_ptr0], %[a_ptr0], #32\n"
+            "fmla	v21.4s, bb1.4s, a1.s[2]\n"
+            "add	%[a_ptr1], %[a_ptr1], %[a_incr1]\n"
+            "fmla	v25.4s, bb1.4s, a2.s[2]\n"
+            "add	%[a_ptr2], %[a_ptr2], %[a_incr2]\n"
+            "fmla	v29.4s, bb1.4s, a3.s[2]\n"
+            "ldr	b1q, [%[b_ptr], #208]\n"
+
+            "fmla	v18.4s, bb2.4s, a0.s[2]\n"
+            "add	%[a_ptr3], %[a_ptr3], %[a_incr3]\n"
+            "fmla	v22.4s, bb2.4s, a1.s[2]\n"
+            ASM_PREFETCH("[%[a_ptr0], #0x40]")
+            "fmla	v26.4s, bb2.4s, a2.s[2]\n"
+            "fmla	v30.4s, bb2.4s, a3.s[2]\n"
+            "ldr	b2q, [%[b_ptr], #224]\n"
+
+            "fmla	v19.4s, bb3.4s, a0.s[2]\n"
+            "fmla	v23.4s, bb3.4s, a1.s[2]\n"
+            ASM_PREFETCH("[%[a_ptr1], #0x40]")
+            "fmla	v27.4s, bb3.4s, a2.s[2]\n"
+            "fmla	v31.4s, bb3.4s, a3.s[2]\n"
+            "ldr	b3q, [%[b_ptr], #240]\n"
+
+            // Unroll 3
+            "fmla	v16.4s, b0a.4s, a0.s[3]\n"
+            "fmla	v20.4s, b0a.4s, a1.s[3]\n"
+            "add	%[b_ptr], %[b_ptr], #512\n"
+            "fmla	v24.4s, b0a.4s, a2.s[3]\n"
+            "fmla	v28.4s, b0a.4s, a3.s[3]\n"
+            "ldr	b0aq, [%[b_ptr], #-256]\n"
+
+            "fmla	v17.4s, b1a.4s, a0.s[3]\n"
+            ASM_PREFETCH("[%[b_ptr], #0x100]")
+            "fmla	v21.4s, b1a.4s, a1.s[3]\n"
+            "fmla	v25.4s, b1a.4s, a2.s[3]\n"
+            "fmla	v29.4s, b1a.4s, a3.s[3]\n"
+            "ldr	b1aq, [%[b_ptr], #-240]\n"
+
+            "fmla	v18.4s, b2a.4s, a0.s[3]\n"
+            "fmla	v22.4s, b2a.4s, a1.s[3]\n"
+            ASM_PREFETCH("[%[a_ptr2], #0x40]")
+            "fmla	v26.4s, b2a.4s, a2.s[3]\n"
+            "fmla	v30.4s, b2a.4s, a3.s[3]\n"
+            "ldr	b2aq, [%[b_ptr], #-224]\n"
+
+            "fmla	v19.4s, b3a.4s, a0.s[3]\n"
+            "fmla	v23.4s, b3a.4s, a1.s[3]\n"
+            "ldr	a0q, [%[a_ptr0]]\n"
+            "fmla	v27.4s, b3a.4s, a2.s[3]\n"
+            "fmla	v31.4s, b3a.4s, a3.s[3]\n"
+            "ldr	b3aq, [%[b_ptr], #-208]\n"
+
+            // Unroll 4
+            "fmla	v16.4s, bb0.4s, a0a.s[0]\n"
+            "fmla	v20.4s, bb0.4s, a1a.s[0]\n"
+            ASM_PREFETCH("[%[b_ptr], #0x140]")
+            "fmla	v24.4s, bb0.4s, a2a.s[0]\n"
+            "fmla	v28.4s, bb0.4s, a3a.s[0]\n"
+            "ldr	b0q, [%[b_ptr], #-192]\n"
+
+            "fmla	v17.4s, bb1.4s, a0a.s[0]\n"
+            "fmla	v21.4s, bb1.4s, a1a.s[0]\n"
+            "ldr	a1q, [%[a_ptr1]]\n"
+            "fmla	v25.4s, bb1.4s, a2a.s[0]\n"
+            "fmla	v29.4s, bb1.4s, a3a.s[0]\n"
+            "ldr	b1q, [%[b_ptr], #-176]\n"
+
+            "fmla	v18.4s, bb2.4s, a0a.s[0]\n"
+            "fmla	v22.4s, bb2.4s, a1a.s[0]\n"
+            "ldr	a2q, [%[a_ptr2]]\n"
+            "fmla	v26.4s, bb2.4s, a2a.s[0]\n"
+            "fmla	v30.4s, bb2.4s, a3a.s[0]\n"
+            "ldr	b2q, [%[b_ptr], #-160]\n"
+
+            "fmla	v19.4s, bb3.4s, a0a.s[0]\n"
+            "fmla	v23.4s, bb3.4s, a1a.s[0]\n"
+            "ldr	a3q, [%[a_ptr3]]\n"
+            "fmla	v27.4s, bb3.4s, a2a.s[0]\n"
+            "fmla	v31.4s, bb3.4s, a3a.s[0]\n"
+            "ldr	b3q, [%[b_ptr], #-144]\n"
+
+            // Unroll 5
+            "fmla	v16.4s, b0a.4s, a0a.s[1]\n"
+            "fmla	v20.4s, b0a.4s, a1a.s[1]\n"
+            ASM_PREFETCH("[%[b_ptr], #0x180]")
+            "fmla	v24.4s, b0a.4s, a2a.s[1]\n"
+            "fmla	v28.4s, b0a.4s, a3a.s[1]\n"
+            "ldr	b0aq, [%[b_ptr], #-128]\n"
+
+            "fmla	v17.4s, b1a.4s, a0a.s[1]\n"
+            "fmla	v21.4s, b1a.4s, a1a.s[1]\n"
+            ASM_PREFETCH("[%[a_ptr3], #0x40]")
+            "fmla	v25.4s, b1a.4s, a2a.s[1]\n"
+            "fmla	v29.4s, b1a.4s, a3a.s[1]\n"
+            "ldr	b1aq, [%[b_ptr], #-112]\n"
+
+            "fmla	v18.4s, b2a.4s, a0a.s[1]\n"
+            "fmla	v22.4s, b2a.4s, a1a.s[1]\n"
+            "fmla	v26.4s, b2a.4s, a2a.s[1]\n"
+            "fmla	v30.4s, b2a.4s, a3a.s[1]\n"
+            "ldr	b2aq, [%[b_ptr], #-96]\n"
+
+            "fmla	v19.4s, b3a.4s, a0a.s[1]\n"
+            "fmla	v23.4s, b3a.4s, a1a.s[1]\n"
+            "fmla	v27.4s, b3a.4s, a2a.s[1]\n"
+            "fmla	v31.4s, b3a.4s, a3a.s[1]\n"
+            "ldr	b3aq, [%[b_ptr], #-80]\n"
+
+            // Unroll 6
+            "fmla	v16.4s, bb0.4s, a0a.s[2]\n"
+            "fmla	v20.4s, bb0.4s, a1a.s[2]\n"
+            ASM_PREFETCH("[%[b_ptr], #0x1C0]")
+            "fmla	v24.4s, bb0.4s, a2a.s[2]\n"
+            "fmla	v28.4s, bb0.4s, a3a.s[2]\n"
+            "ldr	b0q, [%[b_ptr], #-64]\n"
+
+            "fmla	v17.4s, bb1.4s, a0a.s[2]\n"
+            "fmla	v21.4s, bb1.4s, a1a.s[2]\n"
+            "fmla	v25.4s, bb1.4s, a2a.s[2]\n"
+            "fmla	v29.4s, bb1.4s, a3a.s[2]\n"
+            "ldr	b1q, [%[b_ptr], #-48]\n"
+
+            "fmla	v18.4s, bb2.4s, a0a.s[2]\n"
+            "fmla	v22.4s, bb2.4s, a1a.s[2]\n"
+            "fmla	v26.4s, bb2.4s, a2a.s[2]\n"
+            "fmla	v30.4s, bb2.4s, a3a.s[2]\n"
+            "ldr	b2q, [%[b_ptr], #-32]\n"
+
+            "fmla	v19.4s, bb3.4s, a0a.s[2]\n"
+            "fmla	v23.4s, bb3.4s, a1a.s[2]\n"
+            "fmla	v27.4s, bb3.4s, a2a.s[2]\n"
+            "fmla	v31.4s, bb3.4s, a3a.s[2]\n"
+            "ldr	b3q, [%[b_ptr], #-16]\n"
+
+            // Unroll 7
+            "fmla	v16.4s, b0a.4s, a0a.s[3]\n"
+            "fmla	v20.4s, b0a.4s, a1a.s[3]\n"
+            "fmla	v24.4s, b0a.4s, a2a.s[3]\n"
+            "fmla	v28.4s, b0a.4s, a3a.s[3]\n"
+            "ldr	b0aq, [%[b_ptr]]\n"
+
+            "fmla	v17.4s, b1a.4s, a0a.s[3]\n"
+            "fmla	v21.4s, b1a.4s, a1a.s[3]\n"
+            ASM_PREFETCH("[%[b_ptr], #0x200]")
+            "fmla	v25.4s, b1a.4s, a2a.s[3]\n"
+            "fmla	v29.4s, b1a.4s, a3a.s[3]\n"
+            "ldr	b1aq, [%[b_ptr], #16]\n"
+
+            "fmla	v18.4s, b2a.4s, a0a.s[3]\n"
+            "fmla	v22.4s, b2a.4s, a1a.s[3]\n"
+            "fmla	v26.4s, b2a.4s, a2a.s[3]\n"
+            "fmla	v30.4s, b2a.4s, a3a.s[3]\n"
+            "ldr	b2aq, [%[b_ptr], #32]\n"
+
+            "fmla	v19.4s, b3a.4s, a0a.s[3]\n"
+            "fmla	v23.4s, b3a.4s, a1a.s[3]\n"
+            "fmla	v27.4s, b3a.4s, a2a.s[3]\n"
+            "fmla	v31.4s, b3a.4s, a3a.s[3]\n"
+            "bne	1b\n"
+
+            // Skip to here
+            "4:\n"
+
+            // Detached final iteration
+            // Unroll 0
+            "fmla	v16.4s, bb0.4s, a0.s[0]\n"
+            "fmla	v20.4s, bb0.4s, a1.s[0]\n"
+            "ldr	b3aq, [%[b_ptr], #48]\n"
+            "fmla	v24.4s, bb0.4s, a2.s[0]\n"
+            "add	%[b_ptr], %[b_ptr], #64\n"
+            "fmla	v28.4s, bb0.4s, a3.s[0]\n"
+            "ldr	b0q, [%[b_ptr]]\n"
+
+            "fmla	v17.4s, bb1.4s, a0.s[0]\n"
+            "cbnz	%w[oddk], 2f\n" // Deal with odd K before we load a0a
+            "fmla	v21.4s, bb1.4s, a1.s[0]\n"
+            "ldr	a0aq, [%[a_ptr0], #16]\n"
+            "fmla	v25.4s, bb1.4s, a2.s[0]\n"
+            "fmla	v29.4s, bb1.4s, a3.s[0]\n"
+            "ldr	b1q, [%[b_ptr], #16]\n"
+
+            "fmla	v18.4s, bb2.4s, a0.s[0]\n"
+            "fmla	v22.4s, bb2.4s, a1.s[0]\n"
+            "ldr	a1aq, [%[a_ptr1], #16]\n"
+            "fmla	v26.4s, bb2.4s, a2.s[0]\n"
+            "fmla	v30.4s, bb2.4s, a3.s[0]\n"
+            "ldr	b2q, [%[b_ptr], #32]\n"
+
+            "fmla	v19.4s, bb3.4s, a0.s[0]\n"
+            "fmla	v23.4s, bb3.4s, a1.s[0]\n"
+            "ldr	a2aq, [%[a_ptr2], #16]\n"
+            "fmla	v27.4s, bb3.4s, a2.s[0]\n"
+            "fmla	v31.4s, bb3.4s, a3.s[0]\n"
+            "ldr	b3q, [%[b_ptr], #48]\n"
+
+            // Unroll 1
+            "fmla	v16.4s, b0a.4s, a0.s[1]\n"
+            "add	%[b_ptr], %[b_ptr], #64\n"
+            "fmla	v20.4s, b0a.4s, a1.s[1]\n"
+            "ldr	a3aq, [%[a_ptr3], #16]\n"
+            "fmla	v24.4s, b0a.4s, a2.s[1]\n"
+            "fmla	v28.4s, b0a.4s, a3.s[1]\n"
+            "ldr	b0aq, [%[b_ptr]]\n"
+
+            "fmla	v17.4s, b1a.4s, a0.s[1]\n"
+            "add	%[a_ptr0], %[a_ptr0], #32\n"
+            "fmla	v21.4s, b1a.4s, a1.s[1]\n"
+            "add	%[a_ptr1], %[a_ptr1], %[a_incr1]\n"
+            "fmla	v25.4s, b1a.4s, a2.s[1]\n"
+            "add	%[a_ptr2], %[a_ptr2], %[a_incr2]\n"
+            "fmla	v29.4s, b1a.4s, a3.s[1]\n"
+            "ldr	b1aq, [%[b_ptr], #16]\n"
+
+            "fmla	v18.4s, b2a.4s, a0.s[1]\n"
+            "fmla	v22.4s, b2a.4s, a1.s[1]\n"
+            "add	%[a_ptr3], %[a_ptr3], %[a_incr3]\n"
+            "fmla	v26.4s, b2a.4s, a2.s[1]\n"
+            "fmla	v30.4s, b2a.4s, a3.s[1]\n"
+            "ldr	b2aq, [%[b_ptr], #32]\n"
+
+            "fmla	v19.4s, b3a.4s, a0.s[1]\n"
+            "fmla	v23.4s, b3a.4s, a1.s[1]\n"
+            "fmla	v27.4s, b3a.4s, a2.s[1]\n"
+            "fmla	v31.4s, b3a.4s, a3.s[1]\n"
+            "ldr	b3aq, [%[b_ptr], #48]\n"
+
+            // Unroll 2
+            "fmla	v16.4s, bb0.4s, a0.s[2]\n"
+            "fmla	v20.4s, bb0.4s, a1.s[2]\n"
+            "add	%[b_ptr], %[b_ptr], #64\n"
+            "fmla	v24.4s, bb0.4s, a2.s[2]\n"
+            "fmla	v28.4s, bb0.4s, a3.s[2]\n"
+            "ldr	b0q, [%[b_ptr]]\n"
+
+            "fmla	v17.4s, bb1.4s, a0.s[2]\n"
+            "fmla	v21.4s, bb1.4s, a1.s[2]\n"
+            "fmla	v25.4s, bb1.4s, a2.s[2]\n"
+            "fmla	v29.4s, bb1.4s, a3.s[2]\n"
+            "ldr	b1q, [%[b_ptr], #16]\n"
+
+            "fmla	v18.4s, bb2.4s, a0.s[2]\n"
+            "fmla	v22.4s, bb2.4s, a1.s[2]\n"
+            "fmla	v26.4s, bb2.4s, a2.s[2]\n"
+            "fmla	v30.4s, bb2.4s, a3.s[2]\n"
+            "ldr	b2q, [%[b_ptr], #32]\n"
+
+            "fmla	v19.4s, bb3.4s, a0.s[2]\n"
+            "fmla	v23.4s, bb3.4s, a1.s[2]\n"
+            "fmla	v27.4s, bb3.4s, a2.s[2]\n"
+            "fmla	v31.4s, bb3.4s, a3.s[2]\n"
+            "ldr	b3q, [%[b_ptr], #48]\n"
+
+            // Unroll 3
+            "fmla	v16.4s, b0a.4s, a0.s[3]\n"
+            "fmla	v20.4s, b0a.4s, a1.s[3]\n"
+            "add	%[b_ptr], %[b_ptr], #64\n"
+            "fmla	v24.4s, b0a.4s, a2.s[3]\n"
+            "fmla	v28.4s, b0a.4s, a3.s[3]\n"
+            "ldr	b0aq, [%[b_ptr]]\n"
+
+            "fmla	v17.4s, b1a.4s, a0.s[3]\n"
+            "fmla	v21.4s, b1a.4s, a1.s[3]\n"
+            "fmla	v25.4s, b1a.4s, a2.s[3]\n"
+            "fmla	v29.4s, b1a.4s, a3.s[3]\n"
+            "ldr	b1aq, [%[b_ptr], #16]\n"
+
+            "fmla	v18.4s, b2a.4s, a0.s[3]\n"
+            "fmla	v22.4s, b2a.4s, a1.s[3]\n"
+            "fmla	v26.4s, b2a.4s, a2.s[3]\n"
+            "fmla	v30.4s, b2a.4s, a3.s[3]\n"
+            "ldr	b2aq, [%[b_ptr], #32]\n"
+
+            "fmla	v19.4s, b3a.4s, a0.s[3]\n"
+            "fmla	v23.4s, b3a.4s, a1.s[3]\n"
+            "fmla	v27.4s, b3a.4s, a2.s[3]\n"
+            "fmla	v31.4s, b3a.4s, a3.s[3]\n"
+            "ldr	b3aq, [%[b_ptr], #48]\n"
+
+            // Unroll 4
+            "fmla	v16.4s, bb0.4s, a0a.s[0]\n"
+            "fmla	v20.4s, bb0.4s, a1a.s[0]\n"
+            "add	%[b_ptr], %[b_ptr], #64\n"
+            "fmla	v24.4s, bb0.4s, a2a.s[0]\n"
+            "fmla	v28.4s, bb0.4s, a3a.s[0]\n"
+            "ldr	b0q, [%[b_ptr]]\n"
+
+            "fmla	v17.4s, bb1.4s, a0a.s[0]\n"
+            "fmla	v21.4s, bb1.4s, a1a.s[0]\n"
+            "fmla	v25.4s, bb1.4s, a2a.s[0]\n"
+            "fmla	v29.4s, bb1.4s, a3a.s[0]\n"
+            "ldr	b1q, [%[b_ptr], #16]\n"
+
+            "fmla	v18.4s, bb2.4s, a0a.s[0]\n"
+            "fmla	v22.4s, bb2.4s, a1a.s[0]\n"
+            "fmla	v26.4s, bb2.4s, a2a.s[0]\n"
+            "fmla	v30.4s, bb2.4s, a3a.s[0]\n"
+            "ldr	b2q, [%[b_ptr], #32]\n"
+
+            "fmla	v19.4s, bb3.4s, a0a.s[0]\n"
+            "fmla	v23.4s, bb3.4s, a1a.s[0]\n"
+            "fmla	v27.4s, bb3.4s, a2a.s[0]\n"
+            "fmla	v31.4s, bb3.4s, a3a.s[0]\n"
+            "ldr	b3q, [%[b_ptr], #48]\n"
+
+            // Unroll 5
+            "fmla	v16.4s, b0a.4s, a0a.s[1]\n"
+            "fmla	v20.4s, b0a.4s, a1a.s[1]\n"
+            "add	%[b_ptr], %[b_ptr], #64\n"
+            "fmla	v24.4s, b0a.4s, a2a.s[1]\n"
+            "fmla	v28.4s, b0a.4s, a3a.s[1]\n"
+            "ldr	b0aq, [%[b_ptr]]\n"
+
+            "fmla	v17.4s, b1a.4s, a0a.s[1]\n"
+            "fmla	v21.4s, b1a.4s, a1a.s[1]\n"
+            "fmla	v25.4s, b1a.4s, a2a.s[1]\n"
+            "fmla	v29.4s, b1a.4s, a3a.s[1]\n"
+            "ldr	b1aq, [%[b_ptr], #16]\n"
+
+            "fmla	v18.4s, b2a.4s, a0a.s[1]\n"
+            "fmla	v22.4s, b2a.4s, a1a.s[1]\n"
+            "fmla	v26.4s, b2a.4s, a2a.s[1]\n"
+            "fmla	v30.4s, b2a.4s, a3a.s[1]\n"
+            "ldr	b2aq, [%[b_ptr], #32]\n"
+
+            "fmla	v19.4s, b3a.4s, a0a.s[1]\n"
+            "fmla	v23.4s, b3a.4s, a1a.s[1]\n"
+            "fmla	v27.4s, b3a.4s, a2a.s[1]\n"
+            "fmla	v31.4s, b3a.4s, a3a.s[1]\n"
+            "ldr	b3aq, [%[b_ptr], #48]\n"
+
+            // Unroll 6
+            "fmla	v16.4s, bb0.4s, a0a.s[2]\n"
+            "add	%[b_ptr], %[b_ptr], #64\n"
+            "fmla	v20.4s, bb0.4s, a1a.s[2]\n"
+            ASM_PREFETCH("[%[c_ptr0], #0x40]")
+            "fmla	v24.4s, bb0.4s, a2a.s[2]\n"
+            "fmla	v28.4s, bb0.4s, a3a.s[2]\n"
+
+            "fmla	v17.4s, bb1.4s, a0a.s[2]\n"
+            "fmla	v21.4s, bb1.4s, a1a.s[2]\n"
+            ASM_PREFETCH("[%[c_ptr1], #0x40]")
+            "fmla	v25.4s, bb1.4s, a2a.s[2]\n"
+            "fmla	v29.4s, bb1.4s, a3a.s[2]\n"
+
+            "fmla	v18.4s, bb2.4s, a0a.s[2]\n"
+            "fmla	v22.4s, bb2.4s, a1a.s[2]\n"
+            ASM_PREFETCH("[%[c_ptr2], #0x40]")
+            "fmla	v26.4s, bb2.4s, a2a.s[2]\n"
+            "fmla	v30.4s, bb2.4s, a3a.s[2]\n"
+
+            "fmla	v19.4s, bb3.4s, a0a.s[2]\n"
+            "fmla	v23.4s, bb3.4s, a1a.s[2]\n"
+            ASM_PREFETCH("[%[c_ptr3], #0x40]")
+            "fmla	v27.4s, bb3.4s, a2a.s[2]\n"
+            "fmla	v31.4s, bb3.4s, a3a.s[2]\n"
+
+            // Unroll 7
+            "fmla	v16.4s, b0a.4s, a0a.s[3]\n"
+            "fmla	v17.4s, b1a.4s, a0a.s[3]\n"
+            "fmla	v18.4s, b2a.4s, a0a.s[3]\n"
+            "fmla	v19.4s, b3a.4s, a0a.s[3]\n"
+            "cbnz	%w[odds], 6f\n"
+
+            "fmla	v20.4s, b0a.4s, a1a.s[3]\n"
+            "str	q16, [%[c_ptr0]]\n"
+            "fmla	v21.4s, b1a.4s, a1a.s[3]\n"
+            "str	q17, [%[c_ptr0], #16]\n"
+            "fmla	v22.4s, b2a.4s, a1a.s[3]\n"
+            "str	q18, [%[c_ptr0], #32]\n"
+            "fmla	v23.4s, b3a.4s, a1a.s[3]\n"
+            "str	q19, [%[c_ptr0], #48]\n"
+
+            "fmla	v24.4s, b0a.4s, a2a.s[3]\n"
+            "str	q20, [%[c_ptr1]]\n"
+            "fmla	v25.4s, b1a.4s, a2a.s[3]\n"
+            "str	q21, [%[c_ptr1], #16]\n"
+            "fmla	v26.4s, b2a.4s, a2a.s[3]\n"
+            "str	q22, [%[c_ptr1], #32]\n"
+            "fmla	v27.4s, b3a.4s, a2a.s[3]\n"
+            "str	q23, [%[c_ptr1], #48]\n"
+
+            "fmla	v28.4s, b0a.4s, a3a.s[3]\n"
+            "str	q24, [%[c_ptr2]]\n"
+            "fmla	v29.4s, b1a.4s, a3a.s[3]\n"
+            "str	q25, [%[c_ptr2], #16]\n"
+            "fmla	v30.4s, b2a.4s, a3a.s[3]\n"
+            "str	q26, [%[c_ptr2], #32]\n"
+            "fmla	v31.4s, b3a.4s, a3a.s[3]\n"
+            "str	q27, [%[c_ptr2], #48]\n"
+            "b	3f\n"
+
+            // Odd K case: Just do 4 more.
+            "2:\n"
+            "fmla	v21.4s, bb1.4s, a1.s[0]\n"
+            "add	%[a_ptr0], %[a_ptr0], #16\n"
+            "fmla	v25.4s, bb1.4s, a2.s[0]\n"
+            "add	%[a_ptr1], %[a_ptr1], #16\n"
+            "fmla	v29.4s, bb1.4s, a3.s[0]\n"
+            "ldr	b1q, [%[b_ptr], #16]\n"
+
+            "fmla	v18.4s, bb2.4s, a0.s[0]\n"
+            "add	%[a_ptr2], %[a_ptr2], #16\n"
+            "fmla	v22.4s, bb2.4s, a1.s[0]\n"
+            "add	%[a_ptr3], %[a_ptr3], #16\n"
+            "fmla	v26.4s, bb2.4s, a2.s[0]\n"
+            "fmla	v30.4s, bb2.4s, a3.s[0]\n"
+            "ldr	b2q, [%[b_ptr], #32]\n"
+
+            "fmla	v19.4s, bb3.4s, a0.s[0]\n"
+            "fmla	v23.4s, bb3.4s, a1.s[0]\n"
+            "fmla	v27.4s, bb3.4s, a2.s[0]\n"
+            "fmla	v31.4s, bb3.4s, a3.s[0]\n"
+            "ldr	b3q, [%[b_ptr], #48]\n"
+
+            // Unroll 1
+            "fmla	v16.4s, b0a.4s, a0.s[1]\n"
+            "add	%[b_ptr], %[b_ptr], #64\n"
+            "fmla	v20.4s, b0a.4s, a1.s[1]\n"
+            "fmla	v24.4s, b0a.4s, a2.s[1]\n"
+            "fmla	v28.4s, b0a.4s, a3.s[1]\n"
+            "ldr	b0aq, [%[b_ptr]]\n"
+
+            "fmla	v17.4s, b1a.4s, a0.s[1]\n"
+            "fmla	v21.4s, b1a.4s, a1.s[1]\n"
+            "fmla	v25.4s, b1a.4s, a2.s[1]\n"
+            "fmla	v29.4s, b1a.4s, a3.s[1]\n"
+            "ldr	b1aq, [%[b_ptr], #16]\n"
+
+            "fmla	v18.4s, b2a.4s, a0.s[1]\n"
+            "fmla	v22.4s, b2a.4s, a1.s[1]\n"
+            "fmla	v26.4s, b2a.4s, a2.s[1]\n"
+            "fmla	v30.4s, b2a.4s, a3.s[1]\n"
+            "ldr	b2aq, [%[b_ptr], #32]\n"
+
+            "fmla	v19.4s, b3a.4s, a0.s[1]\n"
+            "fmla	v23.4s, b3a.4s, a1.s[1]\n"
+            "fmla	v27.4s, b3a.4s, a2.s[1]\n"
+            "fmla	v31.4s, b3a.4s, a3.s[1]\n"
+            "ldr	b3aq, [%[b_ptr], #48]\n"
+
+            // Unroll 2
+            "fmla	v16.4s, bb0.4s, a0.s[2]\n"
+            "add	%[b_ptr], %[b_ptr], #64\n"
+            "fmla	v20.4s, bb0.4s, a1.s[2]\n"
+            ASM_PREFETCH("[%[c_ptr0], #0x40]")
+            "fmla	v24.4s, bb0.4s, a2.s[2]\n"
+            "fmla	v28.4s, bb0.4s, a3.s[2]\n"
+
+            "fmla	v17.4s, bb1.4s, a0.s[2]\n"
+            "fmla	v21.4s, bb1.4s, a1.s[2]\n"
+            ASM_PREFETCH("[%[c_ptr1], #0x40]")
+            "fmla	v25.4s, bb1.4s, a2.s[2]\n"
+            "fmla	v29.4s, bb1.4s, a3.s[2]\n"
+
+            "fmla	v18.4s, bb2.4s, a0.s[2]\n"
+            "fmla	v22.4s, bb2.4s, a1.s[2]\n"
+            ASM_PREFETCH("[%[c_ptr2], #0x40]")
+            "fmla	v26.4s, bb2.4s, a2.s[2]\n"
+            "fmla	v30.4s, bb2.4s, a3.s[2]\n"
+
+            "fmla	v19.4s, bb3.4s, a0.s[2]\n"
+            "fmla	v23.4s, bb3.4s, a1.s[2]\n"
+            ASM_PREFETCH("[%[c_ptr3], #0x40]")
+            "fmla	v27.4s, bb3.4s, a2.s[2]\n"
+            "fmla	v31.4s, bb3.4s, a3.s[2]\n"
+
+            // Unroll 3
+            "fmla	v16.4s, b0a.4s, a0.s[3]\n"
+            "fmla	v17.4s, b1a.4s, a0.s[3]\n"
+            "fmla	v18.4s, b2a.4s, a0.s[3]\n"
+            "fmla	v19.4s, b3a.4s, a0.s[3]\n"
+            "cbnz	%w[odds], 7f\n"
+
+            "fmla	v20.4s, b0a.4s, a1.s[3]\n"
+            "str	q16, [%[c_ptr0]]\n"
+            "fmla	v21.4s, b1a.4s, a1.s[3]\n"
+            "str	q17, [%[c_ptr0], #16]\n"
+            "fmla	v22.4s, b2a.4s, a1.s[3]\n"
+            "str	q18, [%[c_ptr0], #32]\n"
+            "fmla	v23.4s, b3a.4s, a1.s[3]\n"
+            "str	q19, [%[c_ptr0], #48]\n"
+
+            "fmla	v24.4s, b0a.4s, a2.s[3]\n"
+            "str	q20, [%[c_ptr1]]\n"
+            "fmla	v25.4s, b1a.4s, a2.s[3]\n"
+            "str	q21, [%[c_ptr1], #16]\n"
+            "fmla	v26.4s, b2a.4s, a2.s[3]\n"
+            "str	q22, [%[c_ptr1], #32]\n"
+            "fmla	v27.4s, b3a.4s, a2.s[3]\n"
+            "str	q23, [%[c_ptr1], #48]\n"
+
+            "fmla	v28.4s, b0a.4s, a3.s[3]\n"
+            "str	q24, [%[c_ptr2]]\n"
+            "fmla	v29.4s, b1a.4s, a3.s[3]\n"
+            "str	q25, [%[c_ptr2], #16]\n"
+            "fmla	v30.4s, b2a.4s, a3.s[3]\n"
+            "str	q26, [%[c_ptr2], #32]\n"
+            "fmla	v31.4s, b3a.4s, a3.s[3]\n"
+            "str	q27, [%[c_ptr2], #48]\n"
+            "b	3f\n"
+
+            // "Odd ones" - lead in from even
+            "6:\n"
+            "fmla	v20.4s, b0a.4s, a1a.s[3]\n"
+            "fmla	v21.4s, b1a.4s, a1a.s[3]\n"
+            "ldr	b0q, [%[b_ptr]]\n"
+            "fmla	v22.4s, b2a.4s, a1a.s[3]\n"
+            "subs	%w[odds], %w[odds], #1\n"
+            "fmla	v23.4s, b3a.4s, a1a.s[3]\n"
+            "ldr	b1q, [%[b_ptr], #16]\n"
+
+            "fmla	v24.4s, b0a.4s, a2a.s[3]\n"
+            "fmla	v25.4s, b1a.4s, a2a.s[3]\n"
+            "ldr	b2q, [%[b_ptr], #32]\n"
+            "fmla	v26.4s, b2a.4s, a2a.s[3]\n"
+            "fmla	v27.4s, b3a.4s, a2a.s[3]\n"
+            "ldr	b3q, [%[b_ptr], #48]\n"
+
+            "fmla	v28.4s, b0a.4s, a3a.s[3]\n"
+            "ld1r	{a0.4s}, [%[a_ptr0]], #4\n"
+            "fmla	v29.4s, b1a.4s, a3a.s[3]\n"
+            "fmla	v30.4s, b2a.4s, a3a.s[3]\n"
+            "ld1r	{a1.4s}, [%[a_ptr1]], #4\n"
+            "fmla	v31.4s, b3a.4s, a3a.s[3]\n"
+
+            "fmla	v16.4s, bb0.4s, a0.4s\n"
+            "beq	9f\n"
+            "b	8f\n"
+
+            // "Odd ones" - lead in from odd
+            "7:\n"
+            "fmla	v20.4s, b0a.4s, a1.s[3]\n"
+            "subs	%w[odds], %w[odds], #1\n"
+            "fmla	v21.4s, b1a.4s, a1.s[3]\n"
+            "ldr	b0q, [%[b_ptr]]\n"
+            "fmla	v22.4s, b2a.4s, a1.s[3]\n"
+            "fmla	v23.4s, b3a.4s, a1.s[3]\n"
+            "ldr	b1q, [%[b_ptr], #16]\n"
+
+            "fmla	v24.4s, b0a.4s, a2.s[3]\n"
+            "fmla	v25.4s, b1a.4s, a2.s[3]\n"
+            "ldr	b2q, [%[b_ptr], #32]\n"
+            "fmla	v26.4s, b2a.4s, a2.s[3]\n"
+            "fmla	v27.4s, b3a.4s, a2.s[3]\n"
+            "ldr	b3q, [%[b_ptr], #48]\n"
+
+            "fmla	v28.4s, b0a.4s, a3.s[3]\n"
+            "ld1r	{a0.4s}, [%[a_ptr0]], #4\n"
+            "fmla	v29.4s, b1a.4s, a3.s[3]\n"
+            "fmla	v30.4s, b2a.4s, a3.s[3]\n"
+            "ld1r	{a1.4s}, [%[a_ptr1]], #4\n"
+            "fmla	v31.4s, b3a.4s, a3.s[3]\n"
+
+            "fmla	v16.4s, bb0.4s, a0.4s\n"
+            "beq	9f\n"
+
+            // "Odd ones" - loop
+            "8:\n"
+            "fmla	v17.4s, bb1.4s, a0.4s\n"
+            "ld1r	{a2.4s}, [%[a_ptr2]], #4\n"
+            "fmla	v18.4s, bb2.4s, a0.4s\n"
+            "add	%[b_ptr], %[b_ptr], #64\n"
+            "fmla	v19.4s, bb3.4s, a0.4s\n"
+            "ld1r	{a3.4s}, [%[a_ptr3]], #4\n"
+
+            "fmla	v20.4s, bb0.4s, a1.4s\n"
+            "subs	%w[odds], %w[odds], #1\n"
+            "fmla	v21.4s, bb1.4s, a1.4s\n"
+            "ld1r	{a0.4s}, [%[a_ptr0]], #4\n"
+            "fmla	v22.4s, bb2.4s, a1.4s\n"
+            "fmla	v23.4s, bb3.4s, a1.4s\n"
+            "ld1r	{a1.4s}, [%[a_ptr1]], #4\n"
+
+            "fmla	v24.4s, bb0.4s, a2.4s\n"
+            "fmla	v28.4s, bb0.4s, a3.4s\n"
+            "ldr	b0q, [%[b_ptr]]\n"
+            "fmla	v25.4s, bb1.4s, a2.4s\n"
+            "fmla	v29.4s, bb1.4s, a3.4s\n"
+            "ldr	b1q, [%[b_ptr], #16]\n"
+
+            "fmla	v26.4s, bb2.4s, a2.4s\n"
+            "fmla	v30.4s, bb2.4s, a3.4s\n"
+            "ldr	b2q, [%[b_ptr], #32]\n"
+            "fmla	v27.4s, bb3.4s, a2.4s\n"
+            "fmla	v31.4s, bb3.4s, a3.4s\n"
+            "ldr	b3q, [%[b_ptr], #48]\n"
+            "fmla	v16.4s, bb0.4s, a0.4s\n"
+            "bne	8b\n"
+
+            // "Odd ones" - detached final iteration
+            "9:\n"
+            "fmla	v17.4s, bb1.4s, a0.4s\n"
+            "ld1r	{a2.4s}, [%[a_ptr2]], #4\n"
+            "fmla	v18.4s, bb2.4s, a0.4s\n"
+            "add	%[b_ptr], %[b_ptr], #64\n"
+            "fmla	v19.4s, bb3.4s, a0.4s\n"
+            "ld1r	{a3.4s}, [%[a_ptr3]], #4\n"
+
+            "fmla	v20.4s, bb0.4s, a1.4s\n"
+            "str	q16, [%[c_ptr0]]\n"
+            "fmla	v21.4s, bb1.4s, a1.4s\n"
+            "str	q17, [%[c_ptr0], #16]\n"
+            "fmla	v22.4s, bb2.4s, a1.4s\n"
+            "str	q18, [%[c_ptr0], #32]\n"
+            "fmla	v23.4s, bb3.4s, a1.4s\n"
+            "str	q19, [%[c_ptr0], #48]\n"
+
+            "fmla	v24.4s, bb0.4s, a2.4s\n"
+            "str	q20, [%[c_ptr1]]\n"
+            "fmla	v25.4s, bb1.4s, a2.4s\n"
+            "str	q21, [%[c_ptr1], #16]\n"
+            "fmla	v26.4s, bb2.4s, a2.4s\n"
+            "str	q22, [%[c_ptr1], #32]\n"
+            "fmla	v27.4s, bb3.4s, a2.4s\n"
+            "str	q23, [%[c_ptr1], #48]\n"
+
+            "fmla	v28.4s, bb0.4s, a3.4s\n"
+            "str	q24, [%[c_ptr2]]\n"
+            "fmla	v29.4s, bb1.4s, a3.4s\n"
+            "str	q25, [%[c_ptr2], #16]\n"
+            "fmla	v30.4s, bb2.4s, a3.4s\n"
+            "str	q26, [%[c_ptr2], #32]\n"
+            "fmla	v31.4s, bb3.4s, a3.4s\n"
+            "str	q27, [%[c_ptr2], #48]\n"
+
+            "3:\n"
+            "str	q28, [%[c_ptr3]]\n"
+            // Increment C pointers for next loop - this looks odd if we
+            // are using the result buffer, but it's OK as using the
+            // result buffer implies there will be no next loop.
+            "add	%[c_ptr0], %[c_ptr0], #64\n"
+            "str	q29, [%[c_ptr3], #16]\n"
+            "add	%[c_ptr1], %[c_ptr1], %[a_incr1], LSL #1\n"
+            "str	q30, [%[c_ptr3], #32]\n"
+            "add	%[c_ptr2], %[c_ptr2], %[a_incr2], LSL #1\n"
+            "str	q31, [%[c_ptr3], #48]\n"
+            "add	%[c_ptr3], %[c_ptr3], %[a_incr3], LSL #1\n"
+
+            : [a_ptr0] "+r" (a_ptr0), [a_ptr1] "+r" (a_ptr1), [a_ptr2] "+r" (a_ptr2), [a_ptr3] "+r" (a_ptr3),
+            [b_ptr] "+r" (b_ptr), [loops] "+r" (loops), [odds] "+r" (odds),
+            [c_ptr0] "+r" (c_ptr0), [c_ptr1] "+r" (c_ptr1), [c_ptr2] "+r" (c_ptr2), [c_ptr3] "+r" (c_ptr3)
+            : [oddk] "r" (oddk), [beta0] "r" (beta0), [betaptr] "r" (&beta),
+            [a_incr1] "r" (a_incr1), [a_incr2] "r" (a_incr2), [a_incr3] "r" (a_incr3)
+            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
+                    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31",
+                    "cc", "memory"
+            );
+
+            /* Copy results from result buffer if needed. */
+            if (use_result_buf) {
+                for (unsigned int row=0; row<active_rows; row++) {
+                    for (unsigned int col=0; col<active_cols; col++) {
+                        C[((y + row) * ldc) + (x0 + col)] = C_buf[row * 16 + col];
+                    }
+                }
+            }
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
\ No newline at end of file
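
The kernel above always computes a full 4x16 tile of C; when the tile would run past the edge of the output, the c_ptr registers are pointed at a small stack buffer instead of C, and the epilogue copies only the in-range elements back (the use_result_buf block at the end of the function). A minimal sketch of that copy-back, with hypothetical names and assuming the same 4x16 tile shape as above:

    // Sketch only: scatter a fully computed 4x16 tile into C, clipped to the
    // active_rows x active_cols region that actually lies inside the output.
    #include <algorithm>

    static void copy_partial_tile(float *C, int ldc, int M, int N, int y, int x0,
                                  const float (&C_buf)[4 * 16])
    {
        const int active_rows = std::min(4, M - y);
        const int active_cols = std::min(16, N - x0);
        for (int row = 0; row < active_rows; row++) {
            for (int col = 0; col < active_cols; col++) {
                C[(y + row) * ldc + (x0 + col)] = C_buf[row * 16 + col];
            }
        }
    }

As the in-asm comment notes, the pointer increments after label 3 are harmless in the buffered case, because a pass that uses the result buffer is the last pass for those pointers.
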
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp
index 1a35965..3d2b324 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,15 +46,15 @@
     typedef void (*kern_type)(const float *, int, const float *, int, float *, int, float, int, int, int);
 
     /* Kernel blocking parameters */
-    static int out_width() {
+    static unsigned int out_width() {
         return 16;
     }
 
-    static int out_height() {
+    static unsigned int out_height() {
         return 4;
     }
 
-    static int k_unroll() {
+    static unsigned int k_unroll() {
         return 1;
     }
 
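
The hunk above only widens the return type of the blocking parameters from int to unsigned int; the values themselves still describe the 16x4 tile of C this kernel produces per call. A hypothetical caller (not the library's own driver code) would consume them in unsigned ceil-division arithmetic along these lines:

    // Hypothetical use of the blocking parameters; Strategy stands for a kernel
    // class such as the one above, with out_width() == 16 and out_height() == 4.
    template <typename Strategy>
    void count_tiles(unsigned int M, unsigned int N,
                     unsigned int &m_tiles, unsigned int &n_tiles)
    {
        const unsigned int mb = Strategy::out_height(); // rows of C per kernel call
        const unsigned int nb = Strategy::out_width();  // columns of C per kernel call
        m_tiles = (M + mb - 1) / mb;                    // ceil(M / mb)
        n_tiles = (N + nb - 1) / nb;                    // ceil(N / nb)
    }
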
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp
index a73bc76..f5b4f4a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,13 +46,26 @@
      * terms of this standard arrangement, so if the A matrix is in fact the
      * B matrix from a GEMM call, the sense of the transpose needs to be
      * reversed.  */
-    static const int A_interleave = 32;
-    static const int A_block = 1;
-    static const bool A_transpose = false;
+    static constexpr unsigned int A_interleave() {
+        return 32;
+    }
+
+    static constexpr unsigned int A_block() {
+        return 1;
+    }
+
+    static constexpr bool A_transpose() {
+        return false;
+    }
 
     /* Kernel blocking parameters */
-    static const int out_width = 32;
-    static const int k_unroll = 1;
+    static constexpr unsigned int out_width() {
+        return 32;
+    }
+
+    static constexpr unsigned int k_unroll() {
+        return 1;
+    }
 
     kern_type kernel = a64_sgemv_pretransposed;
 
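
In this hunk the compile-time constants become static constexpr member functions. Presumably (the patch itself does not say so) this keeps the interface uniform with kernels whose blocking parameters can only be computed at run time, such as the SVE kernels added later in this patch, so generic code can always call Kernel::out_width(). A minimal sketch of the two shapes behind the one call syntax, using made-up kernel names:

    struct FixedWidthKernel {
        // Value known at compile time, as in the A64 kernels.
        static constexpr unsigned int out_width() { return 32; }
    };

    struct VectorWidthKernel {
        // Placeholder for a runtime query; real code would read the CPU's vector length.
        static unsigned int lanes_per_vector() { return 8; }
        static unsigned int out_width() { return lanes_per_vector() * 4; }
    };

    template <typename Kernel>
    unsigned int columns_per_tile() {
        return Kernel::out_width(); // identical call site for both kinds of kernel
    }
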
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp
index 18c5c3a..cbaa0cf 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -39,8 +39,13 @@
     typedef void (*kern_type)(const float *, const float *, float *, float, int, int, int);
 
     /* Kernel blocking parameters */
-    static const int out_width = 96;
-    static const int k_unroll = 1;
+    static unsigned int out_width() {
+        return 96;
+    }
+
+    static unsigned int k_unroll() {
+        return 1;
+    }
 
     kern_type kernel=a64_sgemv_trans;
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp
new file mode 100644
index 0000000..76f452d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+#include "../std_transforms_sve.hpp"
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void sve_hybrid_fp32_mla_4VLx4(const float *, int, const float *, float *, int, float, int, int, int);
+
+class hybrid_fp32_mla_4VLx4
+{
+public:
+    typedef float operand_type;
+    typedef float result_type;
+
+    typedef void (*kern_type)(const float *, int, const float *, float *, int, float, int, int, int);
+
+    /* Kernel blocking parameters */
+    static unsigned int out_height()
+    {
+        return 4;
+    }
+
+    static unsigned int out_width()
+    {
+        return get_vector_length<float>() * 4;
+    }
+
+    static unsigned int k_unroll()
+    {
+        return 1;
+    }
+
+    StdTransformsSVE<operand_type, result_type, 4, 4, 1> transforms = {};
+
+    // Default to the generic kernel
+    kern_type kernel=sve_hybrid_fp32_mla_4VLx4;
+
+    hybrid_fp32_mla_4VLx4(const CPUInfo *ci)
+    {
+
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
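
Unlike the fixed-width A64 kernels, this strategy's out_width() is four SVE vector lengths of floats, so the tile written per call scales with the hardware: 4x16 on a 128-bit SVE implementation, 4x32 at 256 bits, 4x64 at 512 bits, while out_height() stays at 4 (presumably the two 4s in the 4VLx4 name). A small illustration using the ACLE intrinsic svcntw(), which returns the number of 32-bit lanes per vector and needs an SVE-enabled toolchain:

    #include <arm_sve.h>

    // Width in floats of one 4VLx4 output tile on the machine this runs on.
    static unsigned int tile_width_4vl()
    {
        const unsigned int lanes = static_cast<unsigned int>(svcntw()); // 32-bit lanes per vector
        return lanes * 4; // e.g. 4 lanes * 4 vectors = 16 floats on 128-bit SVE
    }
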
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp
new file mode 100644
index 0000000..b8aa825
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp
@@ -0,0 +1,2005 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_hybrid_fp32_mla_4VLx4(const float *A, int lda, const float *B, float *C, int ldc, float beta, int M, int N, int K) {
+    const long beta0 = (beta == 0.0f);          // when beta is zero, C is never read and the accumulators start from zero
+    const int K_stride = K;                     // B holds K_stride (= K) elements per output column
+    const long loops_count = ((K + 4) / 8) - 1; // iterations of the main loop, each consuming 8 values of K
+    K -= loops_count * 8;
+    const long regs_count = (K / 4) - 1;        // non-zero selects the longer K tail path after the main loop
+    K -= (regs_count + 1) * 4;
+    const long leftovers = K;                   // 0-3 trailing K values, loaded under predicate p6 via the blocks counter
+
+    for (int y=0; y<M; y+=4) {
+        const float * const a_ptr0_base = A + (y * lda);
+        const unsigned long ldab = lda * sizeof(float);
+
+        float *c_ptr0 = C + (y * ldc);
+        const unsigned long ldcb = ldc * sizeof(float);
+
+        for (int x0=0; x0<N; x0+=(4 * get_vector_length<float>())) {
+            const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<float>()));
+            const float *betaptr = &beta;
+            long loops = loops_count;
+            long regs = regs_count;
+            long temp = 0;
+            long blocks = leftovers;
+            const float *a_ptr0 = a_ptr0_base;
+            const float *b_ptr0 = B + (K_stride * x0);
+
+            switch(M-y) {
+                case 1:
+                    __asm __volatile (
+                        "whilelt p6.s, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "ptrue p7.s\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p2.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p3.s, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.s, #0\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.s, #0\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+                        "mov z18.s, #0\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z19.s, #0\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "b 2f\n"
+                        "1:\n"
+                        "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "fmul z16.s, p7/m, z16.s, z15.s\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+                        "fmul z17.s, p7/m, z17.s, z15.s\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmul z18.s, p7/m, z18.s, z15.s\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmul z19.s, p7/m, z19.s, z15.s\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "2:\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 3f\n"
+                        "4:\n"
+                        "fmla z16.s, z8.s, z0.s[0]\n"
+                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+                        "fmla z17.s, z9.s, z0.s[0]\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+                        "fmla z18.s, z10.s, z0.s[0]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.s, z11.s, z0.s[0]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla z16.s, z12.s, z0.s[1]\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.s, z13.s, z0.s[1]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.s, z14.s, z0.s[1]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.s, z15.s, z0.s[1]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z16.s, z8.s, z0.s[2]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla z17.s, z9.s, z0.s[2]\n"
+                        "fmla z18.s, z10.s, z0.s[2]\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z19.s, z11.s, z0.s[2]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[3]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z17.s, z13.s, z0.s[3]\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z18.s, z14.s, z0.s[3]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z19.s, z15.s, z0.s[3]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[0]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "fmla z17.s, z9.s, z4.s[0]\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+                        "fmla z18.s, z10.s, z4.s[0]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.s, z11.s, z4.s[0]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z16.s, z12.s, z4.s[1]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.s, z13.s, z4.s[1]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.s, z14.s, z4.s[1]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.s, z15.s, z4.s[1]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z17.s, z9.s, z4.s[2]\n"
+                        "fmla z18.s, z10.s, z4.s[2]\n"
+                        "fmla z19.s, z11.s, z4.s[2]\n"
+                        "fmla z16.s, z12.s, z4.s[3]\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z17.s, z13.s, z4.s[3]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z18.s, z14.s, z4.s[3]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z19.s, z15.s, z4.s[3]\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "b.ne 4b\n"
+                        "3:\n"
+                        "cbz %[regs], 5f\n"
+                        "fmla z16.s, z8.s, z0.s[0]\n"
+                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+                        "fmla z17.s, z9.s, z0.s[0]\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+                        "fmla z18.s, z10.s, z0.s[0]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.s, z11.s, z0.s[0]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[1]\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.s, z13.s, z0.s[1]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.s, z14.s, z0.s[1]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.s, z15.s, z0.s[1]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z16.s, z8.s, z0.s[2]\n"
+                        "fmla z17.s, z9.s, z0.s[2]\n"
+                        "fmla z18.s, z10.s, z0.s[2]\n"
+                        "fmla z19.s, z11.s, z0.s[2]\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[3]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z17.s, z13.s, z0.s[3]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z18.s, z14.s, z0.s[3]\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z19.s, z15.s, z0.s[3]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[0]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z17.s, z9.s, z4.s[0]\n"
+                        "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+                        "fmla z18.s, z10.s, z4.s[0]\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+                        "fmla z19.s, z11.s, z4.s[0]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z16.s, z12.s, z4.s[1]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z17.s, z13.s, z4.s[1]\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z18.s, z14.s, z4.s[1]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z19.s, z15.s, z4.s[1]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[2]\n"
+                        "fmla z17.s, z9.s, z4.s[2]\n"
+                        "fmla z18.s, z10.s, z4.s[2]\n"
+                        "fmla z19.s, z11.s, z4.s[2]\n"
+                        "fmla z16.s, z12.s, z4.s[3]\n"
+                        "fmla z17.s, z13.s, z4.s[3]\n"
+                        "fmla z18.s, z14.s, z4.s[3]\n"
+                        "fmla z19.s, z15.s, z4.s[3]\n"
+                        "cbz %[blocks], 6f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.s, z8.s, z0.s[0]\n"
+                        "fmla z17.s, z9.s, z0.s[0]\n"
+                        "fmla z18.s, z10.s, z0.s[0]\n"
+                        "fmla z19.s, z11.s, z0.s[0]\n"
+                        "b.eq 6f\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[1]\n"
+                        "fmla z17.s, z13.s, z0.s[1]\n"
+                        "fmla z18.s, z14.s, z0.s[1]\n"
+                        "fmla z19.s, z15.s, z0.s[1]\n"
+                        "b.eq 6f\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z0.s[2]\n"
+                        "fmla z17.s, z9.s, z0.s[2]\n"
+                        "fmla z18.s, z10.s, z0.s[2]\n"
+                        "fmla z19.s, z11.s, z0.s[2]\n"
+                        "b 6f\n"
+                        "5:\n"
+                        "fmla z16.s, z8.s, z0.s[0]\n"
+                        "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+                        "fmla z17.s, z9.s, z0.s[0]\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+                        "fmla z18.s, z10.s, z0.s[0]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.s, z11.s, z0.s[0]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[1]\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.s, z13.s, z0.s[1]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.s, z14.s, z0.s[1]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.s, z15.s, z0.s[1]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.s, z8.s, z0.s[2]\n"
+                        "fmla z17.s, z9.s, z0.s[2]\n"
+                        "fmla z18.s, z10.s, z0.s[2]\n"
+                        "fmla z19.s, z11.s, z0.s[2]\n"
+                        "fmla z16.s, z12.s, z0.s[3]\n"
+                        "fmla z17.s, z13.s, z0.s[3]\n"
+                        "fmla z18.s, z14.s, z0.s[3]\n"
+                        "fmla z19.s, z15.s, z0.s[3]\n"
+                        "cbz %[blocks], 6f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[0]\n"
+                        "fmla z17.s, z9.s, z4.s[0]\n"
+                        "fmla z18.s, z10.s, z4.s[0]\n"
+                        "fmla z19.s, z11.s, z4.s[0]\n"
+                        "b.eq 6f\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.s, z12.s, z4.s[1]\n"
+                        "fmla z17.s, z13.s, z4.s[1]\n"
+                        "fmla z18.s, z14.s, z4.s[1]\n"
+                        "fmla z19.s, z15.s, z4.s[1]\n"
+                        "b.eq 6f\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[2]\n"
+                        "fmla z17.s, z9.s, z4.s[2]\n"
+                        "fmla z18.s, z10.s, z4.s[2]\n"
+                        "fmla z19.s, z11.s, z4.s[2]\n"
+                        "6:\n"
+                        "st1w z16.s, p0, [%[c_ptr0]]\n"
+                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                    );
+                    break;
+                case 2:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "c_ptr1 .req X1\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "whilelt p6.s, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "ptrue p7.s\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p2.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p3.s, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.s, #0\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.s, #0\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                        "mov z18.s, #0\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+                        "mov z19.s, #0\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z20.s, #0\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "mov z21.s, #0\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z22.s, #0\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "mov z23.s, #0\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "b 2f\n"
+                        "1:\n"
+                        "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "ld1w z20.s, p0/z, [c_ptr1]\n"
+                        "fmul z16.s, p7/m, z16.s, z15.s\n"
+                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "fmul z17.s, p7/m, z17.s, z15.s\n"
+                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "fmul z18.s, p7/m, z18.s, z15.s\n"
+                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "fmul z19.s, p7/m, z19.s, z15.s\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                        "fmul z20.s, p7/m, z20.s, z15.s\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                        "fmul z21.s, p7/m, z21.s, z15.s\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+                        "fmul z22.s, p7/m, z22.s, z15.s\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmul z23.s, p7/m, z23.s, z15.s\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "2:\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 3f\n"
+                        "4:\n"
+                        "fmla z16.s, z8.s, z0.s[0]\n"
+                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+                        "fmla z20.s, z8.s, z1.s[0]\n"
+                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+                        "fmla z17.s, z9.s, z0.s[0]\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+                        "fmla z21.s, z9.s, z1.s[0]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z10.s, z0.s[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla z22.s, z10.s, z1.s[0]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z11.s, z0.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "fmla z23.s, z11.s, z1.s[0]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[1]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla z20.s, z12.s, z1.s[1]\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.s, z13.s, z0.s[1]\n"
+                        "fmla z21.s, z13.s, z1.s[1]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.s, z14.s, z0.s[1]\n"
+                        "fmla z22.s, z14.s, z1.s[1]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.s, z15.s, z0.s[1]\n"
+                        "fmla z23.s, z15.s, z1.s[1]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.s, z8.s, z0.s[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.s, z8.s, z1.s[2]\n"
+                        "fmla z17.s, z9.s, z0.s[2]\n"
+                        "fmla z21.s, z9.s, z1.s[2]\n"
+                        "fmla z18.s, z10.s, z0.s[2]\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z22.s, z10.s, z1.s[2]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z19.s, z11.s, z0.s[2]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z23.s, z11.s, z1.s[2]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[3]\n"
+                        "fmla z20.s, z12.s, z1.s[3]\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.s, z13.s, z0.s[3]\n"
+                        "fmla z21.s, z13.s, z1.s[3]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.s, z14.s, z0.s[3]\n"
+                        "fmla z22.s, z14.s, z1.s[3]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.s, z15.s, z0.s[3]\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "fmla z23.s, z15.s, z1.s[3]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[0]\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+                        "fmla z20.s, z8.s, z5.s[0]\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.s, z9.s, z4.s[0]\n"
+                        "fmla z21.s, z9.s, z5.s[0]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z10.s, z4.s[0]\n"
+                        "fmla z22.s, z10.s, z5.s[0]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z11.s, z4.s[0]\n"
+                        "fmla z23.s, z11.s, z5.s[0]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z4.s[1]\n"
+                        "fmla z20.s, z12.s, z5.s[1]\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.s, z13.s, z4.s[1]\n"
+                        "fmla z21.s, z13.s, z5.s[1]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.s, z14.s, z4.s[1]\n"
+                        "fmla z22.s, z14.s, z5.s[1]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.s, z15.s, z4.s[1]\n"
+                        "fmla z23.s, z15.s, z5.s[1]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.s, z8.s, z5.s[2]\n"
+                        "fmla z17.s, z9.s, z4.s[2]\n"
+                        "fmla z21.s, z9.s, z5.s[2]\n"
+                        "fmla z18.s, z10.s, z4.s[2]\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z22.s, z10.s, z5.s[2]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z19.s, z11.s, z4.s[2]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z23.s, z11.s, z5.s[2]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.s, z12.s, z4.s[3]\n"
+                        "fmla z20.s, z12.s, z5.s[3]\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.s, z13.s, z4.s[3]\n"
+                        "fmla z21.s, z13.s, z5.s[3]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.s, z14.s, z4.s[3]\n"
+                        "fmla z22.s, z14.s, z5.s[3]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.s, z15.s, z4.s[3]\n"
+                        "fmla z23.s, z15.s, z5.s[3]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "b.ne 4b\n"
+                        "3:\n"
+                        "cbz %[regs], 5f\n"
+                        "fmla z16.s, z8.s, z0.s[0]\n"
+                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+                        "fmla z20.s, z8.s, z1.s[0]\n"
+                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+                        "fmla z17.s, z9.s, z0.s[0]\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+                        "fmla z21.s, z9.s, z1.s[0]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z10.s, z0.s[0]\n"
+                        "fmla z22.s, z10.s, z1.s[0]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z11.s, z0.s[0]\n"
+                        "fmla z23.s, z11.s, z1.s[0]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[1]\n"
+                        "fmla z20.s, z12.s, z1.s[1]\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.s, z13.s, z0.s[1]\n"
+                        "fmla z21.s, z13.s, z1.s[1]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.s, z14.s, z0.s[1]\n"
+                        "fmla z22.s, z14.s, z1.s[1]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.s, z15.s, z0.s[1]\n"
+                        "fmla z23.s, z15.s, z1.s[1]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.s, z8.s, z0.s[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.s, z8.s, z1.s[2]\n"
+                        "fmla z17.s, z9.s, z0.s[2]\n"
+                        "fmla z21.s, z9.s, z1.s[2]\n"
+                        "fmla z18.s, z10.s, z0.s[2]\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z22.s, z10.s, z1.s[2]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z19.s, z11.s, z0.s[2]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z23.s, z11.s, z1.s[2]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[3]\n"
+                        "fmla z20.s, z12.s, z1.s[3]\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.s, z13.s, z0.s[3]\n"
+                        "fmla z21.s, z13.s, z1.s[3]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.s, z14.s, z0.s[3]\n"
+                        "fmla z22.s, z14.s, z1.s[3]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.s, z15.s, z0.s[3]\n"
+                        "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+                        "fmla z23.s, z15.s, z1.s[3]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[0]\n"
+                        "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+                        "fmla z20.s, z8.s, z5.s[0]\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.s, z9.s, z4.s[0]\n"
+                        "fmla z21.s, z9.s, z5.s[0]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z10.s, z4.s[0]\n"
+                        "fmla z22.s, z10.s, z5.s[0]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z11.s, z4.s[0]\n"
+                        "fmla z23.s, z11.s, z5.s[0]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z4.s[1]\n"
+                        "fmla z20.s, z12.s, z5.s[1]\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.s, z13.s, z4.s[1]\n"
+                        "fmla z21.s, z13.s, z5.s[1]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.s, z14.s, z4.s[1]\n"
+                        "fmla z22.s, z14.s, z5.s[1]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.s, z15.s, z4.s[1]\n"
+                        "fmla z23.s, z15.s, z5.s[1]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[2]\n"
+                        "fmla z20.s, z8.s, z5.s[2]\n"
+                        "fmla z17.s, z9.s, z4.s[2]\n"
+                        "fmla z21.s, z9.s, z5.s[2]\n"
+                        "fmla z18.s, z10.s, z4.s[2]\n"
+                        "fmla z22.s, z10.s, z5.s[2]\n"
+                        "fmla z19.s, z11.s, z4.s[2]\n"
+                        "fmla z23.s, z11.s, z5.s[2]\n"
+                        "fmla z16.s, z12.s, z4.s[3]\n"
+                        "fmla z20.s, z12.s, z5.s[3]\n"
+                        "fmla z17.s, z13.s, z4.s[3]\n"
+                        "fmla z21.s, z13.s, z5.s[3]\n"
+                        "fmla z18.s, z14.s, z4.s[3]\n"
+                        "fmla z22.s, z14.s, z5.s[3]\n"
+                        "fmla z19.s, z15.s, z4.s[3]\n"
+                        "fmla z23.s, z15.s, z5.s[3]\n"
+                        "cbz %[blocks], 6f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.s, z8.s, z0.s[0]\n"
+                        "fmla z20.s, z8.s, z1.s[0]\n"
+                        "fmla z17.s, z9.s, z0.s[0]\n"
+                        "fmla z21.s, z9.s, z1.s[0]\n"
+                        "fmla z18.s, z10.s, z0.s[0]\n"
+                        "fmla z22.s, z10.s, z1.s[0]\n"
+                        "fmla z19.s, z11.s, z0.s[0]\n"
+                        "fmla z23.s, z11.s, z1.s[0]\n"
+                        "b.eq 6f\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[1]\n"
+                        "fmla z20.s, z12.s, z1.s[1]\n"
+                        "fmla z17.s, z13.s, z0.s[1]\n"
+                        "fmla z21.s, z13.s, z1.s[1]\n"
+                        "fmla z18.s, z14.s, z0.s[1]\n"
+                        "fmla z22.s, z14.s, z1.s[1]\n"
+                        "fmla z19.s, z15.s, z0.s[1]\n"
+                        "fmla z23.s, z15.s, z1.s[1]\n"
+                        "b.eq 6f\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z0.s[2]\n"
+                        "fmla z20.s, z8.s, z1.s[2]\n"
+                        "fmla z17.s, z9.s, z0.s[2]\n"
+                        "fmla z21.s, z9.s, z1.s[2]\n"
+                        "fmla z18.s, z10.s, z0.s[2]\n"
+                        "fmla z22.s, z10.s, z1.s[2]\n"
+                        "fmla z19.s, z11.s, z0.s[2]\n"
+                        "fmla z23.s, z11.s, z1.s[2]\n"
+                        "b 6f\n"
+                        "5:\n"
+                        "fmla z16.s, z8.s, z0.s[0]\n"
+                        "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+                        "fmla z20.s, z8.s, z1.s[0]\n"
+                        "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+                        "fmla z17.s, z9.s, z0.s[0]\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+                        "fmla z21.s, z9.s, z1.s[0]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z10.s, z0.s[0]\n"
+                        "fmla z22.s, z10.s, z1.s[0]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z11.s, z0.s[0]\n"
+                        "fmla z23.s, z11.s, z1.s[0]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[1]\n"
+                        "fmla z20.s, z12.s, z1.s[1]\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.s, z13.s, z0.s[1]\n"
+                        "fmla z21.s, z13.s, z1.s[1]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.s, z14.s, z0.s[1]\n"
+                        "fmla z22.s, z14.s, z1.s[1]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.s, z15.s, z0.s[1]\n"
+                        "fmla z23.s, z15.s, z1.s[1]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.s, z8.s, z0.s[2]\n"
+                        "fmla z20.s, z8.s, z1.s[2]\n"
+                        "fmla z17.s, z9.s, z0.s[2]\n"
+                        "fmla z21.s, z9.s, z1.s[2]\n"
+                        "fmla z18.s, z10.s, z0.s[2]\n"
+                        "fmla z22.s, z10.s, z1.s[2]\n"
+                        "fmla z19.s, z11.s, z0.s[2]\n"
+                        "fmla z23.s, z11.s, z1.s[2]\n"
+                        "fmla z16.s, z12.s, z0.s[3]\n"
+                        "fmla z20.s, z12.s, z1.s[3]\n"
+                        "fmla z17.s, z13.s, z0.s[3]\n"
+                        "fmla z21.s, z13.s, z1.s[3]\n"
+                        "fmla z18.s, z14.s, z0.s[3]\n"
+                        "fmla z22.s, z14.s, z1.s[3]\n"
+                        "fmla z19.s, z15.s, z0.s[3]\n"
+                        "fmla z23.s, z15.s, z1.s[3]\n"
+                        "cbz %[blocks], 6f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[0]\n"
+                        "fmla z20.s, z8.s, z5.s[0]\n"
+                        "fmla z17.s, z9.s, z4.s[0]\n"
+                        "fmla z21.s, z9.s, z5.s[0]\n"
+                        "fmla z18.s, z10.s, z4.s[0]\n"
+                        "fmla z22.s, z10.s, z5.s[0]\n"
+                        "fmla z19.s, z11.s, z4.s[0]\n"
+                        "fmla z23.s, z11.s, z5.s[0]\n"
+                        "b.eq 6f\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.s, z12.s, z4.s[1]\n"
+                        "fmla z20.s, z12.s, z5.s[1]\n"
+                        "fmla z17.s, z13.s, z4.s[1]\n"
+                        "fmla z21.s, z13.s, z5.s[1]\n"
+                        "fmla z18.s, z14.s, z4.s[1]\n"
+                        "fmla z22.s, z14.s, z5.s[1]\n"
+                        "fmla z19.s, z15.s, z4.s[1]\n"
+                        "fmla z23.s, z15.s, z5.s[1]\n"
+                        "b.eq 6f\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[2]\n"
+                        "fmla z20.s, z8.s, z5.s[2]\n"
+                        "fmla z17.s, z9.s, z4.s[2]\n"
+                        "fmla z21.s, z9.s, z5.s[2]\n"
+                        "fmla z18.s, z10.s, z4.s[2]\n"
+                        "fmla z22.s, z10.s, z5.s[2]\n"
+                        "fmla z19.s, z11.s, z4.s[2]\n"
+                        "fmla z23.s, z11.s, z5.s[2]\n"
+                        "6:\n"
+                        "st1w z16.s, p0, [%[c_ptr0]]\n"
+                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        "st1w z20.s, p0, [c_ptr1]\n"
+                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq c_ptr1\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
+                    );
+                    break;
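+                    // Note (descriptive comment, inferred from the instructions below): the
+                    // 3-row variant keeps its C accumulators in z16-z27 (three rows x four
+                    // SVE vectors); p0-p3 predicate the output columns against 'width' and
+                    // p6 predicates the K-tail loads against 'leftovers'.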
+                case 3:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "c_ptr1 .req X2\n"
+                        "c_ptr2 .req X3\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "whilelt p6.s, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "ptrue p7.s\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p2.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p3.s, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.s, #0\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.s, #0\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                        "mov z18.s, #0\n"
+                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                        "mov z19.s, #0\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+                        "mov z20.s, #0\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z21.s, #0\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "mov z22.s, #0\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z23.s, #0\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "mov z24.s, #0\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "mov z25.s, #0\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "mov z26.s, #0\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "mov z27.s, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "b 2f\n"
+                        "1:\n"
+                        "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "ld1w z20.s, p0/z, [c_ptr1]\n"
+                        "fmul z16.s, p7/m, z16.s, z15.s\n"
+                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "fmul z17.s, p7/m, z17.s, z15.s\n"
+                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "fmul z18.s, p7/m, z18.s, z15.s\n"
+                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "fmul z19.s, p7/m, z19.s, z15.s\n"
+                        "ld1w z24.s, p0/z, [c_ptr2]\n"
+                        "fmul z20.s, p7/m, z20.s, z15.s\n"
+                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "fmul z21.s, p7/m, z21.s, z15.s\n"
+                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+                        "fmul z22.s, p7/m, z22.s, z15.s\n"
+                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+                        "fmul z23.s, p7/m, z23.s, z15.s\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                        "fmul z24.s, p7/m, z24.s, z15.s\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                        "fmul z25.s, p7/m, z25.s, z15.s\n"
+                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                        "fmul z26.s, p7/m, z26.s, z15.s\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+                        "fmul z27.s, p7/m, z27.s, z15.s\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "2:\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 3f\n"
+                        "4:\n"
+                        "fmla z16.s, z8.s, z0.s[0]\n"
+                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+                        "fmla z20.s, z8.s, z1.s[0]\n"
+                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+                        "fmla z24.s, z8.s, z2.s[0]\n"
+                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+                        "fmla z17.s, z9.s, z0.s[0]\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+                        "fmla z21.s, z9.s, z1.s[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla z25.s, z9.s, z2.s[0]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z10.s, z0.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "fmla z22.s, z10.s, z1.s[0]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "fmla z26.s, z10.s, z2.s[0]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z11.s, z0.s[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla z23.s, z11.s, z1.s[0]\n"
+                        "fmla z27.s, z11.s, z2.s[0]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[1]\n"
+                        "fmla z20.s, z12.s, z1.s[1]\n"
+                        "fmla z24.s, z12.s, z2.s[1]\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.s, z13.s, z0.s[1]\n"
+                        "fmla z21.s, z13.s, z1.s[1]\n"
+                        "fmla z25.s, z13.s, z2.s[1]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.s, z14.s, z0.s[1]\n"
+                        "fmla z22.s, z14.s, z1.s[1]\n"
+                        "fmla z26.s, z14.s, z2.s[1]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.s, z15.s, z0.s[1]\n"
+                        "fmla z23.s, z15.s, z1.s[1]\n"
+                        "fmla z27.s, z15.s, z2.s[1]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.s, z8.s, z0.s[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.s, z8.s, z1.s[2]\n"
+                        "fmla z24.s, z8.s, z2.s[2]\n"
+                        "fmla z17.s, z9.s, z0.s[2]\n"
+                        "fmla z21.s, z9.s, z1.s[2]\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z25.s, z9.s, z2.s[2]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.s, z10.s, z0.s[2]\n"
+                        "fmla z22.s, z10.s, z1.s[2]\n"
+                        "fmla z26.s, z10.s, z2.s[2]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.s, z11.s, z0.s[2]\n"
+                        "fmla z23.s, z11.s, z1.s[2]\n"
+                        "fmla z27.s, z11.s, z2.s[2]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[3]\n"
+                        "fmla z20.s, z12.s, z1.s[3]\n"
+                        "fmla z24.s, z12.s, z2.s[3]\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.s, z13.s, z0.s[3]\n"
+                        "fmla z21.s, z13.s, z1.s[3]\n"
+                        "fmla z25.s, z13.s, z2.s[3]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.s, z14.s, z0.s[3]\n"
+                        "fmla z22.s, z14.s, z1.s[3]\n"
+                        "fmla z26.s, z14.s, z2.s[3]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.s, z15.s, z0.s[3]\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "fmla z23.s, z15.s, z1.s[3]\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+                        "fmla z27.s, z15.s, z2.s[3]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[0]\n"
+                        "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
+                        "fmla z20.s, z8.s, z5.s[0]\n"
+                        "fmla z24.s, z8.s, z6.s[0]\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.s, z9.s, z4.s[0]\n"
+                        "fmla z21.s, z9.s, z5.s[0]\n"
+                        "fmla z25.s, z9.s, z6.s[0]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z10.s, z4.s[0]\n"
+                        "fmla z22.s, z10.s, z5.s[0]\n"
+                        "fmla z26.s, z10.s, z6.s[0]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z11.s, z4.s[0]\n"
+                        "fmla z23.s, z11.s, z5.s[0]\n"
+                        "fmla z27.s, z11.s, z6.s[0]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z4.s[1]\n"
+                        "fmla z20.s, z12.s, z5.s[1]\n"
+                        "fmla z24.s, z12.s, z6.s[1]\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.s, z13.s, z4.s[1]\n"
+                        "fmla z21.s, z13.s, z5.s[1]\n"
+                        "fmla z25.s, z13.s, z6.s[1]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.s, z14.s, z4.s[1]\n"
+                        "fmla z22.s, z14.s, z5.s[1]\n"
+                        "fmla z26.s, z14.s, z6.s[1]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.s, z15.s, z4.s[1]\n"
+                        "fmla z23.s, z15.s, z5.s[1]\n"
+                        "fmla z27.s, z15.s, z6.s[1]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.s, z8.s, z5.s[2]\n"
+                        "fmla z24.s, z8.s, z6.s[2]\n"
+                        "fmla z17.s, z9.s, z4.s[2]\n"
+                        "fmla z21.s, z9.s, z5.s[2]\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z25.s, z9.s, z6.s[2]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.s, z10.s, z4.s[2]\n"
+                        "fmla z22.s, z10.s, z5.s[2]\n"
+                        "fmla z26.s, z10.s, z6.s[2]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.s, z11.s, z4.s[2]\n"
+                        "fmla z23.s, z11.s, z5.s[2]\n"
+                        "fmla z27.s, z11.s, z6.s[2]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.s, z12.s, z4.s[3]\n"
+                        "fmla z20.s, z12.s, z5.s[3]\n"
+                        "fmla z24.s, z12.s, z6.s[3]\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.s, z13.s, z4.s[3]\n"
+                        "fmla z21.s, z13.s, z5.s[3]\n"
+                        "fmla z25.s, z13.s, z6.s[3]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.s, z14.s, z4.s[3]\n"
+                        "fmla z22.s, z14.s, z5.s[3]\n"
+                        "fmla z26.s, z14.s, z6.s[3]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.s, z15.s, z4.s[3]\n"
+                        "fmla z23.s, z15.s, z5.s[3]\n"
+                        "fmla z27.s, z15.s, z6.s[3]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "b.ne 4b\n"
+                        "3:\n"
+                        "cbz %[regs], 5f\n"
+                        "fmla z16.s, z8.s, z0.s[0]\n"
+                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+                        "fmla z20.s, z8.s, z1.s[0]\n"
+                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+                        "fmla z24.s, z8.s, z2.s[0]\n"
+                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+                        "fmla z17.s, z9.s, z0.s[0]\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+                        "fmla z21.s, z9.s, z1.s[0]\n"
+                        "fmla z25.s, z9.s, z2.s[0]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z10.s, z0.s[0]\n"
+                        "fmla z22.s, z10.s, z1.s[0]\n"
+                        "fmla z26.s, z10.s, z2.s[0]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z11.s, z0.s[0]\n"
+                        "fmla z23.s, z11.s, z1.s[0]\n"
+                        "fmla z27.s, z11.s, z2.s[0]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[1]\n"
+                        "fmla z20.s, z12.s, z1.s[1]\n"
+                        "fmla z24.s, z12.s, z2.s[1]\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.s, z13.s, z0.s[1]\n"
+                        "fmla z21.s, z13.s, z1.s[1]\n"
+                        "fmla z25.s, z13.s, z2.s[1]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.s, z14.s, z0.s[1]\n"
+                        "fmla z22.s, z14.s, z1.s[1]\n"
+                        "fmla z26.s, z14.s, z2.s[1]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.s, z15.s, z0.s[1]\n"
+                        "fmla z23.s, z15.s, z1.s[1]\n"
+                        "fmla z27.s, z15.s, z2.s[1]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.s, z8.s, z0.s[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.s, z8.s, z1.s[2]\n"
+                        "fmla z24.s, z8.s, z2.s[2]\n"
+                        "fmla z17.s, z9.s, z0.s[2]\n"
+                        "fmla z21.s, z9.s, z1.s[2]\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z25.s, z9.s, z2.s[2]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.s, z10.s, z0.s[2]\n"
+                        "fmla z22.s, z10.s, z1.s[2]\n"
+                        "fmla z26.s, z10.s, z2.s[2]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.s, z11.s, z0.s[2]\n"
+                        "fmla z23.s, z11.s, z1.s[2]\n"
+                        "fmla z27.s, z11.s, z2.s[2]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[3]\n"
+                        "fmla z20.s, z12.s, z1.s[3]\n"
+                        "fmla z24.s, z12.s, z2.s[3]\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.s, z13.s, z0.s[3]\n"
+                        "fmla z21.s, z13.s, z1.s[3]\n"
+                        "fmla z25.s, z13.s, z2.s[3]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.s, z14.s, z0.s[3]\n"
+                        "fmla z22.s, z14.s, z1.s[3]\n"
+                        "fmla z26.s, z14.s, z2.s[3]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.s, z15.s, z0.s[3]\n"
+                        "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+                        "fmla z23.s, z15.s, z1.s[3]\n"
+                        "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+                        "fmla z27.s, z15.s, z2.s[3]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[0]\n"
+                        "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+                        "fmla z20.s, z8.s, z5.s[0]\n"
+                        "fmla z24.s, z8.s, z6.s[0]\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.s, z9.s, z4.s[0]\n"
+                        "fmla z21.s, z9.s, z5.s[0]\n"
+                        "fmla z25.s, z9.s, z6.s[0]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z10.s, z4.s[0]\n"
+                        "fmla z22.s, z10.s, z5.s[0]\n"
+                        "fmla z26.s, z10.s, z6.s[0]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z11.s, z4.s[0]\n"
+                        "fmla z23.s, z11.s, z5.s[0]\n"
+                        "fmla z27.s, z11.s, z6.s[0]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z4.s[1]\n"
+                        "fmla z20.s, z12.s, z5.s[1]\n"
+                        "fmla z24.s, z12.s, z6.s[1]\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.s, z13.s, z4.s[1]\n"
+                        "fmla z21.s, z13.s, z5.s[1]\n"
+                        "fmla z25.s, z13.s, z6.s[1]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.s, z14.s, z4.s[1]\n"
+                        "fmla z22.s, z14.s, z5.s[1]\n"
+                        "fmla z26.s, z14.s, z6.s[1]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.s, z15.s, z4.s[1]\n"
+                        "fmla z23.s, z15.s, z5.s[1]\n"
+                        "fmla z27.s, z15.s, z6.s[1]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[2]\n"
+                        "fmla z20.s, z8.s, z5.s[2]\n"
+                        "fmla z24.s, z8.s, z6.s[2]\n"
+                        "fmla z17.s, z9.s, z4.s[2]\n"
+                        "fmla z21.s, z9.s, z5.s[2]\n"
+                        "fmla z25.s, z9.s, z6.s[2]\n"
+                        "fmla z18.s, z10.s, z4.s[2]\n"
+                        "fmla z22.s, z10.s, z5.s[2]\n"
+                        "fmla z26.s, z10.s, z6.s[2]\n"
+                        "fmla z19.s, z11.s, z4.s[2]\n"
+                        "fmla z23.s, z11.s, z5.s[2]\n"
+                        "fmla z27.s, z11.s, z6.s[2]\n"
+                        "fmla z16.s, z12.s, z4.s[3]\n"
+                        "fmla z20.s, z12.s, z5.s[3]\n"
+                        "fmla z24.s, z12.s, z6.s[3]\n"
+                        "fmla z17.s, z13.s, z4.s[3]\n"
+                        "fmla z21.s, z13.s, z5.s[3]\n"
+                        "fmla z25.s, z13.s, z6.s[3]\n"
+                        "fmla z18.s, z14.s, z4.s[3]\n"
+                        "fmla z22.s, z14.s, z5.s[3]\n"
+                        "fmla z26.s, z14.s, z6.s[3]\n"
+                        "fmla z19.s, z15.s, z4.s[3]\n"
+                        "fmla z23.s, z15.s, z5.s[3]\n"
+                        "fmla z27.s, z15.s, z6.s[3]\n"
+                        "cbz %[blocks], 6f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.s, z8.s, z0.s[0]\n"
+                        "fmla z20.s, z8.s, z1.s[0]\n"
+                        "fmla z24.s, z8.s, z2.s[0]\n"
+                        "fmla z17.s, z9.s, z0.s[0]\n"
+                        "fmla z21.s, z9.s, z1.s[0]\n"
+                        "fmla z25.s, z9.s, z2.s[0]\n"
+                        "fmla z18.s, z10.s, z0.s[0]\n"
+                        "fmla z22.s, z10.s, z1.s[0]\n"
+                        "fmla z26.s, z10.s, z2.s[0]\n"
+                        "fmla z19.s, z11.s, z0.s[0]\n"
+                        "fmla z23.s, z11.s, z1.s[0]\n"
+                        "fmla z27.s, z11.s, z2.s[0]\n"
+                        "b.eq 6f\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[1]\n"
+                        "fmla z20.s, z12.s, z1.s[1]\n"
+                        "fmla z24.s, z12.s, z2.s[1]\n"
+                        "fmla z17.s, z13.s, z0.s[1]\n"
+                        "fmla z21.s, z13.s, z1.s[1]\n"
+                        "fmla z25.s, z13.s, z2.s[1]\n"
+                        "fmla z18.s, z14.s, z0.s[1]\n"
+                        "fmla z22.s, z14.s, z1.s[1]\n"
+                        "fmla z26.s, z14.s, z2.s[1]\n"
+                        "fmla z19.s, z15.s, z0.s[1]\n"
+                        "fmla z23.s, z15.s, z1.s[1]\n"
+                        "fmla z27.s, z15.s, z2.s[1]\n"
+                        "b.eq 6f\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z0.s[2]\n"
+                        "fmla z20.s, z8.s, z1.s[2]\n"
+                        "fmla z24.s, z8.s, z2.s[2]\n"
+                        "fmla z17.s, z9.s, z0.s[2]\n"
+                        "fmla z21.s, z9.s, z1.s[2]\n"
+                        "fmla z25.s, z9.s, z2.s[2]\n"
+                        "fmla z18.s, z10.s, z0.s[2]\n"
+                        "fmla z22.s, z10.s, z1.s[2]\n"
+                        "fmla z26.s, z10.s, z2.s[2]\n"
+                        "fmla z19.s, z11.s, z0.s[2]\n"
+                        "fmla z23.s, z11.s, z1.s[2]\n"
+                        "fmla z27.s, z11.s, z2.s[2]\n"
+                        "b 6f\n"
+                        "5:\n"
+                        "fmla z16.s, z8.s, z0.s[0]\n"
+                        "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+                        "fmla z20.s, z8.s, z1.s[0]\n"
+                        "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+                        "fmla z24.s, z8.s, z2.s[0]\n"
+                        "ld1rqw z6.s, p6/z, [a_ptr2]\n"
+                        "fmla z17.s, z9.s, z0.s[0]\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+                        "fmla z21.s, z9.s, z1.s[0]\n"
+                        "fmla z25.s, z9.s, z2.s[0]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z10.s, z0.s[0]\n"
+                        "fmla z22.s, z10.s, z1.s[0]\n"
+                        "fmla z26.s, z10.s, z2.s[0]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z11.s, z0.s[0]\n"
+                        "fmla z23.s, z11.s, z1.s[0]\n"
+                        "fmla z27.s, z11.s, z2.s[0]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[1]\n"
+                        "fmla z20.s, z12.s, z1.s[1]\n"
+                        "fmla z24.s, z12.s, z2.s[1]\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.s, z13.s, z0.s[1]\n"
+                        "fmla z21.s, z13.s, z1.s[1]\n"
+                        "fmla z25.s, z13.s, z2.s[1]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.s, z14.s, z0.s[1]\n"
+                        "fmla z22.s, z14.s, z1.s[1]\n"
+                        "fmla z26.s, z14.s, z2.s[1]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.s, z15.s, z0.s[1]\n"
+                        "fmla z23.s, z15.s, z1.s[1]\n"
+                        "fmla z27.s, z15.s, z2.s[1]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.s, z8.s, z0.s[2]\n"
+                        "fmla z20.s, z8.s, z1.s[2]\n"
+                        "fmla z24.s, z8.s, z2.s[2]\n"
+                        "fmla z17.s, z9.s, z0.s[2]\n"
+                        "fmla z21.s, z9.s, z1.s[2]\n"
+                        "fmla z25.s, z9.s, z2.s[2]\n"
+                        "fmla z18.s, z10.s, z0.s[2]\n"
+                        "fmla z22.s, z10.s, z1.s[2]\n"
+                        "fmla z26.s, z10.s, z2.s[2]\n"
+                        "fmla z19.s, z11.s, z0.s[2]\n"
+                        "fmla z23.s, z11.s, z1.s[2]\n"
+                        "fmla z27.s, z11.s, z2.s[2]\n"
+                        "fmla z16.s, z12.s, z0.s[3]\n"
+                        "fmla z20.s, z12.s, z1.s[3]\n"
+                        "fmla z24.s, z12.s, z2.s[3]\n"
+                        "fmla z17.s, z13.s, z0.s[3]\n"
+                        "fmla z21.s, z13.s, z1.s[3]\n"
+                        "fmla z25.s, z13.s, z2.s[3]\n"
+                        "fmla z18.s, z14.s, z0.s[3]\n"
+                        "fmla z22.s, z14.s, z1.s[3]\n"
+                        "fmla z26.s, z14.s, z2.s[3]\n"
+                        "fmla z19.s, z15.s, z0.s[3]\n"
+                        "fmla z23.s, z15.s, z1.s[3]\n"
+                        "fmla z27.s, z15.s, z2.s[3]\n"
+                        "cbz %[blocks], 6f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[0]\n"
+                        "fmla z20.s, z8.s, z5.s[0]\n"
+                        "fmla z24.s, z8.s, z6.s[0]\n"
+                        "fmla z17.s, z9.s, z4.s[0]\n"
+                        "fmla z21.s, z9.s, z5.s[0]\n"
+                        "fmla z25.s, z9.s, z6.s[0]\n"
+                        "fmla z18.s, z10.s, z4.s[0]\n"
+                        "fmla z22.s, z10.s, z5.s[0]\n"
+                        "fmla z26.s, z10.s, z6.s[0]\n"
+                        "fmla z19.s, z11.s, z4.s[0]\n"
+                        "fmla z23.s, z11.s, z5.s[0]\n"
+                        "fmla z27.s, z11.s, z6.s[0]\n"
+                        "b.eq 6f\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.s, z12.s, z4.s[1]\n"
+                        "fmla z20.s, z12.s, z5.s[1]\n"
+                        "fmla z24.s, z12.s, z6.s[1]\n"
+                        "fmla z17.s, z13.s, z4.s[1]\n"
+                        "fmla z21.s, z13.s, z5.s[1]\n"
+                        "fmla z25.s, z13.s, z6.s[1]\n"
+                        "fmla z18.s, z14.s, z4.s[1]\n"
+                        "fmla z22.s, z14.s, z5.s[1]\n"
+                        "fmla z26.s, z14.s, z6.s[1]\n"
+                        "fmla z19.s, z15.s, z4.s[1]\n"
+                        "fmla z23.s, z15.s, z5.s[1]\n"
+                        "fmla z27.s, z15.s, z6.s[1]\n"
+                        "b.eq 6f\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[2]\n"
+                        "fmla z20.s, z8.s, z5.s[2]\n"
+                        "fmla z24.s, z8.s, z6.s[2]\n"
+                        "fmla z17.s, z9.s, z4.s[2]\n"
+                        "fmla z21.s, z9.s, z5.s[2]\n"
+                        "fmla z25.s, z9.s, z6.s[2]\n"
+                        "fmla z18.s, z10.s, z4.s[2]\n"
+                        "fmla z22.s, z10.s, z5.s[2]\n"
+                        "fmla z26.s, z10.s, z6.s[2]\n"
+                        "fmla z19.s, z11.s, z4.s[2]\n"
+                        "fmla z23.s, z11.s, z5.s[2]\n"
+                        "fmla z27.s, z11.s, z6.s[2]\n"
+                        "6:\n"
+                        "st1w z16.s, p0, [%[c_ptr0]]\n"
+                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        "st1w z20.s, p0, [c_ptr1]\n"
+                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+                        "st1w z24.s, p0, [c_ptr2]\n"
+                        "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+                        "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+                        "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
+                    );
+                    break;
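+                // Note (descriptive comment, inferred from the instructions below): the
+                // 4-row variant, which is also the default case, uses the full z16-z31
+                // range for C accumulators; the fourth A row is streamed through z3/z7.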
+                default:
+                case 4:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "a_ptr3 .req X2\n"
+                        "c_ptr1 .req X3\n"
+                        "c_ptr2 .req X4\n"
+                        "c_ptr3 .req X5\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "whilelt p6.s, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "ptrue p7.s\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "add a_ptr3, a_ptr2, %[lda]\n"
+                        "add c_ptr3, c_ptr2, %[ldc]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p2.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p3.s, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.s, #0\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.s, #0\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                        "mov z18.s, #0\n"
+                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                        "mov z19.s, #0\n"
+                        "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                        "mov z20.s, #0\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+                        "mov z21.s, #0\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z22.s, #0\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "mov z23.s, #0\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z24.s, #0\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "mov z25.s, #0\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "mov z26.s, #0\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "mov z27.s, #0\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "mov z28.s, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "mov z29.s, #0\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "mov z30.s, #0\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "mov z31.s, #0\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "b 2f\n"
+                        "1:\n"
+                        "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "ld1w z20.s, p0/z, [c_ptr1]\n"
+                        "fmul z16.s, p7/m, z16.s, z15.s\n"
+                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "fmul z17.s, p7/m, z17.s, z15.s\n"
+                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "fmul z18.s, p7/m, z18.s, z15.s\n"
+                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "fmul z19.s, p7/m, z19.s, z15.s\n"
+                        "ld1w z24.s, p0/z, [c_ptr2]\n"
+                        "fmul z20.s, p7/m, z20.s, z15.s\n"
+                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "fmul z21.s, p7/m, z21.s, z15.s\n"
+                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+                        "fmul z22.s, p7/m, z22.s, z15.s\n"
+                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+                        "fmul z23.s, p7/m, z23.s, z15.s\n"
+                        "ld1w z28.s, p0/z, [c_ptr3]\n"
+                        "fmul z24.s, p7/m, z24.s, z15.s\n"
+                        "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+                        "fmul z25.s, p7/m, z25.s, z15.s\n"
+                        "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
+                        "fmul z26.s, p7/m, z26.s, z15.s\n"
+                        "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
+                        "fmul z27.s, p7/m, z27.s, z15.s\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                        "fmul z28.s, p7/m, z28.s, z15.s\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                        "fmul z29.s, p7/m, z29.s, z15.s\n"
+                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                        "fmul z30.s, p7/m, z30.s, z15.s\n"
+                        "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                        "fmul z31.s, p7/m, z31.s, z15.s\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "2:\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 3f\n"
+                        "4:\n"
+                        "fmla z16.s, z8.s, z0.s[0]\n"
+                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+                        "fmla z20.s, z8.s, z1.s[0]\n"
+                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+                        "fmla z24.s, z8.s, z2.s[0]\n"
+                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+                        "fmla z28.s, z8.s, z3.s[0]\n"
+                        "ld1rqw z7.s, p7/z, [a_ptr3]\n"
+                        "fmla z17.s, z9.s, z0.s[0]\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+                        "fmla z21.s, z9.s, z1.s[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla z25.s, z9.s, z2.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "fmla z29.s, z9.s, z3.s[0]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z10.s, z0.s[0]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "fmla z22.s, z10.s, z1.s[0]\n"
+                        "add a_ptr3, a_ptr3, #0x20\n"
+                        "fmla z26.s, z10.s, z2.s[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla z30.s, z10.s, z3.s[0]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z11.s, z0.s[0]\n"
+                        "fmla z23.s, z11.s, z1.s[0]\n"
+                        "fmla z27.s, z11.s, z2.s[0]\n"
+                        "fmla z31.s, z11.s, z3.s[0]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[1]\n"
+                        "fmla z20.s, z12.s, z1.s[1]\n"
+                        "fmla z24.s, z12.s, z2.s[1]\n"
+                        "fmla z28.s, z12.s, z3.s[1]\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.s, z13.s, z0.s[1]\n"
+                        "fmla z21.s, z13.s, z1.s[1]\n"
+                        "fmla z25.s, z13.s, z2.s[1]\n"
+                        "fmla z29.s, z13.s, z3.s[1]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.s, z14.s, z0.s[1]\n"
+                        "fmla z22.s, z14.s, z1.s[1]\n"
+                        "fmla z26.s, z14.s, z2.s[1]\n"
+                        "fmla z30.s, z14.s, z3.s[1]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.s, z15.s, z0.s[1]\n"
+                        "fmla z23.s, z15.s, z1.s[1]\n"
+                        "fmla z27.s, z15.s, z2.s[1]\n"
+                        "fmla z31.s, z15.s, z3.s[1]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.s, z8.s, z0.s[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.s, z8.s, z1.s[2]\n"
+                        "fmla z24.s, z8.s, z2.s[2]\n"
+                        "fmla z28.s, z8.s, z3.s[2]\n"
+                        "fmla z17.s, z9.s, z0.s[2]\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z21.s, z9.s, z1.s[2]\n"
+                        "fmla z25.s, z9.s, z2.s[2]\n"
+                        "fmla z29.s, z9.s, z3.s[2]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.s, z10.s, z0.s[2]\n"
+                        "fmla z22.s, z10.s, z1.s[2]\n"
+                        "fmla z26.s, z10.s, z2.s[2]\n"
+                        "fmla z30.s, z10.s, z3.s[2]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.s, z11.s, z0.s[2]\n"
+                        "fmla z23.s, z11.s, z1.s[2]\n"
+                        "fmla z27.s, z11.s, z2.s[2]\n"
+                        "fmla z31.s, z11.s, z3.s[2]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[3]\n"
+                        "fmla z20.s, z12.s, z1.s[3]\n"
+                        "fmla z24.s, z12.s, z2.s[3]\n"
+                        "fmla z28.s, z12.s, z3.s[3]\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.s, z13.s, z0.s[3]\n"
+                        "fmla z21.s, z13.s, z1.s[3]\n"
+                        "fmla z25.s, z13.s, z2.s[3]\n"
+                        "fmla z29.s, z13.s, z3.s[3]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.s, z14.s, z0.s[3]\n"
+                        "fmla z22.s, z14.s, z1.s[3]\n"
+                        "fmla z26.s, z14.s, z2.s[3]\n"
+                        "fmla z30.s, z14.s, z3.s[3]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.s, z15.s, z0.s[3]\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "fmla z23.s, z15.s, z1.s[3]\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+                        "fmla z27.s, z15.s, z2.s[3]\n"
+                        "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
+                        "fmla z31.s, z15.s, z3.s[3]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[0]\n"
+                        "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n"
+                        "fmla z20.s, z8.s, z5.s[0]\n"
+                        "fmla z24.s, z8.s, z6.s[0]\n"
+                        "fmla z28.s, z8.s, z7.s[0]\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.s, z9.s, z4.s[0]\n"
+                        "fmla z21.s, z9.s, z5.s[0]\n"
+                        "fmla z25.s, z9.s, z6.s[0]\n"
+                        "fmla z29.s, z9.s, z7.s[0]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z10.s, z4.s[0]\n"
+                        "fmla z22.s, z10.s, z5.s[0]\n"
+                        "fmla z26.s, z10.s, z6.s[0]\n"
+                        "fmla z30.s, z10.s, z7.s[0]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z11.s, z4.s[0]\n"
+                        "fmla z23.s, z11.s, z5.s[0]\n"
+                        "fmla z27.s, z11.s, z6.s[0]\n"
+                        "fmla z31.s, z11.s, z7.s[0]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z4.s[1]\n"
+                        "fmla z20.s, z12.s, z5.s[1]\n"
+                        "fmla z24.s, z12.s, z6.s[1]\n"
+                        "fmla z28.s, z12.s, z7.s[1]\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.s, z13.s, z4.s[1]\n"
+                        "fmla z21.s, z13.s, z5.s[1]\n"
+                        "fmla z25.s, z13.s, z6.s[1]\n"
+                        "fmla z29.s, z13.s, z7.s[1]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.s, z14.s, z4.s[1]\n"
+                        "fmla z22.s, z14.s, z5.s[1]\n"
+                        "fmla z26.s, z14.s, z6.s[1]\n"
+                        "fmla z30.s, z14.s, z7.s[1]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.s, z15.s, z4.s[1]\n"
+                        "fmla z23.s, z15.s, z5.s[1]\n"
+                        "fmla z27.s, z15.s, z6.s[1]\n"
+                        "fmla z31.s, z15.s, z7.s[1]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.s, z8.s, z5.s[2]\n"
+                        "fmla z24.s, z8.s, z6.s[2]\n"
+                        "fmla z28.s, z8.s, z7.s[2]\n"
+                        "fmla z17.s, z9.s, z4.s[2]\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z21.s, z9.s, z5.s[2]\n"
+                        "fmla z25.s, z9.s, z6.s[2]\n"
+                        "fmla z29.s, z9.s, z7.s[2]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.s, z10.s, z4.s[2]\n"
+                        "fmla z22.s, z10.s, z5.s[2]\n"
+                        "fmla z26.s, z10.s, z6.s[2]\n"
+                        "fmla z30.s, z10.s, z7.s[2]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.s, z11.s, z4.s[2]\n"
+                        "fmla z23.s, z11.s, z5.s[2]\n"
+                        "fmla z27.s, z11.s, z6.s[2]\n"
+                        "fmla z31.s, z11.s, z7.s[2]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.s, z12.s, z4.s[3]\n"
+                        "fmla z20.s, z12.s, z5.s[3]\n"
+                        "fmla z24.s, z12.s, z6.s[3]\n"
+                        "fmla z28.s, z12.s, z7.s[3]\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.s, z13.s, z4.s[3]\n"
+                        "fmla z21.s, z13.s, z5.s[3]\n"
+                        "fmla z25.s, z13.s, z6.s[3]\n"
+                        "fmla z29.s, z13.s, z7.s[3]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.s, z14.s, z4.s[3]\n"
+                        "fmla z22.s, z14.s, z5.s[3]\n"
+                        "fmla z26.s, z14.s, z6.s[3]\n"
+                        "fmla z30.s, z14.s, z7.s[3]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.s, z15.s, z4.s[3]\n"
+                        "fmla z23.s, z15.s, z5.s[3]\n"
+                        "fmla z27.s, z15.s, z6.s[3]\n"
+                        "fmla z31.s, z15.s, z7.s[3]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "b.ne 4b\n"
+                        "3:\n"
+                        "cbz %[regs], 5f\n"
+                        "fmla z16.s, z8.s, z0.s[0]\n"
+                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+                        "fmla z20.s, z8.s, z1.s[0]\n"
+                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+                        "fmla z24.s, z8.s, z2.s[0]\n"
+                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+                        "fmla z28.s, z8.s, z3.s[0]\n"
+                        "ld1rqw z7.s, p7/z, [a_ptr3]\n"
+                        "fmla z17.s, z9.s, z0.s[0]\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+                        "fmla z21.s, z9.s, z1.s[0]\n"
+                        "fmla z25.s, z9.s, z2.s[0]\n"
+                        "fmla z29.s, z9.s, z3.s[0]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z10.s, z0.s[0]\n"
+                        "fmla z22.s, z10.s, z1.s[0]\n"
+                        "fmla z26.s, z10.s, z2.s[0]\n"
+                        "fmla z30.s, z10.s, z3.s[0]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z11.s, z0.s[0]\n"
+                        "fmla z23.s, z11.s, z1.s[0]\n"
+                        "fmla z27.s, z11.s, z2.s[0]\n"
+                        "fmla z31.s, z11.s, z3.s[0]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[1]\n"
+                        "fmla z20.s, z12.s, z1.s[1]\n"
+                        "fmla z24.s, z12.s, z2.s[1]\n"
+                        "fmla z28.s, z12.s, z3.s[1]\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.s, z13.s, z0.s[1]\n"
+                        "fmla z21.s, z13.s, z1.s[1]\n"
+                        "fmla z25.s, z13.s, z2.s[1]\n"
+                        "fmla z29.s, z13.s, z3.s[1]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.s, z14.s, z0.s[1]\n"
+                        "fmla z22.s, z14.s, z1.s[1]\n"
+                        "fmla z26.s, z14.s, z2.s[1]\n"
+                        "fmla z30.s, z14.s, z3.s[1]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.s, z15.s, z0.s[1]\n"
+                        "fmla z23.s, z15.s, z1.s[1]\n"
+                        "fmla z27.s, z15.s, z2.s[1]\n"
+                        "fmla z31.s, z15.s, z3.s[1]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.s, z8.s, z0.s[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.s, z8.s, z1.s[2]\n"
+                        "fmla z24.s, z8.s, z2.s[2]\n"
+                        "fmla z28.s, z8.s, z3.s[2]\n"
+                        "fmla z17.s, z9.s, z0.s[2]\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z21.s, z9.s, z1.s[2]\n"
+                        "fmla z25.s, z9.s, z2.s[2]\n"
+                        "fmla z29.s, z9.s, z3.s[2]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.s, z10.s, z0.s[2]\n"
+                        "fmla z22.s, z10.s, z1.s[2]\n"
+                        "fmla z26.s, z10.s, z2.s[2]\n"
+                        "fmla z30.s, z10.s, z3.s[2]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.s, z11.s, z0.s[2]\n"
+                        "fmla z23.s, z11.s, z1.s[2]\n"
+                        "fmla z27.s, z11.s, z2.s[2]\n"
+                        "fmla z31.s, z11.s, z3.s[2]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[3]\n"
+                        "fmla z20.s, z12.s, z1.s[3]\n"
+                        "fmla z24.s, z12.s, z2.s[3]\n"
+                        "fmla z28.s, z12.s, z3.s[3]\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.s, z13.s, z0.s[3]\n"
+                        "fmla z21.s, z13.s, z1.s[3]\n"
+                        "fmla z25.s, z13.s, z2.s[3]\n"
+                        "fmla z29.s, z13.s, z3.s[3]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.s, z14.s, z0.s[3]\n"
+                        "fmla z22.s, z14.s, z1.s[3]\n"
+                        "fmla z26.s, z14.s, z2.s[3]\n"
+                        "fmla z30.s, z14.s, z3.s[3]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.s, z15.s, z0.s[3]\n"
+                        "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+                        "fmla z23.s, z15.s, z1.s[3]\n"
+                        "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+                        "fmla z27.s, z15.s, z2.s[3]\n"
+                        "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+                        "fmla z31.s, z15.s, z3.s[3]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[0]\n"
+                        "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+                        "fmla z20.s, z8.s, z5.s[0]\n"
+                        "fmla z24.s, z8.s, z6.s[0]\n"
+                        "fmla z28.s, z8.s, z7.s[0]\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.s, z9.s, z4.s[0]\n"
+                        "fmla z21.s, z9.s, z5.s[0]\n"
+                        "fmla z25.s, z9.s, z6.s[0]\n"
+                        "fmla z29.s, z9.s, z7.s[0]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z10.s, z4.s[0]\n"
+                        "fmla z22.s, z10.s, z5.s[0]\n"
+                        "fmla z26.s, z10.s, z6.s[0]\n"
+                        "fmla z30.s, z10.s, z7.s[0]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z11.s, z4.s[0]\n"
+                        "fmla z23.s, z11.s, z5.s[0]\n"
+                        "fmla z27.s, z11.s, z6.s[0]\n"
+                        "fmla z31.s, z11.s, z7.s[0]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z4.s[1]\n"
+                        "fmla z20.s, z12.s, z5.s[1]\n"
+                        "fmla z24.s, z12.s, z6.s[1]\n"
+                        "fmla z28.s, z12.s, z7.s[1]\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.s, z13.s, z4.s[1]\n"
+                        "fmla z21.s, z13.s, z5.s[1]\n"
+                        "fmla z25.s, z13.s, z6.s[1]\n"
+                        "fmla z29.s, z13.s, z7.s[1]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.s, z14.s, z4.s[1]\n"
+                        "fmla z22.s, z14.s, z5.s[1]\n"
+                        "fmla z26.s, z14.s, z6.s[1]\n"
+                        "fmla z30.s, z14.s, z7.s[1]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.s, z15.s, z4.s[1]\n"
+                        "fmla z23.s, z15.s, z5.s[1]\n"
+                        "fmla z27.s, z15.s, z6.s[1]\n"
+                        "fmla z31.s, z15.s, z7.s[1]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[2]\n"
+                        "fmla z20.s, z8.s, z5.s[2]\n"
+                        "fmla z24.s, z8.s, z6.s[2]\n"
+                        "fmla z28.s, z8.s, z7.s[2]\n"
+                        "fmla z17.s, z9.s, z4.s[2]\n"
+                        "fmla z21.s, z9.s, z5.s[2]\n"
+                        "fmla z25.s, z9.s, z6.s[2]\n"
+                        "fmla z29.s, z9.s, z7.s[2]\n"
+                        "fmla z18.s, z10.s, z4.s[2]\n"
+                        "fmla z22.s, z10.s, z5.s[2]\n"
+                        "fmla z26.s, z10.s, z6.s[2]\n"
+                        "fmla z30.s, z10.s, z7.s[2]\n"
+                        "fmla z19.s, z11.s, z4.s[2]\n"
+                        "fmla z23.s, z11.s, z5.s[2]\n"
+                        "fmla z27.s, z11.s, z6.s[2]\n"
+                        "fmla z31.s, z11.s, z7.s[2]\n"
+                        "fmla z16.s, z12.s, z4.s[3]\n"
+                        "fmla z20.s, z12.s, z5.s[3]\n"
+                        "fmla z24.s, z12.s, z6.s[3]\n"
+                        "fmla z28.s, z12.s, z7.s[3]\n"
+                        "fmla z17.s, z13.s, z4.s[3]\n"
+                        "fmla z21.s, z13.s, z5.s[3]\n"
+                        "fmla z25.s, z13.s, z6.s[3]\n"
+                        "fmla z29.s, z13.s, z7.s[3]\n"
+                        "fmla z18.s, z14.s, z4.s[3]\n"
+                        "fmla z22.s, z14.s, z5.s[3]\n"
+                        "fmla z26.s, z14.s, z6.s[3]\n"
+                        "fmla z30.s, z14.s, z7.s[3]\n"
+                        "fmla z19.s, z15.s, z4.s[3]\n"
+                        "fmla z23.s, z15.s, z5.s[3]\n"
+                        "fmla z27.s, z15.s, z6.s[3]\n"
+                        "fmla z31.s, z15.s, z7.s[3]\n"
+                        "cbz %[blocks], 6f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.s, z8.s, z0.s[0]\n"
+                        "fmla z20.s, z8.s, z1.s[0]\n"
+                        "fmla z24.s, z8.s, z2.s[0]\n"
+                        "fmla z28.s, z8.s, z3.s[0]\n"
+                        "fmla z17.s, z9.s, z0.s[0]\n"
+                        "fmla z21.s, z9.s, z1.s[0]\n"
+                        "fmla z25.s, z9.s, z2.s[0]\n"
+                        "fmla z29.s, z9.s, z3.s[0]\n"
+                        "fmla z18.s, z10.s, z0.s[0]\n"
+                        "fmla z22.s, z10.s, z1.s[0]\n"
+                        "fmla z26.s, z10.s, z2.s[0]\n"
+                        "fmla z30.s, z10.s, z3.s[0]\n"
+                        "fmla z19.s, z11.s, z0.s[0]\n"
+                        "fmla z23.s, z11.s, z1.s[0]\n"
+                        "fmla z27.s, z11.s, z2.s[0]\n"
+                        "fmla z31.s, z11.s, z3.s[0]\n"
+                        "b.eq 6f\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[1]\n"
+                        "fmla z20.s, z12.s, z1.s[1]\n"
+                        "fmla z24.s, z12.s, z2.s[1]\n"
+                        "fmla z28.s, z12.s, z3.s[1]\n"
+                        "fmla z17.s, z13.s, z0.s[1]\n"
+                        "fmla z21.s, z13.s, z1.s[1]\n"
+                        "fmla z25.s, z13.s, z2.s[1]\n"
+                        "fmla z29.s, z13.s, z3.s[1]\n"
+                        "fmla z18.s, z14.s, z0.s[1]\n"
+                        "fmla z22.s, z14.s, z1.s[1]\n"
+                        "fmla z26.s, z14.s, z2.s[1]\n"
+                        "fmla z30.s, z14.s, z3.s[1]\n"
+                        "fmla z19.s, z15.s, z0.s[1]\n"
+                        "fmla z23.s, z15.s, z1.s[1]\n"
+                        "fmla z27.s, z15.s, z2.s[1]\n"
+                        "fmla z31.s, z15.s, z3.s[1]\n"
+                        "b.eq 6f\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z0.s[2]\n"
+                        "fmla z20.s, z8.s, z1.s[2]\n"
+                        "fmla z24.s, z8.s, z2.s[2]\n"
+                        "fmla z28.s, z8.s, z3.s[2]\n"
+                        "fmla z17.s, z9.s, z0.s[2]\n"
+                        "fmla z21.s, z9.s, z1.s[2]\n"
+                        "fmla z25.s, z9.s, z2.s[2]\n"
+                        "fmla z29.s, z9.s, z3.s[2]\n"
+                        "fmla z18.s, z10.s, z0.s[2]\n"
+                        "fmla z22.s, z10.s, z1.s[2]\n"
+                        "fmla z26.s, z10.s, z2.s[2]\n"
+                        "fmla z30.s, z10.s, z3.s[2]\n"
+                        "fmla z19.s, z11.s, z0.s[2]\n"
+                        "fmla z23.s, z11.s, z1.s[2]\n"
+                        "fmla z27.s, z11.s, z2.s[2]\n"
+                        "fmla z31.s, z11.s, z3.s[2]\n"
+                        "b 6f\n"
+                        "5:\n"
+                        "fmla z16.s, z8.s, z0.s[0]\n"
+                        "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+                        "fmla z20.s, z8.s, z1.s[0]\n"
+                        "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+                        "fmla z24.s, z8.s, z2.s[0]\n"
+                        "ld1rqw z6.s, p6/z, [a_ptr2]\n"
+                        "fmla z28.s, z8.s, z3.s[0]\n"
+                        "ld1rqw z7.s, p6/z, [a_ptr3]\n"
+                        "fmla z17.s, z9.s, z0.s[0]\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+                        "fmla z21.s, z9.s, z1.s[0]\n"
+                        "fmla z25.s, z9.s, z2.s[0]\n"
+                        "fmla z29.s, z9.s, z3.s[0]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z10.s, z0.s[0]\n"
+                        "fmla z22.s, z10.s, z1.s[0]\n"
+                        "fmla z26.s, z10.s, z2.s[0]\n"
+                        "fmla z30.s, z10.s, z3.s[0]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z11.s, z0.s[0]\n"
+                        "fmla z23.s, z11.s, z1.s[0]\n"
+                        "fmla z27.s, z11.s, z2.s[0]\n"
+                        "fmla z31.s, z11.s, z3.s[0]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[1]\n"
+                        "fmla z20.s, z12.s, z1.s[1]\n"
+                        "fmla z24.s, z12.s, z2.s[1]\n"
+                        "fmla z28.s, z12.s, z3.s[1]\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.s, z13.s, z0.s[1]\n"
+                        "fmla z21.s, z13.s, z1.s[1]\n"
+                        "fmla z25.s, z13.s, z2.s[1]\n"
+                        "fmla z29.s, z13.s, z3.s[1]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.s, z14.s, z0.s[1]\n"
+                        "fmla z22.s, z14.s, z1.s[1]\n"
+                        "fmla z26.s, z14.s, z2.s[1]\n"
+                        "fmla z30.s, z14.s, z3.s[1]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.s, z15.s, z0.s[1]\n"
+                        "fmla z23.s, z15.s, z1.s[1]\n"
+                        "fmla z27.s, z15.s, z2.s[1]\n"
+                        "fmla z31.s, z15.s, z3.s[1]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.s, z8.s, z0.s[2]\n"
+                        "fmla z20.s, z8.s, z1.s[2]\n"
+                        "fmla z24.s, z8.s, z2.s[2]\n"
+                        "fmla z28.s, z8.s, z3.s[2]\n"
+                        "fmla z17.s, z9.s, z0.s[2]\n"
+                        "fmla z21.s, z9.s, z1.s[2]\n"
+                        "fmla z25.s, z9.s, z2.s[2]\n"
+                        "fmla z29.s, z9.s, z3.s[2]\n"
+                        "fmla z18.s, z10.s, z0.s[2]\n"
+                        "fmla z22.s, z10.s, z1.s[2]\n"
+                        "fmla z26.s, z10.s, z2.s[2]\n"
+                        "fmla z30.s, z10.s, z3.s[2]\n"
+                        "fmla z19.s, z11.s, z0.s[2]\n"
+                        "fmla z23.s, z11.s, z1.s[2]\n"
+                        "fmla z27.s, z11.s, z2.s[2]\n"
+                        "fmla z31.s, z11.s, z3.s[2]\n"
+                        "fmla z16.s, z12.s, z0.s[3]\n"
+                        "fmla z20.s, z12.s, z1.s[3]\n"
+                        "fmla z24.s, z12.s, z2.s[3]\n"
+                        "fmla z28.s, z12.s, z3.s[3]\n"
+                        "fmla z17.s, z13.s, z0.s[3]\n"
+                        "fmla z21.s, z13.s, z1.s[3]\n"
+                        "fmla z25.s, z13.s, z2.s[3]\n"
+                        "fmla z29.s, z13.s, z3.s[3]\n"
+                        "fmla z18.s, z14.s, z0.s[3]\n"
+                        "fmla z22.s, z14.s, z1.s[3]\n"
+                        "fmla z26.s, z14.s, z2.s[3]\n"
+                        "fmla z30.s, z14.s, z3.s[3]\n"
+                        "fmla z19.s, z15.s, z0.s[3]\n"
+                        "fmla z23.s, z15.s, z1.s[3]\n"
+                        "fmla z27.s, z15.s, z2.s[3]\n"
+                        "fmla z31.s, z15.s, z3.s[3]\n"
+                        "cbz %[blocks], 6f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[0]\n"
+                        "fmla z20.s, z8.s, z5.s[0]\n"
+                        "fmla z24.s, z8.s, z6.s[0]\n"
+                        "fmla z28.s, z8.s, z7.s[0]\n"
+                        "fmla z17.s, z9.s, z4.s[0]\n"
+                        "fmla z21.s, z9.s, z5.s[0]\n"
+                        "fmla z25.s, z9.s, z6.s[0]\n"
+                        "fmla z29.s, z9.s, z7.s[0]\n"
+                        "fmla z18.s, z10.s, z4.s[0]\n"
+                        "fmla z22.s, z10.s, z5.s[0]\n"
+                        "fmla z26.s, z10.s, z6.s[0]\n"
+                        "fmla z30.s, z10.s, z7.s[0]\n"
+                        "fmla z19.s, z11.s, z4.s[0]\n"
+                        "fmla z23.s, z11.s, z5.s[0]\n"
+                        "fmla z27.s, z11.s, z6.s[0]\n"
+                        "fmla z31.s, z11.s, z7.s[0]\n"
+                        "b.eq 6f\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.s, z12.s, z4.s[1]\n"
+                        "fmla z20.s, z12.s, z5.s[1]\n"
+                        "fmla z24.s, z12.s, z6.s[1]\n"
+                        "fmla z28.s, z12.s, z7.s[1]\n"
+                        "fmla z17.s, z13.s, z4.s[1]\n"
+                        "fmla z21.s, z13.s, z5.s[1]\n"
+                        "fmla z25.s, z13.s, z6.s[1]\n"
+                        "fmla z29.s, z13.s, z7.s[1]\n"
+                        "fmla z18.s, z14.s, z4.s[1]\n"
+                        "fmla z22.s, z14.s, z5.s[1]\n"
+                        "fmla z26.s, z14.s, z6.s[1]\n"
+                        "fmla z30.s, z14.s, z7.s[1]\n"
+                        "fmla z19.s, z15.s, z4.s[1]\n"
+                        "fmla z23.s, z15.s, z5.s[1]\n"
+                        "fmla z27.s, z15.s, z6.s[1]\n"
+                        "fmla z31.s, z15.s, z7.s[1]\n"
+                        "b.eq 6f\n"
+                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[2]\n"
+                        "fmla z20.s, z8.s, z5.s[2]\n"
+                        "fmla z24.s, z8.s, z6.s[2]\n"
+                        "fmla z28.s, z8.s, z7.s[2]\n"
+                        "fmla z17.s, z9.s, z4.s[2]\n"
+                        "fmla z21.s, z9.s, z5.s[2]\n"
+                        "fmla z25.s, z9.s, z6.s[2]\n"
+                        "fmla z29.s, z9.s, z7.s[2]\n"
+                        "fmla z18.s, z10.s, z4.s[2]\n"
+                        "fmla z22.s, z10.s, z5.s[2]\n"
+                        "fmla z26.s, z10.s, z6.s[2]\n"
+                        "fmla z30.s, z10.s, z7.s[2]\n"
+                        "fmla z19.s, z11.s, z4.s[2]\n"
+                        "fmla z23.s, z11.s, z5.s[2]\n"
+                        "fmla z27.s, z11.s, z6.s[2]\n"
+                        "fmla z31.s, z11.s, z7.s[2]\n"
+                        "6:\n"
+                        "st1w z16.s, p0, [%[c_ptr0]]\n"
+                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        "st1w z20.s, p0, [c_ptr1]\n"
+                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+                        "st1w z24.s, p0, [c_ptr2]\n"
+                        "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+                        "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+                        "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+                        "st1w z28.s, p0, [c_ptr3]\n"
+                        "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
+                        "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
+                        "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq a_ptr3\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        ".unreq c_ptr3\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+                    );
+                    break;
+            }
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
\ No newline at end of file
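As an aside (illustrative only, not part of the patch): the block that ends above is the 4-row SVE FP32 inner loop, built from ld1rqw quad loads of A, ld1w loads of B, and fmla-by-lane accumulation into z16-z31. A rough C++ intrinsics sketch of a single k-step of that pattern, with made-up names and one shared predicate standing in for p0-p3/p7, would look like this:

    #include <arm_sve.h>  // sketch assumes __ARM_FEATURE_SVE

    // Sketch only: one k-step of the fmla-by-lane pattern, for one A row and
    // four B column-vectors feeding four accumulators.
    static inline void kstep_sketch(const float *a_row, const float *b_panel,
                                    svfloat32_t acc[4], svbool_t pg)
    {
        // ld1rqw: load a 128-bit quad of A and replicate it across the vector.
        svfloat32_t a = svld1rq_f32(svptrue_b32(), a_row);
        // ld1w: four consecutive vectors of B, one per accumulator column.
        svfloat32_t b0 = svld1_f32(pg, b_panel + 0 * svcntw());
        svfloat32_t b1 = svld1_f32(pg, b_panel + 1 * svcntw());
        svfloat32_t b2 = svld1_f32(pg, b_panel + 2 * svcntw());
        svfloat32_t b3 = svld1_f32(pg, b_panel + 3 * svcntw());
        // fmla zAcc.s, zB.s, zA.s[0]: scale each B vector by lane 0 of the A quad.
        acc[0] = svmla_lane_f32(acc[0], b0, a, 0);
        acc[1] = svmla_lane_f32(acc[1], b1, a, 0);
        acc[2] = svmla_lane_f32(acc[2], b2, a, 0);
        acc[3] = svmla_lane_f32(acc[3], b3, a, 0);
    }

The real kernel unrolls this over lanes 0-3 and over four A rows, rotating the B registers (z8-z15) so that the loads for the next step issue ahead of the multiplies that consume them.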
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp
index 3fd738e..2ca4ce2 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -41,17 +41,17 @@
     typedef void (*kern_type)(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
 
     /* Kernel blocking parameters */
-    static int out_width()
+    static unsigned int out_width()
     {
-        return svcnth() * 3;
+        return get_vector_length<__fp16>() * 3;
     }
 
-    static int out_height()
+    static unsigned int out_height()
     {
         return 8;
     }
 
-    static int k_unroll()
+    static unsigned int k_unroll()
     {
         return 1;
     }
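As an aside (illustrative only, not part of the patch): across these kernel headers the blocking hooks change from int to unsigned int and the raw svcnth()/svcntw() calls are replaced by get_vector_length<T>(). Assuming that helper returns the SVE register width in elements of T, which is what the call sites require, the substitution is a type and naming cleanup rather than a behavioural change. A minimal sketch of those assumed semantics:

    #include <arm_sve.h>  // sketch assumes __ARM_FEATURE_SVE

    // Hypothetical stand-in for arm_gemm's get_vector_length<T>():
    // the vector length in bytes (svcntb) divided by the element size.
    template <typename T>
    unsigned int vector_length_in_elements()
    {
        return static_cast<unsigned int>(svcntb() / sizeof(T));
    }
    // Under these semantics, vector_length_in_elements<__fp16>() equals svcnth(),
    // so out_width() still returns three vectors' worth of halfwords.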
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp
index 92ec888..517895c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -48,24 +48,24 @@
                 "mov z8.h, #0\n"
                 "ptrue p0.h\n"
                 "mov z9.h, #0\n"
-                "ld1rqh z0.h, p0/z, [%[a_ptr]]\n"
                 "mov z10.h, #0\n"
-                "ld1h z2.h, p0/z, [%[b_ptr]]\n"
                 "mov z11.h, #0\n"
-                "ld1h z3.h, p0/z, [%[b_ptr], #1, MUL VL]\n"
                 "mov z12.h, #0\n"
-                "ld1h z4.h, p0/z, [%[b_ptr], #2, MUL VL]\n"
+                "ld1rqh z0.h, p0/z, [%[a_ptr]]\n"
                 "mov z13.h, #0\n"
-                "ld1h z5.h, p0/z, [%[b_ptr], #3, MUL VL]\n"
+                "ld1h z2.h, p0/z, [%[b_ptr]]\n"
                 "mov z14.h, #0\n"
-                "ld1h z6.h, p0/z, [%[b_ptr], #4, MUL VL]\n"
+                "ld1h z3.h, p0/z, [%[b_ptr], #1, MUL VL]\n"
                 "mov z15.h, #0\n"
-                "add %[a_ptr], %[a_ptr], #0x20\n"
+                "ld1h z4.h, p0/z, [%[b_ptr], #2, MUL VL]\n"
                 "mov z16.h, #0\n"
-                "addvl %[b_ptr], %[b_ptr], #6\n"
+                "ld1h z5.h, p0/z, [%[b_ptr], #3, MUL VL]\n"
                 "mov z17.h, #0\n"
+                "ld1h z6.h, p0/z, [%[b_ptr], #4, MUL VL]\n"
                 "mov z18.h, #0\n"
+                "add %[a_ptr], %[a_ptr], #0x20\n"
                 "mov z19.h, #0\n"
+                "addvl %[b_ptr], %[b_ptr], #6\n"
                 "mov z20.h, #0\n"
                 "mov z21.h, #0\n"
                 "mov z22.h, #0\n"
@@ -199,37 +199,31 @@
                 "fmla z30.h, z7.h, z1.h[6]\n"
                 "fmla z31.h, z7.h, z1.h[7]\n"
                 "fmla z8.h, z2.h, z0.h[0]\n"
-                "st1h z8.h, p0, [%[c_ptr]]\n"
                 "fmla z9.h, z2.h, z0.h[1]\n"
                 "fmla z10.h, z2.h, z0.h[2]\n"
                 "fmla z11.h, z2.h, z0.h[3]\n"
                 "fmla z12.h, z2.h, z0.h[4]\n"
+                "st1h z8.h, p0, [%[c_ptr]]\n"
                 "fmla z13.h, z2.h, z0.h[5]\n"
                 "fmla z14.h, z2.h, z0.h[6]\n"
                 "fmla z15.h, z2.h, z0.h[7]\n"
                 "fmla z16.h, z3.h, z0.h[0]\n"
-                "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n"
                 "fmla z17.h, z3.h, z0.h[1]\n"
                 "fmla z18.h, z3.h, z0.h[2]\n"
                 "fmla z19.h, z3.h, z0.h[3]\n"
                 "fmla z20.h, z3.h, z0.h[4]\n"
+                "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n"
                 "fmla z21.h, z3.h, z0.h[5]\n"
                 "fmla z22.h, z3.h, z0.h[6]\n"
                 "fmla z23.h, z3.h, z0.h[7]\n"
                 "fmla z24.h, z4.h, z0.h[0]\n"
-                "st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n"
                 "fmla z25.h, z4.h, z0.h[1]\n"
-                "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n"
                 "fmla z26.h, z4.h, z0.h[2]\n"
-                "st1h z17.h, p0, [%[c_ptr], #4, MUL VL]\n"
                 "fmla z27.h, z4.h, z0.h[3]\n"
-                "st1h z25.h, p0, [%[c_ptr], #5, MUL VL]\n"
                 "fmla z28.h, z4.h, z0.h[4]\n"
-                "st1h z10.h, p0, [%[c_ptr], #6, MUL VL]\n"
+                "st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n"
                 "fmla z29.h, z4.h, z0.h[5]\n"
-                "st1h z18.h, p0, [%[c_ptr], #7, MUL VL]\n"
                 "fmla z30.h, z4.h, z0.h[6]\n"
-                "addvl %[c_ptr], %[c_ptr], #16\n"
                 "fmla z31.h, z4.h, z0.h[7]\n"
                 "b 4f\n"
                 "3:\n"
@@ -260,39 +254,39 @@
                 "fmla z30.h, z4.h, z0.h[6]\n"
                 "fmla z31.h, z4.h, z0.h[7]\n"
                 "fmla z8.h, z5.h, z1.h[0]\n"
-                "st1h z8.h, p0, [%[c_ptr]]\n"
                 "fmla z9.h, z5.h, z1.h[1]\n"
                 "fmla z10.h, z5.h, z1.h[2]\n"
                 "fmla z11.h, z5.h, z1.h[3]\n"
                 "fmla z12.h, z5.h, z1.h[4]\n"
+                "st1h z8.h, p0, [%[c_ptr]]\n"
                 "fmla z13.h, z5.h, z1.h[5]\n"
                 "fmla z14.h, z5.h, z1.h[6]\n"
                 "fmla z15.h, z5.h, z1.h[7]\n"
                 "fmla z16.h, z6.h, z1.h[0]\n"
-                "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n"
                 "fmla z17.h, z6.h, z1.h[1]\n"
                 "fmla z18.h, z6.h, z1.h[2]\n"
                 "fmla z19.h, z6.h, z1.h[3]\n"
                 "fmla z20.h, z6.h, z1.h[4]\n"
+                "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n"
                 "fmla z21.h, z6.h, z1.h[5]\n"
                 "fmla z22.h, z6.h, z1.h[6]\n"
                 "fmla z23.h, z6.h, z1.h[7]\n"
                 "fmla z24.h, z7.h, z1.h[0]\n"
-                "st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n"
                 "fmla z25.h, z7.h, z1.h[1]\n"
-                "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n"
                 "fmla z26.h, z7.h, z1.h[2]\n"
-                "st1h z17.h, p0, [%[c_ptr], #4, MUL VL]\n"
                 "fmla z27.h, z7.h, z1.h[3]\n"
-                "st1h z25.h, p0, [%[c_ptr], #5, MUL VL]\n"
                 "fmla z28.h, z7.h, z1.h[4]\n"
-                "st1h z10.h, p0, [%[c_ptr], #6, MUL VL]\n"
+                "st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n"
                 "fmla z29.h, z7.h, z1.h[5]\n"
-                "st1h z18.h, p0, [%[c_ptr], #7, MUL VL]\n"
                 "fmla z30.h, z7.h, z1.h[6]\n"
-                "addvl %[c_ptr], %[c_ptr], #16\n"
                 "fmla z31.h, z7.h, z1.h[7]\n"
                 "4:\n"
+                "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n"
+                "st1h z17.h, p0, [%[c_ptr], #4, MUL VL]\n"
+                "st1h z25.h, p0, [%[c_ptr], #5, MUL VL]\n"
+                "st1h z10.h, p0, [%[c_ptr], #6, MUL VL]\n"
+                "st1h z18.h, p0, [%[c_ptr], #7, MUL VL]\n"
+                "addvl %[c_ptr], %[c_ptr], #16\n"
                 "st1h z26.h, p0, [%[c_ptr], #-8, MUL VL]\n"
                 "st1h z11.h, p0, [%[c_ptr], #-7, MUL VL]\n"
                 "st1h z19.h, p0, [%[c_ptr], #-6, MUL VL]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp
index b2327f3..8c1fe6d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -41,17 +41,17 @@
     typedef void (*kern_type)(const float *, const float *, float *, int, int, int);
 
     /* Kernel blocking parameters */
-    static int out_width()
+    static unsigned int out_width()
     {
-        return svcntw() * 3;
+        return get_vector_length<float>() * 3;
     }
 
-    static int out_height()
+    static unsigned int out_height()
     {
         return 8;
     }
 
-    static int k_unroll()
+    static unsigned int k_unroll()
     {
         return 1;
     }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp
index bb08fc7..88c9840 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -48,22 +48,22 @@
                 "mov z8.s, #0\n"
                 "ptrue p0.s\n"
                 "mov z9.s, #0\n"
-                "ld1rqw z0.s, p0/z, [%[a_ptr]]\n"
                 "mov z10.s, #0\n"
-                "ld1w z4.s, p0/z, [%[b_ptr]]\n"
                 "mov z11.s, #0\n"
-                "ld1rqw z1.s, p0/z, [%[a_ptr], #0x10]\n"
                 "mov z12.s, #0\n"
-                "ld1w z5.s, p0/z, [%[b_ptr], #1, MUL VL]\n"
+                "ld1rqw z0.s, p0/z, [%[a_ptr]]\n"
                 "mov z13.s, #0\n"
-                "ld1rqw z2.s, p0/z, [%[a_ptr], #0x20]\n"
+                "ld1w z4.s, p0/z, [%[b_ptr]]\n"
                 "mov z14.s, #0\n"
-                "add %[a_ptr], %[a_ptr], #0x40\n"
+                "ld1rqw z1.s, p0/z, [%[a_ptr], #0x10]\n"
                 "mov z15.s, #0\n"
-                "addvl %[b_ptr], %[b_ptr], #3\n"
+                "ld1w z5.s, p0/z, [%[b_ptr], #1, MUL VL]\n"
                 "mov z16.s, #0\n"
+                "ld1rqw z2.s, p0/z, [%[a_ptr], #0x20]\n"
                 "mov z17.s, #0\n"
+                "add %[a_ptr], %[a_ptr], #0x40\n"
                 "mov z18.s, #0\n"
+                "addvl %[b_ptr], %[b_ptr], #3\n"
                 "mov z19.s, #0\n"
                 "mov z20.s, #0\n"
                 "mov z21.s, #0\n"
@@ -204,37 +204,31 @@
                 "fmla z31.s, z6.s, z3.s[3]\n"
                 "ld1w z6.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
                 "fmla z8.s, z4.s, z0.s[0]\n"
-                "st1w z8.s, p0, [%[c_ptr]]\n"
                 "fmla z9.s, z4.s, z0.s[1]\n"
                 "fmla z10.s, z4.s, z0.s[2]\n"
                 "fmla z11.s, z4.s, z0.s[3]\n"
                 "fmla z20.s, z4.s, z1.s[0]\n"
+                "st1w z8.s, p0, [%[c_ptr]]\n"
                 "fmla z21.s, z4.s, z1.s[1]\n"
                 "fmla z22.s, z4.s, z1.s[2]\n"
                 "fmla z23.s, z4.s, z1.s[3]\n"
                 "fmla z12.s, z5.s, z0.s[0]\n"
-                "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
                 "fmla z13.s, z5.s, z0.s[1]\n"
                 "fmla z14.s, z5.s, z0.s[2]\n"
                 "fmla z15.s, z5.s, z0.s[3]\n"
                 "fmla z24.s, z5.s, z1.s[0]\n"
+                "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
                 "fmla z25.s, z5.s, z1.s[1]\n"
                 "fmla z26.s, z5.s, z1.s[2]\n"
                 "fmla z27.s, z5.s, z1.s[3]\n"
                 "fmla z16.s, z6.s, z0.s[0]\n"
-                "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
                 "fmla z17.s, z6.s, z0.s[1]\n"
-                "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
                 "fmla z18.s, z6.s, z0.s[2]\n"
-                "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
                 "fmla z19.s, z6.s, z0.s[3]\n"
-                "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
                 "fmla z28.s, z6.s, z1.s[0]\n"
-                "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
+                "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
                 "fmla z29.s, z6.s, z1.s[1]\n"
-                "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
                 "fmla z30.s, z6.s, z1.s[2]\n"
-                "addvl %[c_ptr], %[c_ptr], #16\n"
                 "fmla z31.s, z6.s, z1.s[3]\n"
                 "b 4f\n"
                 "3:\n"
@@ -269,39 +263,39 @@
                 "fmla z31.s, z6.s, z1.s[3]\n"
                 "ld1w z6.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
                 "fmla z8.s, z4.s, z2.s[0]\n"
-                "st1w z8.s, p0, [%[c_ptr]]\n"
                 "fmla z9.s, z4.s, z2.s[1]\n"
                 "fmla z10.s, z4.s, z2.s[2]\n"
                 "fmla z11.s, z4.s, z2.s[3]\n"
                 "fmla z20.s, z4.s, z3.s[0]\n"
+                "st1w z8.s, p0, [%[c_ptr]]\n"
                 "fmla z21.s, z4.s, z3.s[1]\n"
                 "fmla z22.s, z4.s, z3.s[2]\n"
                 "fmla z23.s, z4.s, z3.s[3]\n"
                 "fmla z12.s, z5.s, z2.s[0]\n"
-                "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
                 "fmla z13.s, z5.s, z2.s[1]\n"
                 "fmla z14.s, z5.s, z2.s[2]\n"
                 "fmla z15.s, z5.s, z2.s[3]\n"
                 "fmla z24.s, z5.s, z3.s[0]\n"
+                "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
                 "fmla z25.s, z5.s, z3.s[1]\n"
                 "fmla z26.s, z5.s, z3.s[2]\n"
                 "fmla z27.s, z5.s, z3.s[3]\n"
                 "fmla z16.s, z6.s, z2.s[0]\n"
-                "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
                 "fmla z17.s, z6.s, z2.s[1]\n"
-                "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
                 "fmla z18.s, z6.s, z2.s[2]\n"
-                "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
                 "fmla z19.s, z6.s, z2.s[3]\n"
-                "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
                 "fmla z28.s, z6.s, z3.s[0]\n"
-                "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
+                "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
                 "fmla z29.s, z6.s, z3.s[1]\n"
-                "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
                 "fmla z30.s, z6.s, z3.s[2]\n"
-                "addvl %[c_ptr], %[c_ptr], #16\n"
                 "fmla z31.s, z6.s, z3.s[3]\n"
                 "4:\n"
+                "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
+                "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
+                "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
+                "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
+                "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
+                "addvl %[c_ptr], %[c_ptr], #16\n"
                 "st1w z18.s, p0, [%[c_ptr], #-8, MUL VL]\n"
                 "st1w z11.s, p0, [%[c_ptr], #-7, MUL VL]\n"
                 "st1w z15.s, p0, [%[c_ptr], #-6, MUL VL]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp
index 91aa567..cbb2138 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -41,17 +41,17 @@
     typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int);
 
     /* Kernel blocking parameters */
-    static int out_width()
+    static unsigned int out_width()
     {
-        return svcntw() * 3;
+        return get_vector_length<int32_t>() * 3;
     }
 
-    static int out_height()
+    static unsigned int out_height()
     {
         return 8;
     }
 
-    static int k_unroll()
+    static unsigned int k_unroll()
     {
         return 4;
     }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp
index 2e994a1..d679c21 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -49,22 +49,22 @@
                 "mov z8.s, #0\n"
                 "ptrue p0.b\n"
                 "mov z9.s, #0\n"
-                "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
                 "mov z10.s, #0\n"
-                "ld1b z4.b, p0/z, [%[b_ptr]]\n"
                 "mov z11.s, #0\n"
-                "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
                 "mov z12.s, #0\n"
-                "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
+                "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
                 "mov z13.s, #0\n"
-                "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n"
+                "ld1b z4.b, p0/z, [%[b_ptr]]\n"
                 "mov z14.s, #0\n"
-                "add %[a_ptr], %[a_ptr], #0x40\n"
+                "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
                 "mov z15.s, #0\n"
-                "addvl %[b_ptr], %[b_ptr], #3\n"
+                "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
                 "mov z16.s, #0\n"
+                "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n"
                 "mov z17.s, #0\n"
+                "add %[a_ptr], %[a_ptr], #0x40\n"
                 "mov z18.s, #0\n"
+                "addvl %[b_ptr], %[b_ptr], #3\n"
                 "mov z19.s, #0\n"
                 "mov z20.s, #0\n"
                 "mov z21.s, #0\n"
@@ -205,37 +205,31 @@
                 "sdot z31.s, z6.b, z3.b[3]\n"
                 "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
                 "sdot z8.s, z4.b, z0.b[0]\n"
-                "st1w z8.s, p0, [%[c_ptr]]\n"
                 "sdot z9.s, z4.b, z0.b[1]\n"
                 "sdot z10.s, z4.b, z0.b[2]\n"
                 "sdot z11.s, z4.b, z0.b[3]\n"
                 "sdot z20.s, z4.b, z1.b[0]\n"
+                "st1w z8.s, p0, [%[c_ptr]]\n"
                 "sdot z21.s, z4.b, z1.b[1]\n"
                 "sdot z22.s, z4.b, z1.b[2]\n"
                 "sdot z23.s, z4.b, z1.b[3]\n"
                 "sdot z12.s, z5.b, z0.b[0]\n"
-                "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
                 "sdot z13.s, z5.b, z0.b[1]\n"
                 "sdot z14.s, z5.b, z0.b[2]\n"
                 "sdot z15.s, z5.b, z0.b[3]\n"
                 "sdot z24.s, z5.b, z1.b[0]\n"
+                "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
                 "sdot z25.s, z5.b, z1.b[1]\n"
                 "sdot z26.s, z5.b, z1.b[2]\n"
                 "sdot z27.s, z5.b, z1.b[3]\n"
                 "sdot z16.s, z6.b, z0.b[0]\n"
-                "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
                 "sdot z17.s, z6.b, z0.b[1]\n"
-                "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
                 "sdot z18.s, z6.b, z0.b[2]\n"
-                "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
                 "sdot z19.s, z6.b, z0.b[3]\n"
-                "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
                 "sdot z28.s, z6.b, z1.b[0]\n"
-                "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
+                "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
                 "sdot z29.s, z6.b, z1.b[1]\n"
-                "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
                 "sdot z30.s, z6.b, z1.b[2]\n"
-                "addvl %[c_ptr], %[c_ptr], #16\n"
                 "sdot z31.s, z6.b, z1.b[3]\n"
                 "b 4f\n"
                 "3:\n"
@@ -270,39 +264,39 @@
                 "sdot z31.s, z6.b, z1.b[3]\n"
                 "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
                 "sdot z8.s, z4.b, z2.b[0]\n"
-                "st1w z8.s, p0, [%[c_ptr]]\n"
                 "sdot z9.s, z4.b, z2.b[1]\n"
                 "sdot z10.s, z4.b, z2.b[2]\n"
                 "sdot z11.s, z4.b, z2.b[3]\n"
                 "sdot z20.s, z4.b, z3.b[0]\n"
+                "st1w z8.s, p0, [%[c_ptr]]\n"
                 "sdot z21.s, z4.b, z3.b[1]\n"
                 "sdot z22.s, z4.b, z3.b[2]\n"
                 "sdot z23.s, z4.b, z3.b[3]\n"
                 "sdot z12.s, z5.b, z2.b[0]\n"
-                "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
                 "sdot z13.s, z5.b, z2.b[1]\n"
                 "sdot z14.s, z5.b, z2.b[2]\n"
                 "sdot z15.s, z5.b, z2.b[3]\n"
                 "sdot z24.s, z5.b, z3.b[0]\n"
+                "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
                 "sdot z25.s, z5.b, z3.b[1]\n"
                 "sdot z26.s, z5.b, z3.b[2]\n"
                 "sdot z27.s, z5.b, z3.b[3]\n"
                 "sdot z16.s, z6.b, z2.b[0]\n"
-                "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
                 "sdot z17.s, z6.b, z2.b[1]\n"
-                "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
                 "sdot z18.s, z6.b, z2.b[2]\n"
-                "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
                 "sdot z19.s, z6.b, z2.b[3]\n"
-                "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
                 "sdot z28.s, z6.b, z3.b[0]\n"
-                "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
+                "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
                 "sdot z29.s, z6.b, z3.b[1]\n"
-                "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
                 "sdot z30.s, z6.b, z3.b[2]\n"
-                "addvl %[c_ptr], %[c_ptr], #16\n"
                 "sdot z31.s, z6.b, z3.b[3]\n"
                 "4:\n"
+                "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
+                "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
+                "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
+                "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
+                "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
+                "addvl %[c_ptr], %[c_ptr], #16\n"
                 "st1w z18.s, p0, [%[c_ptr], #-8, MUL VL]\n"
                 "st1w z11.s, p0, [%[c_ptr], #-7, MUL VL]\n"
                 "st1w z15.s, p0, [%[c_ptr], #-6, MUL VL]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp
index ef457e4..99c039e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -41,17 +41,17 @@
     typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
 
     /* Kernel blocking parameters */
-    static int out_width()
+    static unsigned int out_width()
     {
-        return svcntw() * 3;
+        return get_vector_length<uint32_t>() * 3;
     }
 
-    static int out_height()
+    static unsigned int out_height()
     {
         return 8;
     }
 
-    static int k_unroll()
+    static unsigned int k_unroll()
     {
         return 4;
     }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp
new file mode 100644
index 0000000..d7f9f20
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void sve_native_fp32_mla_4VLx4(const float *, int, const float *, int ldb, float *, int, float, int, int, int);
+
+class native_fp32_mla_4VLx4
+{
+public:
+    typedef float operand_type;
+    typedef float result_type;
+
+    typedef void (*kern_type)(const float *, int, const float *, int ldb, float *, int, float, int, int, int);
+
+    /* Kernel blocking parameters */
+    static unsigned int out_height()
+    {
+        return 4;
+    }
+
+    static unsigned int out_width()
+    {
+        return get_vector_length<float>() * 4;
+    }
+
+    static unsigned int k_unroll()
+    {
+        return 1;
+    }
+
+
+
+    // Default to the generic kernel
+    kern_type kernel=sve_native_fp32_mla_4VLx4;
+
+    native_fp32_mla_4VLx4(const CPUInfo *ci)
+    {
+
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
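As an aside (illustrative only, not part of the patch): unlike the interleaved kernels above, this new native kernel reads A and B at their natural strides (lda/ldb) rather than from repacked buffers, and the blocking parameters in the header describe its output tile: out_height() = 4 rows and out_width() = 4 SVE vectors of floats per call, with k_unroll() = 1. A minimal sketch (names are invented) of the M/N tiling those parameters imply, which the generic.cpp below drives with a switch on the remaining rows and whilelt predicates on the remaining columns:

    #include <arm_sve.h>  // sketch assumes __ARM_FEATURE_SVE

    // Sketch of the outer tiling implied by out_height()/out_width(); the body
    // of each tile is what the inline-assembly kernel computes, with predicates
    // covering the ragged right-hand edge and the row switch covering the bottom.
    void native_fp32_tile_loop_sketch(unsigned int M, unsigned int N)
    {
        const unsigned int tile_rows = 4;                                       // out_height()
        const unsigned int tile_cols = 4 * static_cast<unsigned int>(svcntw()); // out_width()
        for (unsigned int y = 0; y < M; y += tile_rows) {
            for (unsigned int x = 0; x < N; x += tile_cols) {
                // One kernel invocation fills C rows [y, y+4) and columns [x, x+tile_cols).
            }
        }
    }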
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp
new file mode 100644
index 0000000..6e22566
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp
@@ -0,0 +1,2066 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb, float *C, int ldc, float beta, int M, int N, int K) {
+    const long beta0 = (beta == 0.0f);
+    const long loops_count = ((K + 4) / 8) - 1;
+    K -= loops_count * 8;
+    const long regs_count = (K / 4) - 1;
+    K -= (regs_count + 1) * 4;
+    const long leftovers = K;
+
+    for (int y=0; y<M; y+=4) {
+        const float * const a_ptr0_base = A + (y * lda);
+        const unsigned long ldab = lda * sizeof(float);
+
+        float *c_ptr0 = C + (y * ldc);
+        const unsigned long ldcb = ldc * sizeof(float);
+
+        for (int x0=0; x0<N; x0+=(4 * get_vector_length<float>())) {
+            const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<float>()));
+            const float *betaptr = &beta;
+            long loops = loops_count;
+            long regs = regs_count;
+            long temp = 0;
+            long blocks = leftovers;
+            const float *a_ptr0 = a_ptr0_base;
+            const float *b_ptr0 = B + x0;
+            long ldbb = ldb * sizeof(float);
+
+            switch(M-y) {
+                case 1:
+                    __asm __volatile (
+                        "whilelt p6.s, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "ptrue p7.s\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p2.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p3.s, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.s, #0\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.s, #0\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "mov z18.s, #0\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z19.s, #0\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "b 2f\n"
+                        "1:\n"
+                        "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "fmul z16.s, p7/m, z16.s, z15.s\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "fmul z17.s, p7/m, z17.s, z15.s\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmul z18.s, p7/m, z18.s, z15.s\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmul z19.s, p7/m, z19.s, z15.s\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "2:\n"
+                        "cbz %[loops], 3f\n"
+                        "4:\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z16.s, z8.s, z0.s[0]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z17.s, z9.s, z0.s[0]\n"
+                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+                        "fmla z18.s, z10.s, z0.s[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z19.s, z11.s, z0.s[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla z16.s, z12.s, z0.s[1]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla z17.s, z13.s, z0.s[1]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z18.s, z14.s, z0.s[1]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.s, z15.s, z0.s[1]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.s, z8.s, z0.s[2]\n"
+                        "fmla z17.s, z9.s, z0.s[2]\n"
+                        "fmla z18.s, z10.s, z0.s[2]\n"
+                        "fmla z19.s, z11.s, z0.s[2]\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.s, z12.s, z0.s[3]\n"
+                        "fmla z17.s, z13.s, z0.s[3]\n"
+                        "fmla z18.s, z14.s, z0.s[3]\n"
+                        "fmla z19.s, z15.s, z0.s[3]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.s, z8.s, z4.s[0]\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "fmla z17.s, z9.s, z4.s[0]\n"
+                        "fmla z18.s, z10.s, z4.s[0]\n"
+                        "fmla z19.s, z11.s, z4.s[0]\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.s, z12.s, z4.s[1]\n"
+                        "fmla z17.s, z13.s, z4.s[1]\n"
+                        "fmla z18.s, z14.s, z4.s[1]\n"
+                        "fmla z19.s, z15.s, z4.s[1]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.s, z8.s, z4.s[2]\n"
+                        "fmla z17.s, z9.s, z4.s[2]\n"
+                        "fmla z18.s, z10.s, z4.s[2]\n"
+                        "fmla z19.s, z11.s, z4.s[2]\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.s, z12.s, z4.s[3]\n"
+                        "fmla z17.s, z13.s, z4.s[3]\n"
+                        "fmla z18.s, z14.s, z4.s[3]\n"
+                        "fmla z19.s, z15.s, z4.s[3]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "b.ne 4b\n"
+                        "3:\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "cbz %[regs], 5f\n"
+                        "fmla z16.s, z8.s, z0.s[0]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z17.s, z9.s, z0.s[0]\n"
+                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+                        "fmla z18.s, z10.s, z0.s[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z19.s, z11.s, z0.s[0]\n"
+                        "fmla z16.s, z12.s, z0.s[1]\n"
+                        "fmla z17.s, z13.s, z0.s[1]\n"
+                        "fmla z18.s, z14.s, z0.s[1]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.s, z15.s, z0.s[1]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.s, z8.s, z0.s[2]\n"
+                        "fmla z17.s, z9.s, z0.s[2]\n"
+                        "fmla z18.s, z10.s, z0.s[2]\n"
+                        "fmla z19.s, z11.s, z0.s[2]\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.s, z12.s, z0.s[3]\n"
+                        "fmla z17.s, z13.s, z0.s[3]\n"
+                        "fmla z18.s, z14.s, z0.s[3]\n"
+                        "fmla z19.s, z15.s, z0.s[3]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.s, z8.s, z4.s[0]\n"
+                        "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+                        "fmla z17.s, z9.s, z4.s[0]\n"
+                        "fmla z18.s, z10.s, z4.s[0]\n"
+                        "fmla z19.s, z11.s, z4.s[0]\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.s, z12.s, z4.s[1]\n"
+                        "fmla z17.s, z13.s, z4.s[1]\n"
+                        "fmla z18.s, z14.s, z4.s[1]\n"
+                        "fmla z19.s, z15.s, z4.s[1]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.s, z8.s, z4.s[2]\n"
+                        "fmla z17.s, z9.s, z4.s[2]\n"
+                        "fmla z18.s, z10.s, z4.s[2]\n"
+                        "fmla z19.s, z11.s, z4.s[2]\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z4.s[3]\n"
+                        "fmla z17.s, z13.s, z4.s[3]\n"
+                        "fmla z18.s, z14.s, z4.s[3]\n"
+                        "fmla z19.s, z15.s, z4.s[3]\n"
+                        "cbz %[blocks], 6f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z0.s[0]\n"
+                        "fmla z17.s, z9.s, z0.s[0]\n"
+                        "fmla z18.s, z10.s, z0.s[0]\n"
+                        "fmla z19.s, z11.s, z0.s[0]\n"
+                        "b.eq 6f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[1]\n"
+                        "fmla z17.s, z13.s, z0.s[1]\n"
+                        "fmla z18.s, z14.s, z0.s[1]\n"
+                        "fmla z19.s, z15.s, z0.s[1]\n"
+                        "b.eq 6f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z0.s[2]\n"
+                        "fmla z17.s, z9.s, z0.s[2]\n"
+                        "fmla z18.s, z10.s, z0.s[2]\n"
+                        "fmla z19.s, z11.s, z0.s[2]\n"
+                        "b 6f\n"
+                        "5:\n"
+                        "fmla z16.s, z8.s, z0.s[0]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z17.s, z9.s, z0.s[0]\n"
+                        "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+                        "fmla z18.s, z10.s, z0.s[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z19.s, z11.s, z0.s[0]\n"
+                        "fmla z16.s, z12.s, z0.s[1]\n"
+                        "fmla z17.s, z13.s, z0.s[1]\n"
+                        "fmla z18.s, z14.s, z0.s[1]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.s, z15.s, z0.s[1]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.s, z8.s, z0.s[2]\n"
+                        "fmla z17.s, z9.s, z0.s[2]\n"
+                        "fmla z18.s, z10.s, z0.s[2]\n"
+                        "fmla z19.s, z11.s, z0.s[2]\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[3]\n"
+                        "fmla z17.s, z13.s, z0.s[3]\n"
+                        "fmla z18.s, z14.s, z0.s[3]\n"
+                        "fmla z19.s, z15.s, z0.s[3]\n"
+                        "cbz %[blocks], 6f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[0]\n"
+                        "fmla z17.s, z9.s, z4.s[0]\n"
+                        "fmla z18.s, z10.s, z4.s[0]\n"
+                        "fmla z19.s, z11.s, z4.s[0]\n"
+                        "b.eq 6f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z4.s[1]\n"
+                        "fmla z17.s, z13.s, z4.s[1]\n"
+                        "fmla z18.s, z14.s, z4.s[1]\n"
+                        "fmla z19.s, z15.s, z4.s[1]\n"
+                        "b.eq 6f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[2]\n"
+                        "fmla z17.s, z9.s, z4.s[2]\n"
+                        "fmla z18.s, z10.s, z4.s[2]\n"
+                        "fmla z19.s, z11.s, z4.s[2]\n"
+                        "6:\n"
+                        "st1w z16.s, p0, [%[c_ptr0]]\n"
+                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                    );
+                    break;
+                case 2:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "c_ptr1 .req X1\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "whilelt p6.s, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "ptrue p7.s\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p2.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p3.s, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.s, #0\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.s, #0\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                        "mov z18.s, #0\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "mov z19.s, #0\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z20.s, #0\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "mov z21.s, #0\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z22.s, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "mov z23.s, #0\n"
+                        "b 2f\n"
+                        "1:\n"
+                        "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "ld1w z20.s, p0/z, [c_ptr1]\n"
+                        "fmul z16.s, p7/m, z16.s, z15.s\n"
+                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "fmul z17.s, p7/m, z17.s, z15.s\n"
+                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "fmul z18.s, p7/m, z18.s, z15.s\n"
+                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "fmul z19.s, p7/m, z19.s, z15.s\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                        "fmul z20.s, p7/m, z20.s, z15.s\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                        "fmul z21.s, p7/m, z21.s, z15.s\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "fmul z22.s, p7/m, z22.s, z15.s\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmul z23.s, p7/m, z23.s, z15.s\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "2:\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "cbz %[loops], 3f\n"
+                        "4:\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z16.s, z8.s, z0.s[0]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.s, z8.s, z1.s[0]\n"
+                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+                        "fmla z17.s, z9.s, z0.s[0]\n"
+                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+                        "fmla z21.s, z9.s, z1.s[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z18.s, z10.s, z0.s[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla z22.s, z10.s, z1.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "fmla z19.s, z11.s, z0.s[0]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z23.s, z11.s, z1.s[0]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[1]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z20.s, z12.s, z1.s[1]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z17.s, z13.s, z0.s[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z21.s, z13.s, z1.s[1]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla z18.s, z14.s, z0.s[1]\n"
+                        "fmla z22.s, z14.s, z1.s[1]\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z19.s, z15.s, z0.s[1]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z23.s, z15.s, z1.s[1]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z16.s, z8.s, z0.s[2]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.s, z8.s, z1.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z17.s, z9.s, z0.s[2]\n"
+                        "fmla z21.s, z9.s, z1.s[2]\n"
+                        "fmla z18.s, z10.s, z0.s[2]\n"
+                        "fmla z22.s, z10.s, z1.s[2]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z19.s, z11.s, z0.s[2]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z23.s, z11.s, z1.s[2]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[3]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.s, z12.s, z1.s[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z17.s, z13.s, z0.s[3]\n"
+                        "fmla z21.s, z13.s, z1.s[3]\n"
+                        "fmla z18.s, z14.s, z0.s[3]\n"
+                        "fmla z22.s, z14.s, z1.s[3]\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z19.s, z15.s, z0.s[3]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z23.s, z15.s, z1.s[3]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[0]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.s, z8.s, z5.s[0]\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "fmla z17.s, z9.s, z4.s[0]\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+                        "fmla z21.s, z9.s, z5.s[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z18.s, z10.s, z4.s[0]\n"
+                        "fmla z22.s, z10.s, z5.s[0]\n"
+                        "fmla z19.s, z11.s, z4.s[0]\n"
+                        "fmla z23.s, z11.s, z5.s[0]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z16.s, z12.s, z4.s[1]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z20.s, z12.s, z5.s[1]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z17.s, z13.s, z4.s[1]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z21.s, z13.s, z5.s[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z18.s, z14.s, z4.s[1]\n"
+                        "fmla z22.s, z14.s, z5.s[1]\n"
+                        "fmla z19.s, z15.s, z4.s[1]\n"
+                        "fmla z23.s, z15.s, z5.s[1]\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z16.s, z8.s, z4.s[2]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z20.s, z8.s, z5.s[2]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z17.s, z9.s, z4.s[2]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z21.s, z9.s, z5.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z18.s, z10.s, z4.s[2]\n"
+                        "fmla z22.s, z10.s, z5.s[2]\n"
+                        "fmla z19.s, z11.s, z4.s[2]\n"
+                        "fmla z23.s, z11.s, z5.s[2]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z16.s, z12.s, z4.s[3]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z20.s, z12.s, z5.s[3]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z17.s, z13.s, z4.s[3]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z21.s, z13.s, z5.s[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z18.s, z14.s, z4.s[3]\n"
+                        "fmla z22.s, z14.s, z5.s[3]\n"
+                        "fmla z19.s, z15.s, z4.s[3]\n"
+                        "fmla z23.s, z15.s, z5.s[3]\n"
+                        "b.ne 4b\n"
+                        "3:\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "cbz %[regs], 5f\n"
+                        "fmla z16.s, z8.s, z0.s[0]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.s, z8.s, z1.s[0]\n"
+                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+                        "fmla z17.s, z9.s, z0.s[0]\n"
+                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+                        "fmla z21.s, z9.s, z1.s[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z18.s, z10.s, z0.s[0]\n"
+                        "fmla z22.s, z10.s, z1.s[0]\n"
+                        "fmla z19.s, z11.s, z0.s[0]\n"
+                        "fmla z23.s, z11.s, z1.s[0]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z16.s, z12.s, z0.s[1]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z20.s, z12.s, z1.s[1]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z17.s, z13.s, z0.s[1]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z21.s, z13.s, z1.s[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z18.s, z14.s, z0.s[1]\n"
+                        "fmla z22.s, z14.s, z1.s[1]\n"
+                        "fmla z19.s, z15.s, z0.s[1]\n"
+                        "fmla z23.s, z15.s, z1.s[1]\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z16.s, z8.s, z0.s[2]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z20.s, z8.s, z1.s[2]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z17.s, z9.s, z0.s[2]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z21.s, z9.s, z1.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z18.s, z10.s, z0.s[2]\n"
+                        "fmla z22.s, z10.s, z1.s[2]\n"
+                        "fmla z19.s, z11.s, z0.s[2]\n"
+                        "fmla z23.s, z11.s, z1.s[2]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z16.s, z12.s, z0.s[3]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z20.s, z12.s, z1.s[3]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z17.s, z13.s, z0.s[3]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z21.s, z13.s, z1.s[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z18.s, z14.s, z0.s[3]\n"
+                        "fmla z22.s, z14.s, z1.s[3]\n"
+                        "fmla z19.s, z15.s, z0.s[3]\n"
+                        "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+                        "fmla z23.s, z15.s, z1.s[3]\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z16.s, z8.s, z4.s[0]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z20.s, z8.s, z5.s[0]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z17.s, z9.s, z4.s[0]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z21.s, z9.s, z5.s[0]\n"
+                        "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+                        "fmla z18.s, z10.s, z4.s[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z22.s, z10.s, z5.s[0]\n"
+                        "fmla z19.s, z11.s, z4.s[0]\n"
+                        "fmla z23.s, z11.s, z5.s[0]\n"
+                        "fmla z16.s, z12.s, z4.s[1]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z20.s, z12.s, z5.s[1]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z17.s, z13.s, z4.s[1]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z21.s, z13.s, z5.s[1]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z18.s, z14.s, z4.s[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z22.s, z14.s, z5.s[1]\n"
+                        "fmla z19.s, z15.s, z4.s[1]\n"
+                        "fmla z23.s, z15.s, z5.s[1]\n"
+                        "fmla z16.s, z8.s, z4.s[2]\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z20.s, z8.s, z5.s[2]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z17.s, z9.s, z4.s[2]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z21.s, z9.s, z5.s[2]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z18.s, z10.s, z4.s[2]\n"
+                        "fmla z22.s, z10.s, z5.s[2]\n"
+                        "fmla z19.s, z11.s, z4.s[2]\n"
+                        "fmla z23.s, z11.s, z5.s[2]\n"
+                        "fmla z16.s, z12.s, z4.s[3]\n"
+                        "fmla z20.s, z12.s, z5.s[3]\n"
+                        "fmla z17.s, z13.s, z4.s[3]\n"
+                        "fmla z21.s, z13.s, z5.s[3]\n"
+                        "fmla z18.s, z14.s, z4.s[3]\n"
+                        "fmla z22.s, z14.s, z5.s[3]\n"
+                        "fmla z19.s, z15.s, z4.s[3]\n"
+                        "fmla z23.s, z15.s, z5.s[3]\n"
+                        "cbz %[blocks], 6f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z0.s[0]\n"
+                        "fmla z20.s, z8.s, z1.s[0]\n"
+                        "fmla z17.s, z9.s, z0.s[0]\n"
+                        "fmla z21.s, z9.s, z1.s[0]\n"
+                        "fmla z18.s, z10.s, z0.s[0]\n"
+                        "fmla z22.s, z10.s, z1.s[0]\n"
+                        "fmla z19.s, z11.s, z0.s[0]\n"
+                        "fmla z23.s, z11.s, z1.s[0]\n"
+                        "b.eq 6f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[1]\n"
+                        "fmla z20.s, z12.s, z1.s[1]\n"
+                        "fmla z17.s, z13.s, z0.s[1]\n"
+                        "fmla z21.s, z13.s, z1.s[1]\n"
+                        "fmla z18.s, z14.s, z0.s[1]\n"
+                        "fmla z22.s, z14.s, z1.s[1]\n"
+                        "fmla z19.s, z15.s, z0.s[1]\n"
+                        "fmla z23.s, z15.s, z1.s[1]\n"
+                        "b.eq 6f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z0.s[2]\n"
+                        "fmla z20.s, z8.s, z1.s[2]\n"
+                        "fmla z17.s, z9.s, z0.s[2]\n"
+                        "fmla z21.s, z9.s, z1.s[2]\n"
+                        "fmla z18.s, z10.s, z0.s[2]\n"
+                        "fmla z22.s, z10.s, z1.s[2]\n"
+                        "fmla z19.s, z11.s, z0.s[2]\n"
+                        "fmla z23.s, z11.s, z1.s[2]\n"
+                        "b 6f\n"
+                        "5:\n"
+                        "fmla z16.s, z8.s, z0.s[0]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.s, z8.s, z1.s[0]\n"
+                        "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+                        "fmla z17.s, z9.s, z0.s[0]\n"
+                        "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+                        "fmla z21.s, z9.s, z1.s[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z18.s, z10.s, z0.s[0]\n"
+                        "fmla z22.s, z10.s, z1.s[0]\n"
+                        "fmla z19.s, z11.s, z0.s[0]\n"
+                        "fmla z23.s, z11.s, z1.s[0]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z16.s, z12.s, z0.s[1]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z20.s, z12.s, z1.s[1]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z17.s, z13.s, z0.s[1]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z21.s, z13.s, z1.s[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z18.s, z14.s, z0.s[1]\n"
+                        "fmla z22.s, z14.s, z1.s[1]\n"
+                        "fmla z19.s, z15.s, z0.s[1]\n"
+                        "fmla z23.s, z15.s, z1.s[1]\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z16.s, z8.s, z0.s[2]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z20.s, z8.s, z1.s[2]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z17.s, z9.s, z0.s[2]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z21.s, z9.s, z1.s[2]\n"
+                        "fmla z18.s, z10.s, z0.s[2]\n"
+                        "fmla z22.s, z10.s, z1.s[2]\n"
+                        "fmla z19.s, z11.s, z0.s[2]\n"
+                        "fmla z23.s, z11.s, z1.s[2]\n"
+                        "fmla z16.s, z12.s, z0.s[3]\n"
+                        "fmla z20.s, z12.s, z1.s[3]\n"
+                        "fmla z17.s, z13.s, z0.s[3]\n"
+                        "fmla z21.s, z13.s, z1.s[3]\n"
+                        "fmla z18.s, z14.s, z0.s[3]\n"
+                        "fmla z22.s, z14.s, z1.s[3]\n"
+                        "fmla z19.s, z15.s, z0.s[3]\n"
+                        "fmla z23.s, z15.s, z1.s[3]\n"
+                        "cbz %[blocks], 6f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[0]\n"
+                        "fmla z20.s, z8.s, z5.s[0]\n"
+                        "fmla z17.s, z9.s, z4.s[0]\n"
+                        "fmla z21.s, z9.s, z5.s[0]\n"
+                        "fmla z18.s, z10.s, z4.s[0]\n"
+                        "fmla z22.s, z10.s, z5.s[0]\n"
+                        "fmla z19.s, z11.s, z4.s[0]\n"
+                        "fmla z23.s, z11.s, z5.s[0]\n"
+                        "b.eq 6f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z4.s[1]\n"
+                        "fmla z20.s, z12.s, z5.s[1]\n"
+                        "fmla z17.s, z13.s, z4.s[1]\n"
+                        "fmla z21.s, z13.s, z5.s[1]\n"
+                        "fmla z18.s, z14.s, z4.s[1]\n"
+                        "fmla z22.s, z14.s, z5.s[1]\n"
+                        "fmla z19.s, z15.s, z4.s[1]\n"
+                        "fmla z23.s, z15.s, z5.s[1]\n"
+                        "b.eq 6f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[2]\n"
+                        "fmla z20.s, z8.s, z5.s[2]\n"
+                        "fmla z17.s, z9.s, z4.s[2]\n"
+                        "fmla z21.s, z9.s, z5.s[2]\n"
+                        "fmla z18.s, z10.s, z4.s[2]\n"
+                        "fmla z22.s, z10.s, z5.s[2]\n"
+                        "fmla z19.s, z11.s, z4.s[2]\n"
+                        "fmla z23.s, z11.s, z5.s[2]\n"
+                        "6:\n"
+                        "st1w z16.s, p0, [%[c_ptr0]]\n"
+                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        "st1w z20.s, p0, [c_ptr1]\n"
+                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq c_ptr1\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
+                    );
+                    break;
+                case 3:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "c_ptr1 .req X2\n"
+                        "c_ptr2 .req X3\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "whilelt p6.s, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "ptrue p7.s\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p2.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p3.s, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.s, #0\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.s, #0\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                        "mov z18.s, #0\n"
+                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                        "mov z19.s, #0\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "mov z20.s, #0\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z21.s, #0\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "mov z22.s, #0\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z23.s, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "mov z24.s, #0\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "mov z25.s, #0\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "mov z26.s, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z27.s, #0\n"
+                        "b 2f\n"
+                        "1:\n"
+                        "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "ld1w z20.s, p0/z, [c_ptr1]\n"
+                        "fmul z16.s, p7/m, z16.s, z15.s\n"
+                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "fmul z17.s, p7/m, z17.s, z15.s\n"
+                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "fmul z18.s, p7/m, z18.s, z15.s\n"
+                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "fmul z19.s, p7/m, z19.s, z15.s\n"
+                        "ld1w z24.s, p0/z, [c_ptr2]\n"
+                        "fmul z20.s, p7/m, z20.s, z15.s\n"
+                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "fmul z21.s, p7/m, z21.s, z15.s\n"
+                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+                        "fmul z22.s, p7/m, z22.s, z15.s\n"
+                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+                        "fmul z23.s, p7/m, z23.s, z15.s\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                        "fmul z24.s, p7/m, z24.s, z15.s\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                        "fmul z25.s, p7/m, z25.s, z15.s\n"
+                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                        "fmul z26.s, p7/m, z26.s, z15.s\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "fmul z27.s, p7/m, z27.s, z15.s\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "2:\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "cbz %[loops], 3f\n"
+                        "4:\n"
+                        "fmla z16.s, z8.s, z0.s[0]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.s, z8.s, z1.s[0]\n"
+                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+                        "fmla z24.s, z8.s, z2.s[0]\n"
+                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+                        "fmla z17.s, z9.s, z0.s[0]\n"
+                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+                        "fmla z21.s, z9.s, z1.s[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z25.s, z9.s, z2.s[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla z18.s, z10.s, z0.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "fmla z22.s, z10.s, z1.s[0]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z26.s, z10.s, z2.s[0]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.s, z11.s, z0.s[0]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z23.s, z11.s, z1.s[0]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "fmla z27.s, z11.s, z2.s[0]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.s, z12.s, z1.s[1]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla z24.s, z12.s, z2.s[1]\n"
+                        "fmla z17.s, z13.s, z0.s[1]\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z21.s, z13.s, z1.s[1]\n"
+                        "fmla z25.s, z13.s, z2.s[1]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z14.s, z0.s[1]\n"
+                        "fmla z22.s, z14.s, z1.s[1]\n"
+                        "fmla z26.s, z14.s, z2.s[1]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z15.s, z0.s[1]\n"
+                        "fmla z23.s, z15.s, z1.s[1]\n"
+                        "fmla z27.s, z15.s, z2.s[1]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.s, z8.s, z1.s[2]\n"
+                        "fmla z24.s, z8.s, z2.s[2]\n"
+                        "fmla z17.s, z9.s, z0.s[2]\n"
+                        "fmla z21.s, z9.s, z1.s[2]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z25.s, z9.s, z2.s[2]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z10.s, z0.s[2]\n"
+                        "fmla z22.s, z10.s, z1.s[2]\n"
+                        "fmla z26.s, z10.s, z2.s[2]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z11.s, z0.s[2]\n"
+                        "fmla z23.s, z11.s, z1.s[2]\n"
+                        "fmla z27.s, z11.s, z2.s[2]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.s, z12.s, z1.s[3]\n"
+                        "fmla z24.s, z12.s, z2.s[3]\n"
+                        "fmla z17.s, z13.s, z0.s[3]\n"
+                        "fmla z21.s, z13.s, z1.s[3]\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z25.s, z13.s, z2.s[3]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z14.s, z0.s[3]\n"
+                        "fmla z22.s, z14.s, z1.s[3]\n"
+                        "fmla z26.s, z14.s, z2.s[3]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z15.s, z0.s[3]\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "fmla z23.s, z15.s, z1.s[3]\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+                        "fmla z27.s, z15.s, z2.s[3]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[0]\n"
+                        "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
+                        "fmla z20.s, z8.s, z5.s[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z24.s, z8.s, z6.s[0]\n"
+                        "fmla z17.s, z9.s, z4.s[0]\n"
+                        "fmla z21.s, z9.s, z5.s[0]\n"
+                        "fmla z25.s, z9.s, z6.s[0]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z18.s, z10.s, z4.s[0]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z22.s, z10.s, z5.s[0]\n"
+                        "fmla z26.s, z10.s, z6.s[0]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z11.s, z4.s[0]\n"
+                        "fmla z23.s, z11.s, z5.s[0]\n"
+                        "fmla z27.s, z11.s, z6.s[0]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z4.s[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.s, z12.s, z5.s[1]\n"
+                        "fmla z24.s, z12.s, z6.s[1]\n"
+                        "fmla z17.s, z13.s, z4.s[1]\n"
+                        "fmla z21.s, z13.s, z5.s[1]\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z25.s, z13.s, z6.s[1]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z14.s, z4.s[1]\n"
+                        "fmla z22.s, z14.s, z5.s[1]\n"
+                        "fmla z26.s, z14.s, z6.s[1]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z15.s, z4.s[1]\n"
+                        "fmla z23.s, z15.s, z5.s[1]\n"
+                        "fmla z27.s, z15.s, z6.s[1]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.s, z8.s, z5.s[2]\n"
+                        "fmla z24.s, z8.s, z6.s[2]\n"
+                        "fmla z17.s, z9.s, z4.s[2]\n"
+                        "fmla z21.s, z9.s, z5.s[2]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z25.s, z9.s, z6.s[2]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z10.s, z4.s[2]\n"
+                        "fmla z22.s, z10.s, z5.s[2]\n"
+                        "fmla z26.s, z10.s, z6.s[2]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z11.s, z4.s[2]\n"
+                        "fmla z23.s, z11.s, z5.s[2]\n"
+                        "fmla z27.s, z11.s, z6.s[2]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z4.s[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.s, z12.s, z5.s[3]\n"
+                        "fmla z24.s, z12.s, z6.s[3]\n"
+                        "fmla z17.s, z13.s, z4.s[3]\n"
+                        "fmla z21.s, z13.s, z5.s[3]\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z25.s, z13.s, z6.s[3]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z14.s, z4.s[3]\n"
+                        "fmla z22.s, z14.s, z5.s[3]\n"
+                        "fmla z26.s, z14.s, z6.s[3]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z15.s, z4.s[3]\n"
+                        "fmla z23.s, z15.s, z5.s[3]\n"
+                        "fmla z27.s, z15.s, z6.s[3]\n"
+                        "b.ne 4b\n"
+                        "3:\n"
+                        "cbz %[regs], 5f\n"
+                        "fmla z16.s, z8.s, z0.s[0]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.s, z8.s, z1.s[0]\n"
+                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+                        "fmla z24.s, z8.s, z2.s[0]\n"
+                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+                        "fmla z17.s, z9.s, z0.s[0]\n"
+                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+                        "fmla z21.s, z9.s, z1.s[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z25.s, z9.s, z2.s[0]\n"
+                        "fmla z18.s, z10.s, z0.s[0]\n"
+                        "fmla z22.s, z10.s, z1.s[0]\n"
+                        "fmla z26.s, z10.s, z2.s[0]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z19.s, z11.s, z0.s[0]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z23.s, z11.s, z1.s[0]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z27.s, z11.s, z2.s[0]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.s, z12.s, z1.s[1]\n"
+                        "fmla z24.s, z12.s, z2.s[1]\n"
+                        "fmla z17.s, z13.s, z0.s[1]\n"
+                        "fmla z21.s, z13.s, z1.s[1]\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z25.s, z13.s, z2.s[1]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z14.s, z0.s[1]\n"
+                        "fmla z22.s, z14.s, z1.s[1]\n"
+                        "fmla z26.s, z14.s, z2.s[1]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z15.s, z0.s[1]\n"
+                        "fmla z23.s, z15.s, z1.s[1]\n"
+                        "fmla z27.s, z15.s, z2.s[1]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.s, z8.s, z1.s[2]\n"
+                        "fmla z24.s, z8.s, z2.s[2]\n"
+                        "fmla z17.s, z9.s, z0.s[2]\n"
+                        "fmla z21.s, z9.s, z1.s[2]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z25.s, z9.s, z2.s[2]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z10.s, z0.s[2]\n"
+                        "fmla z22.s, z10.s, z1.s[2]\n"
+                        "fmla z26.s, z10.s, z2.s[2]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z11.s, z0.s[2]\n"
+                        "fmla z23.s, z11.s, z1.s[2]\n"
+                        "fmla z27.s, z11.s, z2.s[2]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.s, z12.s, z1.s[3]\n"
+                        "fmla z24.s, z12.s, z2.s[3]\n"
+                        "fmla z17.s, z13.s, z0.s[3]\n"
+                        "fmla z21.s, z13.s, z1.s[3]\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z25.s, z13.s, z2.s[3]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z14.s, z0.s[3]\n"
+                        "fmla z22.s, z14.s, z1.s[3]\n"
+                        "fmla z26.s, z14.s, z2.s[3]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z15.s, z0.s[3]\n"
+                        "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+                        "fmla z23.s, z15.s, z1.s[3]\n"
+                        "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+                        "fmla z27.s, z15.s, z2.s[3]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[0]\n"
+                        "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+                        "fmla z20.s, z8.s, z5.s[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z24.s, z8.s, z6.s[0]\n"
+                        "fmla z17.s, z9.s, z4.s[0]\n"
+                        "fmla z21.s, z9.s, z5.s[0]\n"
+                        "fmla z25.s, z9.s, z6.s[0]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z18.s, z10.s, z4.s[0]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z22.s, z10.s, z5.s[0]\n"
+                        "fmla z26.s, z10.s, z6.s[0]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z11.s, z4.s[0]\n"
+                        "fmla z23.s, z11.s, z5.s[0]\n"
+                        "fmla z27.s, z11.s, z6.s[0]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z4.s[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.s, z12.s, z5.s[1]\n"
+                        "fmla z24.s, z12.s, z6.s[1]\n"
+                        "fmla z17.s, z13.s, z4.s[1]\n"
+                        "fmla z21.s, z13.s, z5.s[1]\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z25.s, z13.s, z6.s[1]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z14.s, z4.s[1]\n"
+                        "fmla z22.s, z14.s, z5.s[1]\n"
+                        "fmla z26.s, z14.s, z6.s[1]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z15.s, z4.s[1]\n"
+                        "fmla z23.s, z15.s, z5.s[1]\n"
+                        "fmla z27.s, z15.s, z6.s[1]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[2]\n"
+                        "fmla z20.s, z8.s, z5.s[2]\n"
+                        "fmla z24.s, z8.s, z6.s[2]\n"
+                        "fmla z17.s, z9.s, z4.s[2]\n"
+                        "fmla z21.s, z9.s, z5.s[2]\n"
+                        "fmla z25.s, z9.s, z6.s[2]\n"
+                        "fmla z18.s, z10.s, z4.s[2]\n"
+                        "fmla z22.s, z10.s, z5.s[2]\n"
+                        "fmla z26.s, z10.s, z6.s[2]\n"
+                        "fmla z19.s, z11.s, z4.s[2]\n"
+                        "fmla z23.s, z11.s, z5.s[2]\n"
+                        "fmla z27.s, z11.s, z6.s[2]\n"
+                        "fmla z16.s, z12.s, z4.s[3]\n"
+                        "fmla z20.s, z12.s, z5.s[3]\n"
+                        "fmla z24.s, z12.s, z6.s[3]\n"
+                        "fmla z17.s, z13.s, z4.s[3]\n"
+                        "fmla z21.s, z13.s, z5.s[3]\n"
+                        "fmla z25.s, z13.s, z6.s[3]\n"
+                        "fmla z18.s, z14.s, z4.s[3]\n"
+                        "fmla z22.s, z14.s, z5.s[3]\n"
+                        "fmla z26.s, z14.s, z6.s[3]\n"
+                        "fmla z19.s, z15.s, z4.s[3]\n"
+                        "fmla z23.s, z15.s, z5.s[3]\n"
+                        "fmla z27.s, z15.s, z6.s[3]\n"
+                        "cbz %[blocks], 6f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z0.s[0]\n"
+                        "fmla z20.s, z8.s, z1.s[0]\n"
+                        "fmla z24.s, z8.s, z2.s[0]\n"
+                        "fmla z17.s, z9.s, z0.s[0]\n"
+                        "fmla z21.s, z9.s, z1.s[0]\n"
+                        "fmla z25.s, z9.s, z2.s[0]\n"
+                        "fmla z18.s, z10.s, z0.s[0]\n"
+                        "fmla z22.s, z10.s, z1.s[0]\n"
+                        "fmla z26.s, z10.s, z2.s[0]\n"
+                        "fmla z19.s, z11.s, z0.s[0]\n"
+                        "fmla z23.s, z11.s, z1.s[0]\n"
+                        "fmla z27.s, z11.s, z2.s[0]\n"
+                        "b.eq 6f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[1]\n"
+                        "fmla z20.s, z12.s, z1.s[1]\n"
+                        "fmla z24.s, z12.s, z2.s[1]\n"
+                        "fmla z17.s, z13.s, z0.s[1]\n"
+                        "fmla z21.s, z13.s, z1.s[1]\n"
+                        "fmla z25.s, z13.s, z2.s[1]\n"
+                        "fmla z18.s, z14.s, z0.s[1]\n"
+                        "fmla z22.s, z14.s, z1.s[1]\n"
+                        "fmla z26.s, z14.s, z2.s[1]\n"
+                        "fmla z19.s, z15.s, z0.s[1]\n"
+                        "fmla z23.s, z15.s, z1.s[1]\n"
+                        "fmla z27.s, z15.s, z2.s[1]\n"
+                        "b.eq 6f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z0.s[2]\n"
+                        "fmla z20.s, z8.s, z1.s[2]\n"
+                        "fmla z24.s, z8.s, z2.s[2]\n"
+                        "fmla z17.s, z9.s, z0.s[2]\n"
+                        "fmla z21.s, z9.s, z1.s[2]\n"
+                        "fmla z25.s, z9.s, z2.s[2]\n"
+                        "fmla z18.s, z10.s, z0.s[2]\n"
+                        "fmla z22.s, z10.s, z1.s[2]\n"
+                        "fmla z26.s, z10.s, z2.s[2]\n"
+                        "fmla z19.s, z11.s, z0.s[2]\n"
+                        "fmla z23.s, z11.s, z1.s[2]\n"
+                        "fmla z27.s, z11.s, z2.s[2]\n"
+                        "b 6f\n"
+                        "5:\n"
+                        "fmla z16.s, z8.s, z0.s[0]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.s, z8.s, z1.s[0]\n"
+                        "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+                        "fmla z24.s, z8.s, z2.s[0]\n"
+                        "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+                        "fmla z17.s, z9.s, z0.s[0]\n"
+                        "ld1rqw z6.s, p6/z, [a_ptr2]\n"
+                        "fmla z21.s, z9.s, z1.s[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z25.s, z9.s, z2.s[0]\n"
+                        "fmla z18.s, z10.s, z0.s[0]\n"
+                        "fmla z22.s, z10.s, z1.s[0]\n"
+                        "fmla z26.s, z10.s, z2.s[0]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z19.s, z11.s, z0.s[0]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z23.s, z11.s, z1.s[0]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z27.s, z11.s, z2.s[0]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.s, z12.s, z1.s[1]\n"
+                        "fmla z24.s, z12.s, z2.s[1]\n"
+                        "fmla z17.s, z13.s, z0.s[1]\n"
+                        "fmla z21.s, z13.s, z1.s[1]\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z25.s, z13.s, z2.s[1]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z14.s, z0.s[1]\n"
+                        "fmla z22.s, z14.s, z1.s[1]\n"
+                        "fmla z26.s, z14.s, z2.s[1]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z15.s, z0.s[1]\n"
+                        "fmla z23.s, z15.s, z1.s[1]\n"
+                        "fmla z27.s, z15.s, z2.s[1]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z0.s[2]\n"
+                        "fmla z20.s, z8.s, z1.s[2]\n"
+                        "fmla z24.s, z8.s, z2.s[2]\n"
+                        "fmla z17.s, z9.s, z0.s[2]\n"
+                        "fmla z21.s, z9.s, z1.s[2]\n"
+                        "fmla z25.s, z9.s, z2.s[2]\n"
+                        "fmla z18.s, z10.s, z0.s[2]\n"
+                        "fmla z22.s, z10.s, z1.s[2]\n"
+                        "fmla z26.s, z10.s, z2.s[2]\n"
+                        "fmla z19.s, z11.s, z0.s[2]\n"
+                        "fmla z23.s, z11.s, z1.s[2]\n"
+                        "fmla z27.s, z11.s, z2.s[2]\n"
+                        "fmla z16.s, z12.s, z0.s[3]\n"
+                        "fmla z20.s, z12.s, z1.s[3]\n"
+                        "fmla z24.s, z12.s, z2.s[3]\n"
+                        "fmla z17.s, z13.s, z0.s[3]\n"
+                        "fmla z21.s, z13.s, z1.s[3]\n"
+                        "fmla z25.s, z13.s, z2.s[3]\n"
+                        "fmla z18.s, z14.s, z0.s[3]\n"
+                        "fmla z22.s, z14.s, z1.s[3]\n"
+                        "fmla z26.s, z14.s, z2.s[3]\n"
+                        "fmla z19.s, z15.s, z0.s[3]\n"
+                        "fmla z23.s, z15.s, z1.s[3]\n"
+                        "fmla z27.s, z15.s, z2.s[3]\n"
+                        "cbz %[blocks], 6f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[0]\n"
+                        "fmla z20.s, z8.s, z5.s[0]\n"
+                        "fmla z24.s, z8.s, z6.s[0]\n"
+                        "fmla z17.s, z9.s, z4.s[0]\n"
+                        "fmla z21.s, z9.s, z5.s[0]\n"
+                        "fmla z25.s, z9.s, z6.s[0]\n"
+                        "fmla z18.s, z10.s, z4.s[0]\n"
+                        "fmla z22.s, z10.s, z5.s[0]\n"
+                        "fmla z26.s, z10.s, z6.s[0]\n"
+                        "fmla z19.s, z11.s, z4.s[0]\n"
+                        "fmla z23.s, z11.s, z5.s[0]\n"
+                        "fmla z27.s, z11.s, z6.s[0]\n"
+                        "b.eq 6f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z4.s[1]\n"
+                        "fmla z20.s, z12.s, z5.s[1]\n"
+                        "fmla z24.s, z12.s, z6.s[1]\n"
+                        "fmla z17.s, z13.s, z4.s[1]\n"
+                        "fmla z21.s, z13.s, z5.s[1]\n"
+                        "fmla z25.s, z13.s, z6.s[1]\n"
+                        "fmla z18.s, z14.s, z4.s[1]\n"
+                        "fmla z22.s, z14.s, z5.s[1]\n"
+                        "fmla z26.s, z14.s, z6.s[1]\n"
+                        "fmla z19.s, z15.s, z4.s[1]\n"
+                        "fmla z23.s, z15.s, z5.s[1]\n"
+                        "fmla z27.s, z15.s, z6.s[1]\n"
+                        "b.eq 6f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[2]\n"
+                        "fmla z20.s, z8.s, z5.s[2]\n"
+                        "fmla z24.s, z8.s, z6.s[2]\n"
+                        "fmla z17.s, z9.s, z4.s[2]\n"
+                        "fmla z21.s, z9.s, z5.s[2]\n"
+                        "fmla z25.s, z9.s, z6.s[2]\n"
+                        "fmla z18.s, z10.s, z4.s[2]\n"
+                        "fmla z22.s, z10.s, z5.s[2]\n"
+                        "fmla z26.s, z10.s, z6.s[2]\n"
+                        "fmla z19.s, z11.s, z4.s[2]\n"
+                        "fmla z23.s, z11.s, z5.s[2]\n"
+                        "fmla z27.s, z11.s, z6.s[2]\n"
+                        "6:\n"
+                        "st1w z16.s, p0, [%[c_ptr0]]\n"
+                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        "st1w z20.s, p0, [c_ptr1]\n"
+                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+                        "st1w z24.s, p0, [c_ptr2]\n"
+                        "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+                        "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+                        "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
+                    );
+                    break;
+                default:
+                case 4:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "a_ptr3 .req X2\n"
+                        "c_ptr1 .req X3\n"
+                        "c_ptr2 .req X4\n"
+                        "c_ptr3 .req X5\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "whilelt p6.s, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "ptrue p7.s\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "add a_ptr3, a_ptr2, %[lda]\n"
+                        "add c_ptr3, c_ptr2, %[ldc]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p2.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p3.s, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.s, #0\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.s, #0\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                        "mov z18.s, #0\n"
+                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                        "mov z19.s, #0\n"
+                        "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                        "mov z20.s, #0\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "mov z21.s, #0\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z22.s, #0\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "mov z23.s, #0\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z24.s, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "mov z25.s, #0\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "mov z26.s, #0\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "mov z27.s, #0\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "mov z28.s, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z29.s, #0\n"
+                        "mov z30.s, #0\n"
+                        "mov z31.s, #0\n"
+                        "b 2f\n"
+                        "1:\n"
+                        "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "ld1w z20.s, p0/z, [c_ptr1]\n"
+                        "fmul z16.s, p7/m, z16.s, z15.s\n"
+                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "fmul z17.s, p7/m, z17.s, z15.s\n"
+                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "fmul z18.s, p7/m, z18.s, z15.s\n"
+                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "fmul z19.s, p7/m, z19.s, z15.s\n"
+                        "ld1w z24.s, p0/z, [c_ptr2]\n"
+                        "fmul z20.s, p7/m, z20.s, z15.s\n"
+                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "fmul z21.s, p7/m, z21.s, z15.s\n"
+                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+                        "fmul z22.s, p7/m, z22.s, z15.s\n"
+                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+                        "fmul z23.s, p7/m, z23.s, z15.s\n"
+                        "ld1w z28.s, p0/z, [c_ptr3]\n"
+                        "fmul z24.s, p7/m, z24.s, z15.s\n"
+                        "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+                        "fmul z25.s, p7/m, z25.s, z15.s\n"
+                        "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
+                        "fmul z26.s, p7/m, z26.s, z15.s\n"
+                        "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
+                        "fmul z27.s, p7/m, z27.s, z15.s\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                        "fmul z28.s, p7/m, z28.s, z15.s\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                        "fmul z29.s, p7/m, z29.s, z15.s\n"
+                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                        "fmul z30.s, p7/m, z30.s, z15.s\n"
+                        "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                        "fmul z31.s, p7/m, z31.s, z15.s\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "2:\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "cbz %[loops], 3f\n"
+                        "4:\n"
+                        "fmla z16.s, z8.s, z0.s[0]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.s, z8.s, z1.s[0]\n"
+                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+                        "fmla z24.s, z8.s, z2.s[0]\n"
+                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+                        "fmla z28.s, z8.s, z3.s[0]\n"
+                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+                        "fmla z17.s, z9.s, z0.s[0]\n"
+                        "ld1rqw z7.s, p7/z, [a_ptr3]\n"
+                        "fmla z21.s, z9.s, z1.s[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z25.s, z9.s, z2.s[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla z29.s, z9.s, z3.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "fmla z18.s, z10.s, z0.s[0]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z22.s, z10.s, z1.s[0]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z26.s, z10.s, z2.s[0]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "fmla z30.s, z10.s, z3.s[0]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z11.s, z0.s[0]\n"
+                        "add a_ptr3, a_ptr3, #0x20\n"
+                        "fmla z23.s, z11.s, z1.s[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla z27.s, z11.s, z2.s[0]\n"
+                        "fmla z31.s, z11.s, z3.s[0]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.s, z12.s, z1.s[1]\n"
+                        "fmla z24.s, z12.s, z2.s[1]\n"
+                        "fmla z28.s, z12.s, z3.s[1]\n"
+                        "fmla z17.s, z13.s, z0.s[1]\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z21.s, z13.s, z1.s[1]\n"
+                        "fmla z25.s, z13.s, z2.s[1]\n"
+                        "fmla z29.s, z13.s, z3.s[1]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z14.s, z0.s[1]\n"
+                        "fmla z22.s, z14.s, z1.s[1]\n"
+                        "fmla z26.s, z14.s, z2.s[1]\n"
+                        "fmla z30.s, z14.s, z3.s[1]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z15.s, z0.s[1]\n"
+                        "fmla z23.s, z15.s, z1.s[1]\n"
+                        "fmla z27.s, z15.s, z2.s[1]\n"
+                        "fmla z31.s, z15.s, z3.s[1]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.s, z8.s, z1.s[2]\n"
+                        "fmla z24.s, z8.s, z2.s[2]\n"
+                        "fmla z28.s, z8.s, z3.s[2]\n"
+                        "fmla z17.s, z9.s, z0.s[2]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z21.s, z9.s, z1.s[2]\n"
+                        "fmla z25.s, z9.s, z2.s[2]\n"
+                        "fmla z29.s, z9.s, z3.s[2]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z10.s, z0.s[2]\n"
+                        "fmla z22.s, z10.s, z1.s[2]\n"
+                        "fmla z26.s, z10.s, z2.s[2]\n"
+                        "fmla z30.s, z10.s, z3.s[2]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z11.s, z0.s[2]\n"
+                        "fmla z23.s, z11.s, z1.s[2]\n"
+                        "fmla z27.s, z11.s, z2.s[2]\n"
+                        "fmla z31.s, z11.s, z3.s[2]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.s, z12.s, z1.s[3]\n"
+                        "fmla z24.s, z12.s, z2.s[3]\n"
+                        "fmla z28.s, z12.s, z3.s[3]\n"
+                        "fmla z17.s, z13.s, z0.s[3]\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z21.s, z13.s, z1.s[3]\n"
+                        "fmla z25.s, z13.s, z2.s[3]\n"
+                        "fmla z29.s, z13.s, z3.s[3]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z14.s, z0.s[3]\n"
+                        "fmla z22.s, z14.s, z1.s[3]\n"
+                        "fmla z26.s, z14.s, z2.s[3]\n"
+                        "fmla z30.s, z14.s, z3.s[3]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z15.s, z0.s[3]\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "fmla z23.s, z15.s, z1.s[3]\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+                        "fmla z27.s, z15.s, z2.s[3]\n"
+                        "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
+                        "fmla z31.s, z15.s, z3.s[3]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[0]\n"
+                        "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n"
+                        "fmla z20.s, z8.s, z5.s[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z24.s, z8.s, z6.s[0]\n"
+                        "fmla z28.s, z8.s, z7.s[0]\n"
+                        "fmla z17.s, z9.s, z4.s[0]\n"
+                        "fmla z21.s, z9.s, z5.s[0]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z25.s, z9.s, z6.s[0]\n"
+                        "fmla z29.s, z9.s, z7.s[0]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z10.s, z4.s[0]\n"
+                        "fmla z22.s, z10.s, z5.s[0]\n"
+                        "fmla z26.s, z10.s, z6.s[0]\n"
+                        "fmla z30.s, z10.s, z7.s[0]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z11.s, z4.s[0]\n"
+                        "fmla z23.s, z11.s, z5.s[0]\n"
+                        "fmla z27.s, z11.s, z6.s[0]\n"
+                        "fmla z31.s, z11.s, z7.s[0]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z4.s[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.s, z12.s, z5.s[1]\n"
+                        "fmla z24.s, z12.s, z6.s[1]\n"
+                        "fmla z28.s, z12.s, z7.s[1]\n"
+                        "fmla z17.s, z13.s, z4.s[1]\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z21.s, z13.s, z5.s[1]\n"
+                        "fmla z25.s, z13.s, z6.s[1]\n"
+                        "fmla z29.s, z13.s, z7.s[1]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z14.s, z4.s[1]\n"
+                        "fmla z22.s, z14.s, z5.s[1]\n"
+                        "fmla z26.s, z14.s, z6.s[1]\n"
+                        "fmla z30.s, z14.s, z7.s[1]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z15.s, z4.s[1]\n"
+                        "fmla z23.s, z15.s, z5.s[1]\n"
+                        "fmla z27.s, z15.s, z6.s[1]\n"
+                        "fmla z31.s, z15.s, z7.s[1]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.s, z8.s, z5.s[2]\n"
+                        "fmla z24.s, z8.s, z6.s[2]\n"
+                        "fmla z28.s, z8.s, z7.s[2]\n"
+                        "fmla z17.s, z9.s, z4.s[2]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z21.s, z9.s, z5.s[2]\n"
+                        "fmla z25.s, z9.s, z6.s[2]\n"
+                        "fmla z29.s, z9.s, z7.s[2]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z10.s, z4.s[2]\n"
+                        "fmla z22.s, z10.s, z5.s[2]\n"
+                        "fmla z26.s, z10.s, z6.s[2]\n"
+                        "fmla z30.s, z10.s, z7.s[2]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z11.s, z4.s[2]\n"
+                        "fmla z23.s, z11.s, z5.s[2]\n"
+                        "fmla z27.s, z11.s, z6.s[2]\n"
+                        "fmla z31.s, z11.s, z7.s[2]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z4.s[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.s, z12.s, z5.s[3]\n"
+                        "fmla z24.s, z12.s, z6.s[3]\n"
+                        "fmla z28.s, z12.s, z7.s[3]\n"
+                        "fmla z17.s, z13.s, z4.s[3]\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z21.s, z13.s, z5.s[3]\n"
+                        "fmla z25.s, z13.s, z6.s[3]\n"
+                        "fmla z29.s, z13.s, z7.s[3]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z14.s, z4.s[3]\n"
+                        "fmla z22.s, z14.s, z5.s[3]\n"
+                        "fmla z26.s, z14.s, z6.s[3]\n"
+                        "fmla z30.s, z14.s, z7.s[3]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z15.s, z4.s[3]\n"
+                        "fmla z23.s, z15.s, z5.s[3]\n"
+                        "fmla z27.s, z15.s, z6.s[3]\n"
+                        "fmla z31.s, z15.s, z7.s[3]\n"
+                        "b.ne 4b\n"
+                        "3:\n"
+                        "cbz %[regs], 5f\n"
+                        "fmla z16.s, z8.s, z0.s[0]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.s, z8.s, z1.s[0]\n"
+                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+                        "fmla z24.s, z8.s, z2.s[0]\n"
+                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+                        "fmla z28.s, z8.s, z3.s[0]\n"
+                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+                        "fmla z17.s, z9.s, z0.s[0]\n"
+                        "ld1rqw z7.s, p7/z, [a_ptr3]\n"
+                        "fmla z21.s, z9.s, z1.s[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z25.s, z9.s, z2.s[0]\n"
+                        "fmla z29.s, z9.s, z3.s[0]\n"
+                        "fmla z18.s, z10.s, z0.s[0]\n"
+                        "fmla z22.s, z10.s, z1.s[0]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z26.s, z10.s, z2.s[0]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z30.s, z10.s, z3.s[0]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z11.s, z0.s[0]\n"
+                        "fmla z23.s, z11.s, z1.s[0]\n"
+                        "fmla z27.s, z11.s, z2.s[0]\n"
+                        "fmla z31.s, z11.s, z3.s[0]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.s, z12.s, z1.s[1]\n"
+                        "fmla z24.s, z12.s, z2.s[1]\n"
+                        "fmla z28.s, z12.s, z3.s[1]\n"
+                        "fmla z17.s, z13.s, z0.s[1]\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z21.s, z13.s, z1.s[1]\n"
+                        "fmla z25.s, z13.s, z2.s[1]\n"
+                        "fmla z29.s, z13.s, z3.s[1]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z14.s, z0.s[1]\n"
+                        "fmla z22.s, z14.s, z1.s[1]\n"
+                        "fmla z26.s, z14.s, z2.s[1]\n"
+                        "fmla z30.s, z14.s, z3.s[1]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z15.s, z0.s[1]\n"
+                        "fmla z23.s, z15.s, z1.s[1]\n"
+                        "fmla z27.s, z15.s, z2.s[1]\n"
+                        "fmla z31.s, z15.s, z3.s[1]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.s, z8.s, z1.s[2]\n"
+                        "fmla z24.s, z8.s, z2.s[2]\n"
+                        "fmla z28.s, z8.s, z3.s[2]\n"
+                        "fmla z17.s, z9.s, z0.s[2]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z21.s, z9.s, z1.s[2]\n"
+                        "fmla z25.s, z9.s, z2.s[2]\n"
+                        "fmla z29.s, z9.s, z3.s[2]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z10.s, z0.s[2]\n"
+                        "fmla z22.s, z10.s, z1.s[2]\n"
+                        "fmla z26.s, z10.s, z2.s[2]\n"
+                        "fmla z30.s, z10.s, z3.s[2]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z11.s, z0.s[2]\n"
+                        "fmla z23.s, z11.s, z1.s[2]\n"
+                        "fmla z27.s, z11.s, z2.s[2]\n"
+                        "fmla z31.s, z11.s, z3.s[2]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.s, z12.s, z1.s[3]\n"
+                        "fmla z24.s, z12.s, z2.s[3]\n"
+                        "fmla z28.s, z12.s, z3.s[3]\n"
+                        "fmla z17.s, z13.s, z0.s[3]\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z21.s, z13.s, z1.s[3]\n"
+                        "fmla z25.s, z13.s, z2.s[3]\n"
+                        "fmla z29.s, z13.s, z3.s[3]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z14.s, z0.s[3]\n"
+                        "fmla z22.s, z14.s, z1.s[3]\n"
+                        "fmla z26.s, z14.s, z2.s[3]\n"
+                        "fmla z30.s, z14.s, z3.s[3]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z15.s, z0.s[3]\n"
+                        "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+                        "fmla z23.s, z15.s, z1.s[3]\n"
+                        "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+                        "fmla z27.s, z15.s, z2.s[3]\n"
+                        "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+                        "fmla z31.s, z15.s, z3.s[3]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[0]\n"
+                        "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+                        "fmla z20.s, z8.s, z5.s[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z24.s, z8.s, z6.s[0]\n"
+                        "fmla z28.s, z8.s, z7.s[0]\n"
+                        "fmla z17.s, z9.s, z4.s[0]\n"
+                        "fmla z21.s, z9.s, z5.s[0]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z25.s, z9.s, z6.s[0]\n"
+                        "fmla z29.s, z9.s, z7.s[0]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z10.s, z4.s[0]\n"
+                        "fmla z22.s, z10.s, z5.s[0]\n"
+                        "fmla z26.s, z10.s, z6.s[0]\n"
+                        "fmla z30.s, z10.s, z7.s[0]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z11.s, z4.s[0]\n"
+                        "fmla z23.s, z11.s, z5.s[0]\n"
+                        "fmla z27.s, z11.s, z6.s[0]\n"
+                        "fmla z31.s, z11.s, z7.s[0]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z4.s[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.s, z12.s, z5.s[1]\n"
+                        "fmla z24.s, z12.s, z6.s[1]\n"
+                        "fmla z28.s, z12.s, z7.s[1]\n"
+                        "fmla z17.s, z13.s, z4.s[1]\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z21.s, z13.s, z5.s[1]\n"
+                        "fmla z25.s, z13.s, z6.s[1]\n"
+                        "fmla z29.s, z13.s, z7.s[1]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z14.s, z4.s[1]\n"
+                        "fmla z22.s, z14.s, z5.s[1]\n"
+                        "fmla z26.s, z14.s, z6.s[1]\n"
+                        "fmla z30.s, z14.s, z7.s[1]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z15.s, z4.s[1]\n"
+                        "fmla z23.s, z15.s, z5.s[1]\n"
+                        "fmla z27.s, z15.s, z6.s[1]\n"
+                        "fmla z31.s, z15.s, z7.s[1]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[2]\n"
+                        "fmla z20.s, z8.s, z5.s[2]\n"
+                        "fmla z24.s, z8.s, z6.s[2]\n"
+                        "fmla z28.s, z8.s, z7.s[2]\n"
+                        "fmla z17.s, z9.s, z4.s[2]\n"
+                        "fmla z21.s, z9.s, z5.s[2]\n"
+                        "fmla z25.s, z9.s, z6.s[2]\n"
+                        "fmla z29.s, z9.s, z7.s[2]\n"
+                        "fmla z18.s, z10.s, z4.s[2]\n"
+                        "fmla z22.s, z10.s, z5.s[2]\n"
+                        "fmla z26.s, z10.s, z6.s[2]\n"
+                        "fmla z30.s, z10.s, z7.s[2]\n"
+                        "fmla z19.s, z11.s, z4.s[2]\n"
+                        "fmla z23.s, z11.s, z5.s[2]\n"
+                        "fmla z27.s, z11.s, z6.s[2]\n"
+                        "fmla z31.s, z11.s, z7.s[2]\n"
+                        "fmla z16.s, z12.s, z4.s[3]\n"
+                        "fmla z20.s, z12.s, z5.s[3]\n"
+                        "fmla z24.s, z12.s, z6.s[3]\n"
+                        "fmla z28.s, z12.s, z7.s[3]\n"
+                        "fmla z17.s, z13.s, z4.s[3]\n"
+                        "fmla z21.s, z13.s, z5.s[3]\n"
+                        "fmla z25.s, z13.s, z6.s[3]\n"
+                        "fmla z29.s, z13.s, z7.s[3]\n"
+                        "fmla z18.s, z14.s, z4.s[3]\n"
+                        "fmla z22.s, z14.s, z5.s[3]\n"
+                        "fmla z26.s, z14.s, z6.s[3]\n"
+                        "fmla z30.s, z14.s, z7.s[3]\n"
+                        "fmla z19.s, z15.s, z4.s[3]\n"
+                        "fmla z23.s, z15.s, z5.s[3]\n"
+                        "fmla z27.s, z15.s, z6.s[3]\n"
+                        "fmla z31.s, z15.s, z7.s[3]\n"
+                        "cbz %[blocks], 6f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z0.s[0]\n"
+                        "fmla z20.s, z8.s, z1.s[0]\n"
+                        "fmla z24.s, z8.s, z2.s[0]\n"
+                        "fmla z28.s, z8.s, z3.s[0]\n"
+                        "fmla z17.s, z9.s, z0.s[0]\n"
+                        "fmla z21.s, z9.s, z1.s[0]\n"
+                        "fmla z25.s, z9.s, z2.s[0]\n"
+                        "fmla z29.s, z9.s, z3.s[0]\n"
+                        "fmla z18.s, z10.s, z0.s[0]\n"
+                        "fmla z22.s, z10.s, z1.s[0]\n"
+                        "fmla z26.s, z10.s, z2.s[0]\n"
+                        "fmla z30.s, z10.s, z3.s[0]\n"
+                        "fmla z19.s, z11.s, z0.s[0]\n"
+                        "fmla z23.s, z11.s, z1.s[0]\n"
+                        "fmla z27.s, z11.s, z2.s[0]\n"
+                        "fmla z31.s, z11.s, z3.s[0]\n"
+                        "b.eq 6f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[1]\n"
+                        "fmla z20.s, z12.s, z1.s[1]\n"
+                        "fmla z24.s, z12.s, z2.s[1]\n"
+                        "fmla z28.s, z12.s, z3.s[1]\n"
+                        "fmla z17.s, z13.s, z0.s[1]\n"
+                        "fmla z21.s, z13.s, z1.s[1]\n"
+                        "fmla z25.s, z13.s, z2.s[1]\n"
+                        "fmla z29.s, z13.s, z3.s[1]\n"
+                        "fmla z18.s, z14.s, z0.s[1]\n"
+                        "fmla z22.s, z14.s, z1.s[1]\n"
+                        "fmla z26.s, z14.s, z2.s[1]\n"
+                        "fmla z30.s, z14.s, z3.s[1]\n"
+                        "fmla z19.s, z15.s, z0.s[1]\n"
+                        "fmla z23.s, z15.s, z1.s[1]\n"
+                        "fmla z27.s, z15.s, z2.s[1]\n"
+                        "fmla z31.s, z15.s, z3.s[1]\n"
+                        "b.eq 6f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z0.s[2]\n"
+                        "fmla z20.s, z8.s, z1.s[2]\n"
+                        "fmla z24.s, z8.s, z2.s[2]\n"
+                        "fmla z28.s, z8.s, z3.s[2]\n"
+                        "fmla z17.s, z9.s, z0.s[2]\n"
+                        "fmla z21.s, z9.s, z1.s[2]\n"
+                        "fmla z25.s, z9.s, z2.s[2]\n"
+                        "fmla z29.s, z9.s, z3.s[2]\n"
+                        "fmla z18.s, z10.s, z0.s[2]\n"
+                        "fmla z22.s, z10.s, z1.s[2]\n"
+                        "fmla z26.s, z10.s, z2.s[2]\n"
+                        "fmla z30.s, z10.s, z3.s[2]\n"
+                        "fmla z19.s, z11.s, z0.s[2]\n"
+                        "fmla z23.s, z11.s, z1.s[2]\n"
+                        "fmla z27.s, z11.s, z2.s[2]\n"
+                        "fmla z31.s, z11.s, z3.s[2]\n"
+                        "b 6f\n"
+                        "5:\n"
+                        "fmla z16.s, z8.s, z0.s[0]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.s, z8.s, z1.s[0]\n"
+                        "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+                        "fmla z24.s, z8.s, z2.s[0]\n"
+                        "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+                        "fmla z28.s, z8.s, z3.s[0]\n"
+                        "ld1rqw z6.s, p6/z, [a_ptr2]\n"
+                        "fmla z17.s, z9.s, z0.s[0]\n"
+                        "ld1rqw z7.s, p6/z, [a_ptr3]\n"
+                        "fmla z21.s, z9.s, z1.s[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z25.s, z9.s, z2.s[0]\n"
+                        "fmla z29.s, z9.s, z3.s[0]\n"
+                        "fmla z18.s, z10.s, z0.s[0]\n"
+                        "fmla z22.s, z10.s, z1.s[0]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z26.s, z10.s, z2.s[0]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z30.s, z10.s, z3.s[0]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z11.s, z0.s[0]\n"
+                        "fmla z23.s, z11.s, z1.s[0]\n"
+                        "fmla z27.s, z11.s, z2.s[0]\n"
+                        "fmla z31.s, z11.s, z3.s[0]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z0.s[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.s, z12.s, z1.s[1]\n"
+                        "fmla z24.s, z12.s, z2.s[1]\n"
+                        "fmla z28.s, z12.s, z3.s[1]\n"
+                        "fmla z17.s, z13.s, z0.s[1]\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "fmla z21.s, z13.s, z1.s[1]\n"
+                        "fmla z25.s, z13.s, z2.s[1]\n"
+                        "fmla z29.s, z13.s, z3.s[1]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.s, z14.s, z0.s[1]\n"
+                        "fmla z22.s, z14.s, z1.s[1]\n"
+                        "fmla z26.s, z14.s, z2.s[1]\n"
+                        "fmla z30.s, z14.s, z3.s[1]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.s, z15.s, z0.s[1]\n"
+                        "fmla z23.s, z15.s, z1.s[1]\n"
+                        "fmla z27.s, z15.s, z2.s[1]\n"
+                        "fmla z31.s, z15.s, z3.s[1]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z0.s[2]\n"
+                        "fmla z20.s, z8.s, z1.s[2]\n"
+                        "fmla z24.s, z8.s, z2.s[2]\n"
+                        "fmla z28.s, z8.s, z3.s[2]\n"
+                        "fmla z17.s, z9.s, z0.s[2]\n"
+                        "fmla z21.s, z9.s, z1.s[2]\n"
+                        "fmla z25.s, z9.s, z2.s[2]\n"
+                        "fmla z29.s, z9.s, z3.s[2]\n"
+                        "fmla z18.s, z10.s, z0.s[2]\n"
+                        "fmla z22.s, z10.s, z1.s[2]\n"
+                        "fmla z26.s, z10.s, z2.s[2]\n"
+                        "fmla z30.s, z10.s, z3.s[2]\n"
+                        "fmla z19.s, z11.s, z0.s[2]\n"
+                        "fmla z23.s, z11.s, z1.s[2]\n"
+                        "fmla z27.s, z11.s, z2.s[2]\n"
+                        "fmla z31.s, z11.s, z3.s[2]\n"
+                        "fmla z16.s, z12.s, z0.s[3]\n"
+                        "fmla z20.s, z12.s, z1.s[3]\n"
+                        "fmla z24.s, z12.s, z2.s[3]\n"
+                        "fmla z28.s, z12.s, z3.s[3]\n"
+                        "fmla z17.s, z13.s, z0.s[3]\n"
+                        "fmla z21.s, z13.s, z1.s[3]\n"
+                        "fmla z25.s, z13.s, z2.s[3]\n"
+                        "fmla z29.s, z13.s, z3.s[3]\n"
+                        "fmla z18.s, z14.s, z0.s[3]\n"
+                        "fmla z22.s, z14.s, z1.s[3]\n"
+                        "fmla z26.s, z14.s, z2.s[3]\n"
+                        "fmla z30.s, z14.s, z3.s[3]\n"
+                        "fmla z19.s, z15.s, z0.s[3]\n"
+                        "fmla z23.s, z15.s, z1.s[3]\n"
+                        "fmla z27.s, z15.s, z2.s[3]\n"
+                        "fmla z31.s, z15.s, z3.s[3]\n"
+                        "cbz %[blocks], 6f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[0]\n"
+                        "fmla z20.s, z8.s, z5.s[0]\n"
+                        "fmla z24.s, z8.s, z6.s[0]\n"
+                        "fmla z28.s, z8.s, z7.s[0]\n"
+                        "fmla z17.s, z9.s, z4.s[0]\n"
+                        "fmla z21.s, z9.s, z5.s[0]\n"
+                        "fmla z25.s, z9.s, z6.s[0]\n"
+                        "fmla z29.s, z9.s, z7.s[0]\n"
+                        "fmla z18.s, z10.s, z4.s[0]\n"
+                        "fmla z22.s, z10.s, z5.s[0]\n"
+                        "fmla z26.s, z10.s, z6.s[0]\n"
+                        "fmla z30.s, z10.s, z7.s[0]\n"
+                        "fmla z19.s, z11.s, z4.s[0]\n"
+                        "fmla z23.s, z11.s, z5.s[0]\n"
+                        "fmla z27.s, z11.s, z6.s[0]\n"
+                        "fmla z31.s, z11.s, z7.s[0]\n"
+                        "b.eq 6f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z12.s, z4.s[1]\n"
+                        "fmla z20.s, z12.s, z5.s[1]\n"
+                        "fmla z24.s, z12.s, z6.s[1]\n"
+                        "fmla z28.s, z12.s, z7.s[1]\n"
+                        "fmla z17.s, z13.s, z4.s[1]\n"
+                        "fmla z21.s, z13.s, z5.s[1]\n"
+                        "fmla z25.s, z13.s, z6.s[1]\n"
+                        "fmla z29.s, z13.s, z7.s[1]\n"
+                        "fmla z18.s, z14.s, z4.s[1]\n"
+                        "fmla z22.s, z14.s, z5.s[1]\n"
+                        "fmla z26.s, z14.s, z6.s[1]\n"
+                        "fmla z30.s, z14.s, z7.s[1]\n"
+                        "fmla z19.s, z15.s, z4.s[1]\n"
+                        "fmla z23.s, z15.s, z5.s[1]\n"
+                        "fmla z27.s, z15.s, z6.s[1]\n"
+                        "fmla z31.s, z15.s, z7.s[1]\n"
+                        "b.eq 6f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.s, z8.s, z4.s[2]\n"
+                        "fmla z20.s, z8.s, z5.s[2]\n"
+                        "fmla z24.s, z8.s, z6.s[2]\n"
+                        "fmla z28.s, z8.s, z7.s[2]\n"
+                        "fmla z17.s, z9.s, z4.s[2]\n"
+                        "fmla z21.s, z9.s, z5.s[2]\n"
+                        "fmla z25.s, z9.s, z6.s[2]\n"
+                        "fmla z29.s, z9.s, z7.s[2]\n"
+                        "fmla z18.s, z10.s, z4.s[2]\n"
+                        "fmla z22.s, z10.s, z5.s[2]\n"
+                        "fmla z26.s, z10.s, z6.s[2]\n"
+                        "fmla z30.s, z10.s, z7.s[2]\n"
+                        "fmla z19.s, z11.s, z4.s[2]\n"
+                        "fmla z23.s, z11.s, z5.s[2]\n"
+                        "fmla z27.s, z11.s, z6.s[2]\n"
+                        "fmla z31.s, z11.s, z7.s[2]\n"
+                        "6:\n"
+                        "st1w z16.s, p0, [%[c_ptr0]]\n"
+                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        "st1w z20.s, p0, [c_ptr1]\n"
+                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+                        "st1w z24.s, p0, [c_ptr2]\n"
+                        "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+                        "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+                        "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+                        "st1w z28.s, p0, [c_ptr3]\n"
+                        "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
+                        "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
+                        "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq a_ptr3\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        ".unreq c_ptr3\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+                    );
+                    break;
+            }
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
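
The row-count variants above all perform the same tile update: C = beta*C + A*B, with up to four rows of A broadcast element by element against four vector registers of B read natively at stride ldb, and beta == 0 selecting zeroed accumulators instead of loading and scaling C. A minimal scalar sketch of that computation, assuming row-major operands and an illustrative function name (neither is fixed by this patch):

    // Scalar equivalent of the SVE fmla tile update above (illustrative sketch only).
    // Each asm variant produces up to 4 rows x 4*VL columns of this result per tile.
    static void reference_fp32_gemm(const float *A, int lda, const float *B, int ldb,
                                    float *C, int ldc, float beta, int M, int N, int K)
    {
        for (int y = 0; y < M; y++) {
            for (int x = 0; x < N; x++) {
                // beta == 0 path: start from zero instead of scaling the existing C value.
                float acc = (beta == 0.0f) ? 0.0f : beta * C[y * ldc + x];
                for (int k = 0; k < K; k++) {
                    acc += A[y * lda + k] * B[k * ldb + x]; // fmla zACC.s, zB.s, zA.s[k % 4]
                }
                C[y * ldc + x] = acc;
            }
        }
    }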
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp
new file mode 100644
index 0000000..8b98358
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+#include <cstdint>
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void sve_native_s8s32_dot_4VLx4(const int8_t *, int, const int8_t *, int ldb, int32_t *, int, int32_t, int, int, int);
+
+class native_s8s32_dot_4VLx4
+{
+public:
+    typedef int8_t operand_type;
+    typedef int32_t result_type;
+
+    typedef void (*kern_type)(const int8_t *, int, const int8_t *, int ldb, int32_t *, int, int32_t, int, int, int);
+
+    /* Kernel blocking parameters */
+    static unsigned int out_height()
+    {
+        return 4;
+    }
+
+    static unsigned int out_width()
+    {
+        return get_vector_length<int32_t>() * 4;
+    }
+
+    static unsigned int k_unroll()
+    {
+        return 4;
+    }
+
+
+
+    // Default to the generic kernel
+    kern_type kernel=sve_native_s8s32_dot_4VLx4;
+
+    native_s8s32_dot_4VLx4(const CPUInfo *ci)
+    {
+
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
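
The blocking hooks above describe the tile one kernel call produces: out_height() = 4 rows of int32 results, out_width() = 4 SVE vectors of int32 lanes, and k_unroll() = 4, matching the four int8 values a dot-product lane consumes. A hedged worked example, assuming a 256-bit SVE implementation (the hardware vector length is an assumption, not something this header fixes):

    // Blocking arithmetic for a hypothetical 256-bit SVE machine.
    constexpr unsigned int vl_bits   = 256;            // assumed hardware vector length
    constexpr unsigned int s32_lanes = vl_bits / 32;   // get_vector_length<int32_t>() == 8
    constexpr unsigned int tile_rows = 4;              // out_height()
    constexpr unsigned int tile_cols = 4 * s32_lanes;  // out_width() == 32 int32 columns
    constexpr unsigned int k_step    = 4;              // k_unroll(): int8 values per dot lane
    static_assert(tile_cols == 32, "4VLx4 at 256-bit SVE yields a 4x32 int32 tile");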
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp
new file mode 100644
index 0000000..9c02d95
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp
@@ -0,0 +1,4632 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int ldb, int32_t *C, int ldc, int32_t beta, int M, int N, int K) {
+    const long beta0 = (beta == 0);
+    const long loops_count = ((K + 16) / 32) - 1;
+    K -= loops_count * 32;
+    const long regs_count = (K / 16) - 1;
+    K -= (regs_count + 1) * 16;
+    const long leftovers = K;
+    const long blocks_count = K / 4;
+    const long odds_count = K - (blocks_count * 4);
+
+    for (int y=0; y<M; y+=4) {
+        const int8_t * const a_ptr0_base = A + (y * lda);
+        const unsigned long ldab = lda * sizeof(int8_t);
+
+        int32_t *c_ptr0 = C + (y * ldc);
+        const unsigned long ldcb = ldc * sizeof(int32_t);
+
+        for (int x0=0; x0<N; x0+=(4 * get_vector_length<int32_t>())) {
+            const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<int32_t>()));
+            const int32_t *betaptr = &beta;
+            long loops = loops_count;
+            long regs = regs_count;
+            long temp = 0;
+            long blocks = blocks_count;
+            long odds = odds_count;
+            const int8_t *a_ptr0 = a_ptr0_base;
+            const int8_t *b_ptr0 = B + x0;
+            const int8_t *b_ptr1 = b_ptr0 + ldb;
+            const int8_t *b_ptr2 = b_ptr1 + ldb;
+            const int8_t *b_ptr3 = b_ptr2 + ldb;
+            long ldbb = ldb * sizeof(int8_t) * 4;
+
+            switch(M-y) {
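+                // One hand-written SVE body per count of remaining A rows.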
+                case 1:
+                    __asm __volatile (
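+                        // Single remaining row of A. p0-p3 predicate the four column blocks of
+                        // the C tile, p4 predicates the B loads, and p6 covers the partial tail
+                        // load of A.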
+                        "whilelt p6.b, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "whilelt p4.b, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "ptrue p7.b\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p2.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p3.s, %[temp], %[width]\n"
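+                        // If beta != 0, branch to label 1 to load the existing C tile and
+                        // pre-scale it by beta; the fall-through path zero-initialises the
+                        // accumulators z16-z19.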
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.s, #0\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.s, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "mov z18.s, #0\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "mov z19.s, #0\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "b 2f\n"
+                        "1:\n"
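+                        // beta != 0 path: load C (predicated per column block) and multiply by beta.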
+                        "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "mul z16.s, p7/m, z16.s, z15.s\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "mul z17.s, p7/m, z17.s, z15.s\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "mul z18.s, p7/m, z18.s, z15.s\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "mul z19.s, p7/m, z19.s, z15.s\n"
+                        "2:\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "cbz %[loops], 3f\n"
+                        "4:\n"
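+                        // Main loop: four consecutive K-rows of B are interleaved byte-wise
+                        // (zip1/zip2) so each 32-bit lane holds the four values for one output
+                        // column, then sdot accumulates them against broadcast quadwords of A;
+                        // each iteration consumes 32 values of K.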
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "b.ne 4b\n"
+                        "3:\n"
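+                        // Main loop done: finish the pending interleave, then process the
+                        // remaining one or two 16-deep chunks of K before the sub-16 leftovers.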
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "cbz %[regs], 5f\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
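+                        // Sub-16 leftovers: complete groups of four K-values are handled via
+                        // %[blocks]; a final group of 1-3 values is handled via %[odds], with the
+                        // missing B rows zeroed before the last sdot.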
+                        "cbz %[blocks], 6f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "b.eq 7f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "b.eq 8f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 10f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 11f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 12f\n"
+                        "11:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 12f\n"
+                        "10:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z14.b, #0\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "12:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "mov z12.b, #0\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "b 9f\n"
+                        "8:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 13f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 14f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 15f\n"
+                        "14:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 15f\n"
+                        "13:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z10.b, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "15:\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "mov z8.b, #0\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "b 9f\n"
+                        "7:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 16f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 17f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 18f\n"
+                        "17:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 18f\n"
+                        "16:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z14.b, #0\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "18:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "mov z12.b, #0\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "b 9f\n"
+                        "6:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 19f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 20f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 21f\n"
+                        "20:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 21f\n"
+                        "19:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z10.b, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "21:\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "mov z8.b, #0\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "b 9f\n"
+                        "5:\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
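+                        // Same sub-16 leftover handling for the regs == 0 path.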
+                        "cbz %[blocks], 22f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "b.eq 23f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "b.eq 24f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 25f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 26f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 27f\n"
+                        "26:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 27f\n"
+                        "25:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z14.b, #0\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "27:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "mov z12.b, #0\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
+                        "b 9f\n"
+                        "24:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 28f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 29f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 30f\n"
+                        "29:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 30f\n"
+                        "28:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z10.b, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "30:\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "mov z8.b, #0\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "b 9f\n"
+                        "23:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 31f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 32f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 33f\n"
+                        "32:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 33f\n"
+                        "31:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z14.b, #0\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "33:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "mov z12.b, #0\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "b 9f\n"
+                        "22:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 34f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 35f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 36f\n"
+                        "35:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 36f\n"
+                        "34:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z10.b, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "36:\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "mov z8.b, #0\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "9:\n"
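+                        // Store the accumulated C tile with the per-column-block predicates and
+                        // advance the C pointer by four vector lengths.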
+                        "st1w z16.s, p0, [%[c_ptr0]]\n"
+                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                    );
+                    break;
+                case 2:
+                    __asm __volatile (
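+                        // Two remaining rows of A: X0/X1 are aliased as a_ptr1/c_ptr1 for the
+                        // second row's A and C pointers; z16-z19 accumulate row 0 and z20-z23
+                        // accumulate row 1.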
+                        "a_ptr1 .req X0\n"
+                        "c_ptr1 .req X1\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "whilelt p6.b, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "whilelt p4.b, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "ptrue p7.b\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p2.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p3.s, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.s, #0\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.s, #0\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "mov z18.s, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "mov z19.s, #0\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "mov z20.s, #0\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "mov z21.s, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "mov z22.s, #0\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "mov z23.s, #0\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 2f\n"
+                        "1:\n"
+                        "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "ld1w z20.s, p0/z, [c_ptr1]\n"
+                        "mul z16.s, p7/m, z16.s, z15.s\n"
+                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "mul z17.s, p7/m, z17.s, z15.s\n"
+                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "mul z18.s, p7/m, z18.s, z15.s\n"
+                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "mul z19.s, p7/m, z19.s, z15.s\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mul z20.s, p7/m, z20.s, z15.s\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "mul z21.s, p7/m, z21.s, z15.s\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "mul z22.s, p7/m, z22.s, z15.s\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "mul z23.s, p7/m, z23.s, z15.s\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "2:\n"
+                        "cbz %[loops], 3f\n"
+                        "4:\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z21.s, z13.b, z1.b[3]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "sdot z23.s, z15.b, z1.b[3]\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z20.s, z8.b, z5.b[0]\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z21.s, z9.b, z5.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "sdot z23.s, z11.b, z5.b[0]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "sdot z20.s, z12.b, z5.b[1]\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z21.s, z13.b, z5.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z22.s, z14.b, z5.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "sdot z23.s, z15.b, z5.b[1]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "sdot z20.s, z8.b, z5.b[2]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z21.s, z9.b, z5.b[2]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z22.s, z10.b, z5.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "sdot z23.s, z11.b, z5.b[2]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "sdot z20.s, z12.b, z5.b[3]\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z21.s, z13.b, z5.b[3]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z22.s, z14.b, z5.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
+                        "sdot z23.s, z15.b, z5.b[3]\n"
+                        "b.ne 4b\n"
+                        "3:\n"
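+                        // Label 3: main loop finished - re-interleave the B vectors already in flight
+                        // before handling the K remainder below.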
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "cbz %[regs], 5f\n"
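+                        // %[regs] != 0: finish the pending z0/z1 block, process one more full 16-byte A
+                        // block in z4/z5, then load the final partial A block (predicate p6) into z0/z1.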
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z21.s, z13.b, z1.b[3]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+                        "sdot z23.s, z15.b, z1.b[3]\n"
+                        "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "sdot z20.s, z8.b, z5.b[0]\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z21.s, z9.b, z5.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "sdot z23.s, z11.b, z5.b[0]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "sdot z20.s, z12.b, z5.b[1]\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z21.s, z13.b, z5.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z22.s, z14.b, z5.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "sdot z23.s, z15.b, z5.b[1]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "sdot z20.s, z8.b, z5.b[2]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z21.s, z9.b, z5.b[2]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "sdot z22.s, z10.b, z5.b[2]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "sdot z23.s, z11.b, z5.b[2]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "sdot z20.s, z12.b, z5.b[3]\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "sdot z21.s, z13.b, z5.b[3]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "sdot z22.s, z14.b, z5.b[3]\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
+                        "sdot z23.s, z15.b, z5.b[3]\n"
+                        "cbz %[blocks], 6f\n"
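+                        // Up to three remaining complete 4-byte K blocks: one B vector per stream is
+                        // loaded, interleaved, and dotted with the next element of z0/z1.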
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "b.eq 7f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "b.eq 8f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "cbz %[odds], 9f\n"
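+                        // 1-3 odd K values remain: only that many B rows are loaded, the rest are zeroed
+                        // (mov zN.b, #0) so the final sdot group accumulates only valid data.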
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 10f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 11f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 12f\n"
+                        "11:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 12f\n"
+                        "10:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z14.b, #0\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "12:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "mov z12.b, #0\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z21.s, z13.b, z1.b[3]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "sdot z23.s, z15.b, z1.b[3]\n"
+                        "b 9f\n"
+                        "8:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 13f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 14f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 15f\n"
+                        "14:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 15f\n"
+                        "13:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z10.b, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "15:\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "mov z8.b, #0\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "b 9f\n"
+                        "7:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 16f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 17f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 18f\n"
+                        "17:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 18f\n"
+                        "16:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z14.b, #0\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "18:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "mov z12.b, #0\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "b 9f\n"
+                        "6:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 19f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 20f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 21f\n"
+                        "20:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 21f\n"
+                        "19:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z10.b, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "21:\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "mov z8.b, #0\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "b 9f\n"
+                        "5:\n"
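+                        // Label 5: short K remainder - process the pending z0/z1 block, with the tail of A
+                        // loaded into z4/z5 under predicate p6 for the block/odds handling below.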
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z21.s, z13.b, z1.b[3]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "sdot z23.s, z15.b, z1.b[3]\n"
+                        "cbz %[blocks], 22f\n"
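+                        // Short-tail path: any remaining 4-byte K blocks accumulate against z4/z5 elements 0-2.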
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "sdot z20.s, z8.b, z5.b[0]\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "sdot z21.s, z9.b, z5.b[0]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "sdot z23.s, z11.b, z5.b[0]\n"
+                        "b.eq 23f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "sdot z20.s, z12.b, z5.b[1]\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "sdot z21.s, z13.b, z5.b[1]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "sdot z22.s, z14.b, z5.b[1]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "sdot z23.s, z15.b, z5.b[1]\n"
+                        "b.eq 24f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "sdot z20.s, z8.b, z5.b[2]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "sdot z21.s, z9.b, z5.b[2]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "sdot z22.s, z10.b, z5.b[2]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "sdot z23.s, z11.b, z5.b[2]\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 25f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 26f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 27f\n"
+                        "26:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 27f\n"
+                        "25:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z14.b, #0\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "27:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "mov z12.b, #0\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "sdot z20.s, z12.b, z5.b[3]\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "sdot z21.s, z13.b, z5.b[3]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "sdot z22.s, z14.b, z5.b[3]\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
+                        "sdot z23.s, z15.b, z5.b[3]\n"
+                        "b 9f\n"
+                        "24:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 28f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 29f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 30f\n"
+                        "29:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 30f\n"
+                        "28:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z10.b, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "30:\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "mov z8.b, #0\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "sdot z20.s, z8.b, z5.b[2]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "sdot z21.s, z9.b, z5.b[2]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "sdot z22.s, z10.b, z5.b[2]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "sdot z23.s, z11.b, z5.b[2]\n"
+                        "b 9f\n"
+                        "23:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 31f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 32f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 33f\n"
+                        "32:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 33f\n"
+                        "31:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z14.b, #0\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "33:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "mov z12.b, #0\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "sdot z20.s, z12.b, z5.b[1]\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "sdot z21.s, z13.b, z5.b[1]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "sdot z22.s, z14.b, z5.b[1]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "sdot z23.s, z15.b, z5.b[1]\n"
+                        "b 9f\n"
+                        "22:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 34f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 35f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 36f\n"
+                        "35:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 36f\n"
+                        "34:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z10.b, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "36:\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "mov z8.b, #0\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "sdot z20.s, z8.b, z5.b[0]\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "sdot z21.s, z9.b, z5.b[0]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "sdot z23.s, z11.b, z5.b[0]\n"
+                        "9:\n"
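+                        // Label 9: store the int32 results - four predicated vectors per row to c_ptr0 and c_ptr1.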
+                        "st1w z16.s, p0, [%[c_ptr0]]\n"
+                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        "st1w z20.s, p0, [c_ptr1]\n"
+                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq c_ptr1\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
+                    );
+                    break;
+                case 3:
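+                    // Three-row variant: same structure as above, with z2/z6 holding the third A row
+                    // and z24-z27 accumulating the third C row.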
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "c_ptr1 .req X2\n"
+                        "c_ptr2 .req X3\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "whilelt p6.b, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "whilelt p4.b, %[temp], %[width]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "ptrue p7.b\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p2.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p3.s, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
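+                        // %[beta0] set: C is not read; accumulators z16-z27 start at zero while the
+                        // first A and B vectors are loaded.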
+                        "mov z16.s, #0\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.s, #0\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "mov z18.s, #0\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                        "mov z19.s, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "mov z20.s, #0\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "mov z21.s, #0\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "mov z22.s, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "mov z23.s, #0\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "mov z24.s, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z25.s, #0\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "mov z26.s, #0\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "mov z27.s, #0\n"
+                        "b 2f\n"
+                        "1:\n"
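+                        // Label 1: otherwise load the existing C values for all three rows and pre-scale
+                        // them by beta (broadcast into z15).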
+                        "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "ld1w z20.s, p0/z, [c_ptr1]\n"
+                        "mul z16.s, p7/m, z16.s, z15.s\n"
+                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "mul z17.s, p7/m, z17.s, z15.s\n"
+                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "mul z18.s, p7/m, z18.s, z15.s\n"
+                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "mul z19.s, p7/m, z19.s, z15.s\n"
+                        "ld1w z24.s, p0/z, [c_ptr2]\n"
+                        "mul z20.s, p7/m, z20.s, z15.s\n"
+                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "mul z21.s, p7/m, z21.s, z15.s\n"
+                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+                        "mul z22.s, p7/m, z22.s, z15.s\n"
+                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+                        "mul z23.s, p7/m, z23.s, z15.s\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mul z24.s, p7/m, z24.s, z15.s\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "mul z25.s, p7/m, z25.s, z15.s\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                        "mul z26.s, p7/m, z26.s, z15.s\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "mul z27.s, p7/m, z27.s, z15.s\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "2:\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "cbz %[loops], 3f\n"
+                        "4:\n"
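+                        // Label 4: main K loop for the three-row case, same scheme as above with the
+                        // extra sdot chain into z24-z27.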
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "sdot z24.s, z8.b, z2.b[0]\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z26.s, z10.b, z2.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "sdot z27.s, z11.b, z2.b[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "sdot z24.s, z12.b, z2.b[1]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "sdot z25.s, z13.b, z2.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z26.s, z14.b, z2.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "sdot z27.s, z15.b, z2.b[1]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z24.s, z8.b, z2.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "sdot z25.s, z9.b, z2.b[2]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z26.s, z10.b, z2.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "sdot z27.s, z11.b, z2.b[2]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "sdot z24.s, z12.b, z2.b[3]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z21.s, z13.b, z1.b[3]\n"
+                        "sdot z25.s, z13.b, z2.b[3]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z26.s, z14.b, z2.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "sdot z23.s, z15.b, z1.b[3]\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+                        "sdot z27.s, z15.b, z2.b[3]\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z8.b, z5.b[0]\n"
+                        "sdot z24.s, z8.b, z6.b[0]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "sdot z21.s, z9.b, z5.b[0]\n"
+                        "sdot z25.s, z9.b, z6.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z26.s, z10.b, z6.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z23.s, z11.b, z5.b[0]\n"
+                        "sdot z27.s, z11.b, z6.b[0]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z12.b, z5.b[1]\n"
+                        "sdot z24.s, z12.b, z6.b[1]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "sdot z21.s, z13.b, z5.b[1]\n"
+                        "sdot z25.s, z13.b, z6.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z22.s, z14.b, z5.b[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z26.s, z14.b, z6.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z23.s, z15.b, z5.b[1]\n"
+                        "sdot z27.s, z15.b, z6.b[1]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z8.b, z5.b[2]\n"
+                        "sdot z24.s, z8.b, z6.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "sdot z21.s, z9.b, z5.b[2]\n"
+                        "sdot z25.s, z9.b, z6.b[2]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z22.s, z10.b, z5.b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z26.s, z10.b, z6.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z23.s, z11.b, z5.b[2]\n"
+                        "sdot z27.s, z11.b, z6.b[2]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z12.b, z5.b[3]\n"
+                        "sdot z24.s, z12.b, z6.b[3]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "sdot z21.s, z13.b, z5.b[3]\n"
+                        "sdot z25.s, z13.b, z6.b[3]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z22.s, z14.b, z5.b[3]\n"
+                        "sdot z26.s, z14.b, z6.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
+                        "sdot z23.s, z15.b, z5.b[3]\n"
+                        "sdot z27.s, z15.b, z6.b[3]\n"
+                        "b.ne 4b\n"
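+                        // 3: main loop finished; fall through to deal with the remaining K elements.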
+                        "3:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "cbz %[regs], 5f\n"
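+                        // regs != 0: two more full 16-byte A chunks to process before the predicated tail;
+                        // the final partial chunk is loaded further down under the p6 predicate.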
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "sdot z24.s, z8.b, z2.b[0]\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z26.s, z10.b, z2.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "sdot z27.s, z11.b, z2.b[0]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "sdot z24.s, z12.b, z2.b[1]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "sdot z25.s, z13.b, z2.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z26.s, z14.b, z2.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "sdot z27.s, z15.b, z2.b[1]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z24.s, z8.b, z2.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "sdot z25.s, z9.b, z2.b[2]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z26.s, z10.b, z2.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "sdot z27.s, z11.b, z2.b[2]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "sdot z24.s, z12.b, z2.b[3]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z21.s, z13.b, z1.b[3]\n"
+                        "sdot z25.s, z13.b, z2.b[3]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z26.s, z14.b, z2.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+                        "sdot z23.s, z15.b, z1.b[3]\n"
+                        "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+                        "sdot z27.s, z15.b, z2.b[3]\n"
+                        "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z8.b, z5.b[0]\n"
+                        "sdot z24.s, z8.b, z6.b[0]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "sdot z21.s, z9.b, z5.b[0]\n"
+                        "sdot z25.s, z9.b, z6.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z26.s, z10.b, z6.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z23.s, z11.b, z5.b[0]\n"
+                        "sdot z27.s, z11.b, z6.b[0]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z12.b, z5.b[1]\n"
+                        "sdot z24.s, z12.b, z6.b[1]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "sdot z21.s, z13.b, z5.b[1]\n"
+                        "sdot z25.s, z13.b, z6.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z22.s, z14.b, z5.b[1]\n"
+                        "sdot z26.s, z14.b, z6.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "sdot z23.s, z15.b, z5.b[1]\n"
+                        "sdot z27.s, z15.b, z6.b[1]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "sdot z20.s, z8.b, z5.b[2]\n"
+                        "sdot z24.s, z8.b, z6.b[2]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "sdot z21.s, z9.b, z5.b[2]\n"
+                        "sdot z25.s, z9.b, z6.b[2]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "sdot z22.s, z10.b, z5.b[2]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z26.s, z10.b, z6.b[2]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "sdot z23.s, z11.b, z5.b[2]\n"
+                        "sdot z27.s, z11.b, z6.b[2]\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "sdot z20.s, z12.b, z5.b[3]\n"
+                        "sdot z24.s, z12.b, z6.b[3]\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "sdot z21.s, z13.b, z5.b[3]\n"
+                        "sdot z25.s, z13.b, z6.b[3]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "sdot z22.s, z14.b, z5.b[3]\n"
+                        "sdot z26.s, z14.b, z6.b[3]\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
+                        "sdot z23.s, z15.b, z5.b[3]\n"
+                        "sdot z27.s, z15.b, z6.b[3]\n"
+                        "cbz %[blocks], 6f\n"
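+                        // Whole 4-byte K groups left over: fetch and interleave one B block per group and
+                        // accumulate a single sdot lane each time; labels 6/7/8 are the early exits that
+                        // then handle any odd leftover bytes.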
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "sdot z24.s, z8.b, z2.b[0]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "sdot z26.s, z10.b, z2.b[0]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "sdot z27.s, z11.b, z2.b[0]\n"
+                        "b.eq 7f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "sdot z24.s, z12.b, z2.b[1]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "sdot z25.s, z13.b, z2.b[1]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "sdot z26.s, z14.b, z2.b[1]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "sdot z27.s, z15.b, z2.b[1]\n"
+                        "b.eq 8f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z24.s, z8.b, z2.b[2]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "sdot z25.s, z9.b, z2.b[2]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "sdot z26.s, z10.b, z2.b[2]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "sdot z27.s, z11.b, z2.b[2]\n"
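+                        // 1-3 odd K bytes remain: B rows that do not exist are zeroed (mov #0) before the
+                        // final interleave, so the closing sdot lanes only accumulate valid data.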
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 10f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 11f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 12f\n"
+                        "11:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 12f\n"
+                        "10:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z14.b, #0\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "12:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "mov z12.b, #0\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "sdot z24.s, z12.b, z2.b[3]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z21.s, z13.b, z1.b[3]\n"
+                        "sdot z25.s, z13.b, z2.b[3]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "sdot z26.s, z14.b, z2.b[3]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "sdot z23.s, z15.b, z1.b[3]\n"
+                        "sdot z27.s, z15.b, z2.b[3]\n"
+                        "b 9f\n"
+                        "8:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 13f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 14f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 15f\n"
+                        "14:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 15f\n"
+                        "13:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z10.b, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "15:\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "mov z8.b, #0\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z24.s, z8.b, z2.b[2]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "sdot z25.s, z9.b, z2.b[2]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "sdot z26.s, z10.b, z2.b[2]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "sdot z27.s, z11.b, z2.b[2]\n"
+                        "b 9f\n"
+                        "7:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 16f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 17f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 18f\n"
+                        "17:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 18f\n"
+                        "16:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z14.b, #0\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "18:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "mov z12.b, #0\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "sdot z24.s, z12.b, z2.b[1]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "sdot z25.s, z13.b, z2.b[1]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "sdot z26.s, z14.b, z2.b[1]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "sdot z27.s, z15.b, z2.b[1]\n"
+                        "b 9f\n"
+                        "6:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 19f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 20f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 21f\n"
+                        "20:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 21f\n"
+                        "19:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z10.b, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "21:\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "mov z8.b, #0\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "sdot z24.s, z8.b, z2.b[0]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "sdot z26.s, z10.b, z2.b[0]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "sdot z27.s, z11.b, z2.b[0]\n"
+                        "b 9f\n"
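+                        // 5: regs == 0. Finish the block already held in z0-z2, load the final partial A chunk
+                        // under p6, then run the same blocks/odds tail (labels 22 onwards).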
+                        "5:\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+                        "sdot z24.s, z8.b, z2.b[0]\n"
+                        "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z6.b, p6/z, [a_ptr2]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z26.s, z10.b, z2.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "sdot z27.s, z11.b, z2.b[0]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "sdot z24.s, z12.b, z2.b[1]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "sdot z25.s, z13.b, z2.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "sdot z26.s, z14.b, z2.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "sdot z27.s, z15.b, z2.b[1]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z24.s, z8.b, z2.b[2]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "sdot z25.s, z9.b, z2.b[2]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z26.s, z10.b, z2.b[2]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "sdot z27.s, z11.b, z2.b[2]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "sdot z24.s, z12.b, z2.b[3]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z21.s, z13.b, z1.b[3]\n"
+                        "sdot z25.s, z13.b, z2.b[3]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "sdot z26.s, z14.b, z2.b[3]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "sdot z23.s, z15.b, z1.b[3]\n"
+                        "sdot z27.s, z15.b, z2.b[3]\n"
+                        "cbz %[blocks], 22f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "sdot z20.s, z8.b, z5.b[0]\n"
+                        "sdot z24.s, z8.b, z6.b[0]\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "sdot z21.s, z9.b, z5.b[0]\n"
+                        "sdot z25.s, z9.b, z6.b[0]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "sdot z26.s, z10.b, z6.b[0]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "sdot z23.s, z11.b, z5.b[0]\n"
+                        "sdot z27.s, z11.b, z6.b[0]\n"
+                        "b.eq 23f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "sdot z20.s, z12.b, z5.b[1]\n"
+                        "sdot z24.s, z12.b, z6.b[1]\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "sdot z21.s, z13.b, z5.b[1]\n"
+                        "sdot z25.s, z13.b, z6.b[1]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "sdot z22.s, z14.b, z5.b[1]\n"
+                        "sdot z26.s, z14.b, z6.b[1]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "sdot z23.s, z15.b, z5.b[1]\n"
+                        "sdot z27.s, z15.b, z6.b[1]\n"
+                        "b.eq 24f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "sdot z20.s, z8.b, z5.b[2]\n"
+                        "sdot z24.s, z8.b, z6.b[2]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "sdot z21.s, z9.b, z5.b[2]\n"
+                        "sdot z25.s, z9.b, z6.b[2]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "sdot z22.s, z10.b, z5.b[2]\n"
+                        "sdot z26.s, z10.b, z6.b[2]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "sdot z23.s, z11.b, z5.b[2]\n"
+                        "sdot z27.s, z11.b, z6.b[2]\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 25f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 26f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 27f\n"
+                        "26:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 27f\n"
+                        "25:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z14.b, #0\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "27:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "mov z12.b, #0\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "sdot z20.s, z12.b, z5.b[3]\n"
+                        "sdot z24.s, z12.b, z6.b[3]\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "sdot z21.s, z13.b, z5.b[3]\n"
+                        "sdot z25.s, z13.b, z6.b[3]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "sdot z22.s, z14.b, z5.b[3]\n"
+                        "sdot z26.s, z14.b, z6.b[3]\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
+                        "sdot z23.s, z15.b, z5.b[3]\n"
+                        "sdot z27.s, z15.b, z6.b[3]\n"
+                        "b 9f\n"
+                        "24:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 28f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 29f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 30f\n"
+                        "29:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 30f\n"
+                        "28:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z10.b, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "30:\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "mov z8.b, #0\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "sdot z20.s, z8.b, z5.b[2]\n"
+                        "sdot z24.s, z8.b, z6.b[2]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "sdot z21.s, z9.b, z5.b[2]\n"
+                        "sdot z25.s, z9.b, z6.b[2]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "sdot z22.s, z10.b, z5.b[2]\n"
+                        "sdot z26.s, z10.b, z6.b[2]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "sdot z23.s, z11.b, z5.b[2]\n"
+                        "sdot z27.s, z11.b, z6.b[2]\n"
+                        "b 9f\n"
+                        "23:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 31f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 32f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 33f\n"
+                        "32:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 33f\n"
+                        "31:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z14.b, #0\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "33:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "mov z12.b, #0\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "sdot z20.s, z12.b, z5.b[1]\n"
+                        "sdot z24.s, z12.b, z6.b[1]\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "sdot z21.s, z13.b, z5.b[1]\n"
+                        "sdot z25.s, z13.b, z6.b[1]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "sdot z22.s, z14.b, z5.b[1]\n"
+                        "sdot z26.s, z14.b, z6.b[1]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "sdot z23.s, z15.b, z5.b[1]\n"
+                        "sdot z27.s, z15.b, z6.b[1]\n"
+                        "b 9f\n"
+                        "22:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 34f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 35f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 36f\n"
+                        "35:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 36f\n"
+                        "34:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z10.b, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "36:\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "mov z8.b, #0\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "sdot z20.s, z8.b, z5.b[0]\n"
+                        "sdot z24.s, z8.b, z6.b[0]\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "sdot z21.s, z9.b, z5.b[0]\n"
+                        "sdot z25.s, z9.b, z6.b[0]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "sdot z26.s, z10.b, z6.b[0]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "sdot z23.s, z11.b, z5.b[0]\n"
+                        "sdot z27.s, z11.b, z6.b[0]\n"
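+                        // Label 9: common epilogue - write the accumulators back to C, one predicated
+                        // vector store (p0-p3) per column block for each output row.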
+                        "9:\n"
+                        "st1w z16.s, p0, [%[c_ptr0]]\n"
+                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        "st1w z20.s, p0, [c_ptr1]\n"
+                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+                        "st1w z24.s, p0, [c_ptr2]\n"
+                        "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+                        "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+                        "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
+                    );
+                    break;
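+                    // 'default' falls through to 'case 4': the variant below handles a full block of four output
+                    // rows (a_ptr0-a_ptr3 / c_ptr0-c_ptr3), while the cases above cover the shorter leftover blocks.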
+                default:
+                case 4:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "a_ptr3 .req X2\n"
+                        "c_ptr1 .req X3\n"
+                        "c_ptr2 .req X4\n"
+                        "c_ptr3 .req X5\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "whilelt p6.b, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "whilelt p4.b, %[temp], %[width]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "ptrue p7.b\n"
+                        "add a_ptr3, a_ptr2, %[lda]\n"
+                        "add c_ptr3, c_ptr2, %[ldc]\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p2.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p3.s, %[temp], %[width]\n"
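+                        // If %[beta0] is zero, branch to label 1 and seed the accumulators from C scaled by beta;
+                        // otherwise fall through and start the accumulators z16-z31 at zero.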
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.s, #0\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.s, #0\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "mov z18.s, #0\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                        "mov z19.s, #0\n"
+                        "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                        "mov z20.s, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "mov z21.s, #0\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "mov z22.s, #0\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "mov z23.s, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "mov z24.s, #0\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "mov z25.s, #0\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "mov z26.s, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "mov z27.s, #0\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "mov z28.s, #0\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "mov z29.s, #0\n"
+                        "mov z30.s, #0\n"
+                        "mov z31.s, #0\n"
+                        "b 2f\n"
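+                        // Label 1: broadcast beta into z15, load the existing C block into z16-z31 and scale it,
+                        // then prime the first A/B registers exactly as the zero-init path above does.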
+                        "1:\n"
+                        "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "ld1w z20.s, p0/z, [c_ptr1]\n"
+                        "mul z16.s, p7/m, z16.s, z15.s\n"
+                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "mul z17.s, p7/m, z17.s, z15.s\n"
+                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "mul z18.s, p7/m, z18.s, z15.s\n"
+                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "mul z19.s, p7/m, z19.s, z15.s\n"
+                        "ld1w z24.s, p0/z, [c_ptr2]\n"
+                        "mul z20.s, p7/m, z20.s, z15.s\n"
+                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "mul z21.s, p7/m, z21.s, z15.s\n"
+                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+                        "mul z22.s, p7/m, z22.s, z15.s\n"
+                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+                        "mul z23.s, p7/m, z23.s, z15.s\n"
+                        "ld1w z28.s, p0/z, [c_ptr3]\n"
+                        "mul z24.s, p7/m, z24.s, z15.s\n"
+                        "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+                        "mul z25.s, p7/m, z25.s, z15.s\n"
+                        "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
+                        "mul z26.s, p7/m, z26.s, z15.s\n"
+                        "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
+                        "mul z27.s, p7/m, z27.s, z15.s\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mul z28.s, p7/m, z28.s, z15.s\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "mul z29.s, p7/m, z29.s, z15.s\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                        "mul z30.s, p7/m, z30.s, z15.s\n"
+                        "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                        "mul z31.s, p7/m, z31.s, z15.s\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
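+                        // Labels 2/4: main loop over k, 32 values per iteration. A is fetched 16 bytes at a time
+                        // with ld1rqb, the four B streams are interleaved on the fly with zip1/zip2 into the byte
+                        // order sdot expects, and each sdot accumulates a 4-way signed 8-bit dot product into the
+                        // 32-bit accumulators z16-z31.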
+                        "2:\n"
+                        "cbz %[loops], 3f\n"
+                        "4:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "sdot z24.s, z8.b, z2.b[0]\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "sdot z28.s, z8.b, z3.b[0]\n"
+                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z7.b, p7/z, [a_ptr3]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z29.s, z9.b, z3.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "sdot z26.s, z10.b, z2.b[0]\n"
+                        "add a_ptr3, a_ptr3, #0x20\n"
+                        "sdot z30.s, z10.b, z3.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "sdot z27.s, z11.b, z2.b[0]\n"
+                        "sdot z31.s, z11.b, z3.b[0]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "sdot z24.s, z12.b, z2.b[1]\n"
+                        "sdot z28.s, z12.b, z3.b[1]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "sdot z25.s, z13.b, z2.b[1]\n"
+                        "sdot z29.s, z13.b, z3.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "sdot z26.s, z14.b, z2.b[1]\n"
+                        "sdot z30.s, z14.b, z3.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "sdot z27.s, z15.b, z2.b[1]\n"
+                        "sdot z31.s, z15.b, z3.b[1]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z24.s, z8.b, z2.b[2]\n"
+                        "sdot z28.s, z8.b, z3.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "sdot z25.s, z9.b, z2.b[2]\n"
+                        "sdot z29.s, z9.b, z3.b[2]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "sdot z26.s, z10.b, z2.b[2]\n"
+                        "sdot z30.s, z10.b, z3.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "sdot z27.s, z11.b, z2.b[2]\n"
+                        "sdot z31.s, z11.b, z3.b[2]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "sdot z24.s, z12.b, z2.b[3]\n"
+                        "sdot z28.s, z12.b, z3.b[3]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z21.s, z13.b, z1.b[3]\n"
+                        "sdot z25.s, z13.b, z2.b[3]\n"
+                        "sdot z29.s, z13.b, z3.b[3]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "sdot z26.s, z14.b, z2.b[3]\n"
+                        "sdot z30.s, z14.b, z3.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "sdot z23.s, z15.b, z1.b[3]\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+                        "sdot z27.s, z15.b, z2.b[3]\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
+                        "sdot z31.s, z15.b, z3.b[3]\n"
+                        "ld1rqb z3.b, p7/z, [a_ptr3, #-0x10]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z8.b, z5.b[0]\n"
+                        "sdot z24.s, z8.b, z6.b[0]\n"
+                        "sdot z28.s, z8.b, z7.b[0]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "sdot z21.s, z9.b, z5.b[0]\n"
+                        "sdot z25.s, z9.b, z6.b[0]\n"
+                        "sdot z29.s, z9.b, z7.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "sdot z26.s, z10.b, z6.b[0]\n"
+                        "sdot z30.s, z10.b, z7.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z23.s, z11.b, z5.b[0]\n"
+                        "sdot z27.s, z11.b, z6.b[0]\n"
+                        "sdot z31.s, z11.b, z7.b[0]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z12.b, z5.b[1]\n"
+                        "sdot z24.s, z12.b, z6.b[1]\n"
+                        "sdot z28.s, z12.b, z7.b[1]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "sdot z21.s, z13.b, z5.b[1]\n"
+                        "sdot z25.s, z13.b, z6.b[1]\n"
+                        "sdot z29.s, z13.b, z7.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z22.s, z14.b, z5.b[1]\n"
+                        "sdot z26.s, z14.b, z6.b[1]\n"
+                        "sdot z30.s, z14.b, z7.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z23.s, z15.b, z5.b[1]\n"
+                        "sdot z27.s, z15.b, z6.b[1]\n"
+                        "sdot z31.s, z15.b, z7.b[1]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z8.b, z5.b[2]\n"
+                        "sdot z24.s, z8.b, z6.b[2]\n"
+                        "sdot z28.s, z8.b, z7.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "sdot z21.s, z9.b, z5.b[2]\n"
+                        "sdot z25.s, z9.b, z6.b[2]\n"
+                        "sdot z29.s, z9.b, z7.b[2]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z22.s, z10.b, z5.b[2]\n"
+                        "sdot z26.s, z10.b, z6.b[2]\n"
+                        "sdot z30.s, z10.b, z7.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z23.s, z11.b, z5.b[2]\n"
+                        "sdot z27.s, z11.b, z6.b[2]\n"
+                        "sdot z31.s, z11.b, z7.b[2]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z12.b, z5.b[3]\n"
+                        "sdot z24.s, z12.b, z6.b[3]\n"
+                        "sdot z28.s, z12.b, z7.b[3]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "sdot z21.s, z13.b, z5.b[3]\n"
+                        "sdot z25.s, z13.b, z6.b[3]\n"
+                        "sdot z29.s, z13.b, z7.b[3]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "sdot z22.s, z14.b, z5.b[3]\n"
+                        "sdot z26.s, z14.b, z6.b[3]\n"
+                        "sdot z30.s, z14.b, z7.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
+                        "sdot z23.s, z15.b, z5.b[3]\n"
+                        "sdot z27.s, z15.b, z6.b[3]\n"
+                        "sdot z31.s, z15.b, z7.b[3]\n"
+                        "b.ne 4b\n"
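+                        // Label 3: main loop done. The k tail is handled below: one more register-sized chunk if
+                        // %[regs] is set, then %[blocks] complete groups of four values, then up to three %[odds].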
+                        "3:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "cbz %[regs], 5f\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "sdot z24.s, z8.b, z2.b[0]\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "sdot z28.s, z8.b, z3.b[0]\n"
+                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z7.b, p7/z, [a_ptr3]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z29.s, z9.b, z3.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "sdot z26.s, z10.b, z2.b[0]\n"
+                        "sdot z30.s, z10.b, z3.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "sdot z27.s, z11.b, z2.b[0]\n"
+                        "sdot z31.s, z11.b, z3.b[0]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "sdot z24.s, z12.b, z2.b[1]\n"
+                        "sdot z28.s, z12.b, z3.b[1]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "sdot z25.s, z13.b, z2.b[1]\n"
+                        "sdot z29.s, z13.b, z3.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "sdot z26.s, z14.b, z2.b[1]\n"
+                        "sdot z30.s, z14.b, z3.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "sdot z27.s, z15.b, z2.b[1]\n"
+                        "sdot z31.s, z15.b, z3.b[1]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z24.s, z8.b, z2.b[2]\n"
+                        "sdot z28.s, z8.b, z3.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "sdot z25.s, z9.b, z2.b[2]\n"
+                        "sdot z29.s, z9.b, z3.b[2]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "sdot z26.s, z10.b, z2.b[2]\n"
+                        "sdot z30.s, z10.b, z3.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "sdot z27.s, z11.b, z2.b[2]\n"
+                        "sdot z31.s, z11.b, z3.b[2]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "sdot z24.s, z12.b, z2.b[3]\n"
+                        "sdot z28.s, z12.b, z3.b[3]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z21.s, z13.b, z1.b[3]\n"
+                        "sdot z25.s, z13.b, z2.b[3]\n"
+                        "sdot z29.s, z13.b, z3.b[3]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "sdot z26.s, z14.b, z2.b[3]\n"
+                        "sdot z30.s, z14.b, z3.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+                        "sdot z23.s, z15.b, z1.b[3]\n"
+                        "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+                        "sdot z27.s, z15.b, z2.b[3]\n"
+                        "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+                        "sdot z31.s, z15.b, z3.b[3]\n"
+                        "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z8.b, z5.b[0]\n"
+                        "sdot z24.s, z8.b, z6.b[0]\n"
+                        "sdot z28.s, z8.b, z7.b[0]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "sdot z21.s, z9.b, z5.b[0]\n"
+                        "sdot z25.s, z9.b, z6.b[0]\n"
+                        "sdot z29.s, z9.b, z7.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "sdot z26.s, z10.b, z6.b[0]\n"
+                        "sdot z30.s, z10.b, z7.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z23.s, z11.b, z5.b[0]\n"
+                        "sdot z27.s, z11.b, z6.b[0]\n"
+                        "sdot z31.s, z11.b, z7.b[0]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z12.b, z5.b[1]\n"
+                        "sdot z24.s, z12.b, z6.b[1]\n"
+                        "sdot z28.s, z12.b, z7.b[1]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "sdot z21.s, z13.b, z5.b[1]\n"
+                        "sdot z25.s, z13.b, z6.b[1]\n"
+                        "sdot z29.s, z13.b, z7.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "sdot z22.s, z14.b, z5.b[1]\n"
+                        "sdot z26.s, z14.b, z6.b[1]\n"
+                        "sdot z30.s, z14.b, z7.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "sdot z23.s, z15.b, z5.b[1]\n"
+                        "sdot z27.s, z15.b, z6.b[1]\n"
+                        "sdot z31.s, z15.b, z7.b[1]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "sdot z20.s, z8.b, z5.b[2]\n"
+                        "sdot z24.s, z8.b, z6.b[2]\n"
+                        "sdot z28.s, z8.b, z7.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "sdot z21.s, z9.b, z5.b[2]\n"
+                        "sdot z25.s, z9.b, z6.b[2]\n"
+                        "sdot z29.s, z9.b, z7.b[2]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "sdot z22.s, z10.b, z5.b[2]\n"
+                        "sdot z26.s, z10.b, z6.b[2]\n"
+                        "sdot z30.s, z10.b, z7.b[2]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "sdot z23.s, z11.b, z5.b[2]\n"
+                        "sdot z27.s, z11.b, z6.b[2]\n"
+                        "sdot z31.s, z11.b, z7.b[2]\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "sdot z20.s, z12.b, z5.b[3]\n"
+                        "sdot z24.s, z12.b, z6.b[3]\n"
+                        "sdot z28.s, z12.b, z7.b[3]\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "sdot z21.s, z13.b, z5.b[3]\n"
+                        "sdot z25.s, z13.b, z6.b[3]\n"
+                        "sdot z29.s, z13.b, z7.b[3]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "sdot z22.s, z14.b, z5.b[3]\n"
+                        "sdot z26.s, z14.b, z6.b[3]\n"
+                        "sdot z30.s, z14.b, z7.b[3]\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
+                        "sdot z23.s, z15.b, z5.b[3]\n"
+                        "sdot z27.s, z15.b, z6.b[3]\n"
+                        "sdot z31.s, z15.b, z7.b[3]\n"
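+                        // Remaining complete groups of four k values: each pass loads one more row from every B
+                        // stream, interleaves it and issues sdots against the next lane of the predicated A registers.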
+                        "cbz %[blocks], 6f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "sdot z24.s, z8.b, z2.b[0]\n"
+                        "sdot z28.s, z8.b, z3.b[0]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
+                        "sdot z29.s, z9.b, z3.b[0]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "sdot z26.s, z10.b, z2.b[0]\n"
+                        "sdot z30.s, z10.b, z3.b[0]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "sdot z27.s, z11.b, z2.b[0]\n"
+                        "sdot z31.s, z11.b, z3.b[0]\n"
+                        "b.eq 7f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "sdot z24.s, z12.b, z2.b[1]\n"
+                        "sdot z28.s, z12.b, z3.b[1]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "sdot z25.s, z13.b, z2.b[1]\n"
+                        "sdot z29.s, z13.b, z3.b[1]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "sdot z26.s, z14.b, z2.b[1]\n"
+                        "sdot z30.s, z14.b, z3.b[1]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "sdot z27.s, z15.b, z2.b[1]\n"
+                        "sdot z31.s, z15.b, z3.b[1]\n"
+                        "b.eq 8f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z24.s, z8.b, z2.b[2]\n"
+                        "sdot z28.s, z8.b, z3.b[2]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "sdot z25.s, z9.b, z2.b[2]\n"
+                        "sdot z29.s, z9.b, z3.b[2]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "sdot z26.s, z10.b, z2.b[2]\n"
+                        "sdot z30.s, z10.b, z3.b[2]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "sdot z27.s, z11.b, z2.b[2]\n"
+                        "sdot z31.s, z11.b, z3.b[2]\n"
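+                        // Last 1-3 odd k values on this path: load the surviving B rows, zero the missing ones,
+                        // interleave and fold them in with a final set of sdots on lane [3].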
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 10f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 11f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 12f\n"
+                        "11:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 12f\n"
+                        "10:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z14.b, #0\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "12:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "mov z12.b, #0\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "sdot z24.s, z12.b, z2.b[3]\n"
+                        "sdot z28.s, z12.b, z3.b[3]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z21.s, z13.b, z1.b[3]\n"
+                        "sdot z25.s, z13.b, z2.b[3]\n"
+                        "sdot z29.s, z13.b, z3.b[3]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "sdot z26.s, z14.b, z2.b[3]\n"
+                        "sdot z30.s, z14.b, z3.b[3]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "sdot z23.s, z15.b, z1.b[3]\n"
+                        "sdot z27.s, z15.b, z2.b[3]\n"
+                        "sdot z31.s, z15.b, z3.b[3]\n"
+                        "b 9f\n"
+                        "8:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 13f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 14f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 15f\n"
+                        "14:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 15f\n"
+                        "13:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z10.b, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "15:\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "mov z8.b, #0\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z24.s, z8.b, z2.b[2]\n"
+                        "sdot z28.s, z8.b, z3.b[2]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "sdot z25.s, z9.b, z2.b[2]\n"
+                        "sdot z29.s, z9.b, z3.b[2]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "sdot z26.s, z10.b, z2.b[2]\n"
+                        "sdot z30.s, z10.b, z3.b[2]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "sdot z27.s, z11.b, z2.b[2]\n"
+                        "sdot z31.s, z11.b, z3.b[2]\n"
+                        "b 9f\n"
+                        "7:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 16f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 17f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 18f\n"
+                        "17:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 18f\n"
+                        "16:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z14.b, #0\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "18:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "mov z12.b, #0\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "sdot z24.s, z12.b, z2.b[1]\n"
+                        "sdot z28.s, z12.b, z3.b[1]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "sdot z25.s, z13.b, z2.b[1]\n"
+                        "sdot z29.s, z13.b, z3.b[1]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "sdot z26.s, z14.b, z2.b[1]\n"
+                        "sdot z30.s, z14.b, z3.b[1]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "sdot z27.s, z15.b, z2.b[1]\n"
+                        "sdot z31.s, z15.b, z3.b[1]\n"
+                        "b 9f\n"
+                        "6:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 19f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 20f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 21f\n"
+                        "20:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 21f\n"
+                        "19:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z10.b, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "21:\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "mov z8.b, #0\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "sdot z24.s, z8.b, z2.b[0]\n"
+                        "sdot z28.s, z8.b, z3.b[0]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
+                        "sdot z29.s, z9.b, z3.b[0]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "sdot z26.s, z10.b, z2.b[0]\n"
+                        "sdot z30.s, z10.b, z3.b[0]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "sdot z27.s, z11.b, z2.b[0]\n"
+                        "sdot z31.s, z11.b, z3.b[0]\n"
+                        "b 9f\n"
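+                        // Label 5: taken when %[regs] is zero - the final A values are loaded under the p6
+                        // leftover predicate before the remaining k values are finished off.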
+                        "5:\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+                        "sdot z24.s, z8.b, z2.b[0]\n"
+                        "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+                        "sdot z28.s, z8.b, z3.b[0]\n"
+                        "ld1rqb z6.b, p6/z, [a_ptr2]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z7.b, p6/z, [a_ptr3]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z29.s, z9.b, z3.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "sdot z26.s, z10.b, z2.b[0]\n"
+                        "sdot z30.s, z10.b, z3.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "sdot z27.s, z11.b, z2.b[0]\n"
+                        "sdot z31.s, z11.b, z3.b[0]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "sdot z24.s, z12.b, z2.b[1]\n"
+                        "sdot z28.s, z12.b, z3.b[1]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "sdot z25.s, z13.b, z2.b[1]\n"
+                        "sdot z29.s, z13.b, z3.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "sdot z26.s, z14.b, z2.b[1]\n"
+                        "sdot z30.s, z14.b, z3.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "sdot z27.s, z15.b, z2.b[1]\n"
+                        "sdot z31.s, z15.b, z3.b[1]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z24.s, z8.b, z2.b[2]\n"
+                        "sdot z28.s, z8.b, z3.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "sdot z25.s, z9.b, z2.b[2]\n"
+                        "sdot z29.s, z9.b, z3.b[2]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "sdot z26.s, z10.b, z2.b[2]\n"
+                        "sdot z30.s, z10.b, z3.b[2]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "sdot z27.s, z11.b, z2.b[2]\n"
+                        "sdot z31.s, z11.b, z3.b[2]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "sdot z24.s, z12.b, z2.b[3]\n"
+                        "sdot z28.s, z12.b, z3.b[3]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z21.s, z13.b, z1.b[3]\n"
+                        "sdot z25.s, z13.b, z2.b[3]\n"
+                        "sdot z29.s, z13.b, z3.b[3]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "sdot z26.s, z14.b, z2.b[3]\n"
+                        "sdot z30.s, z14.b, z3.b[3]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "sdot z23.s, z15.b, z1.b[3]\n"
+                        "sdot z27.s, z15.b, z2.b[3]\n"
+                        "sdot z31.s, z15.b, z3.b[3]\n"
+                        "cbz %[blocks], 22f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "sdot z20.s, z8.b, z5.b[0]\n"
+                        "sdot z24.s, z8.b, z6.b[0]\n"
+                        "sdot z28.s, z8.b, z7.b[0]\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "sdot z21.s, z9.b, z5.b[0]\n"
+                        "sdot z25.s, z9.b, z6.b[0]\n"
+                        "sdot z29.s, z9.b, z7.b[0]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "sdot z26.s, z10.b, z6.b[0]\n"
+                        "sdot z30.s, z10.b, z7.b[0]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "sdot z23.s, z11.b, z5.b[0]\n"
+                        "sdot z27.s, z11.b, z6.b[0]\n"
+                        "sdot z31.s, z11.b, z7.b[0]\n"
+                        "b.eq 23f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "sdot z20.s, z12.b, z5.b[1]\n"
+                        "sdot z24.s, z12.b, z6.b[1]\n"
+                        "sdot z28.s, z12.b, z7.b[1]\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "sdot z21.s, z13.b, z5.b[1]\n"
+                        "sdot z25.s, z13.b, z6.b[1]\n"
+                        "sdot z29.s, z13.b, z7.b[1]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "sdot z22.s, z14.b, z5.b[1]\n"
+                        "sdot z26.s, z14.b, z6.b[1]\n"
+                        "sdot z30.s, z14.b, z7.b[1]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "sdot z23.s, z15.b, z5.b[1]\n"
+                        "sdot z27.s, z15.b, z6.b[1]\n"
+                        "sdot z31.s, z15.b, z7.b[1]\n"
+                        "b.eq 24f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "sdot z20.s, z8.b, z5.b[2]\n"
+                        "sdot z24.s, z8.b, z6.b[2]\n"
+                        "sdot z28.s, z8.b, z7.b[2]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "sdot z21.s, z9.b, z5.b[2]\n"
+                        "sdot z25.s, z9.b, z6.b[2]\n"
+                        "sdot z29.s, z9.b, z7.b[2]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "sdot z22.s, z10.b, z5.b[2]\n"
+                        "sdot z26.s, z10.b, z6.b[2]\n"
+                        "sdot z30.s, z10.b, z7.b[2]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "sdot z23.s, z11.b, z5.b[2]\n"
+                        "sdot z27.s, z11.b, z6.b[2]\n"
+                        "sdot z31.s, z11.b, z7.b[2]\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 25f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 26f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 27f\n"
+                        "26:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 27f\n"
+                        "25:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z14.b, #0\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "27:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "mov z12.b, #0\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "sdot z20.s, z12.b, z5.b[3]\n"
+                        "sdot z24.s, z12.b, z6.b[3]\n"
+                        "sdot z28.s, z12.b, z7.b[3]\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "sdot z21.s, z13.b, z5.b[3]\n"
+                        "sdot z25.s, z13.b, z6.b[3]\n"
+                        "sdot z29.s, z13.b, z7.b[3]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "sdot z22.s, z14.b, z5.b[3]\n"
+                        "sdot z26.s, z14.b, z6.b[3]\n"
+                        "sdot z30.s, z14.b, z7.b[3]\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
+                        "sdot z23.s, z15.b, z5.b[3]\n"
+                        "sdot z27.s, z15.b, z6.b[3]\n"
+                        "sdot z31.s, z15.b, z7.b[3]\n"
+                        "b 9f\n"
+                        "24:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 28f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 29f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 30f\n"
+                        "29:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 30f\n"
+                        "28:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z10.b, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "30:\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "mov z8.b, #0\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "sdot z20.s, z8.b, z5.b[2]\n"
+                        "sdot z24.s, z8.b, z6.b[2]\n"
+                        "sdot z28.s, z8.b, z7.b[2]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "sdot z21.s, z9.b, z5.b[2]\n"
+                        "sdot z25.s, z9.b, z6.b[2]\n"
+                        "sdot z29.s, z9.b, z7.b[2]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "sdot z22.s, z10.b, z5.b[2]\n"
+                        "sdot z26.s, z10.b, z6.b[2]\n"
+                        "sdot z30.s, z10.b, z7.b[2]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "sdot z23.s, z11.b, z5.b[2]\n"
+                        "sdot z27.s, z11.b, z6.b[2]\n"
+                        "sdot z31.s, z11.b, z7.b[2]\n"
+                        "b 9f\n"
+                        "23:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 31f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 32f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 33f\n"
+                        "32:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 33f\n"
+                        "31:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z14.b, #0\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "33:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "mov z12.b, #0\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "sdot z20.s, z12.b, z5.b[1]\n"
+                        "sdot z24.s, z12.b, z6.b[1]\n"
+                        "sdot z28.s, z12.b, z7.b[1]\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "sdot z21.s, z13.b, z5.b[1]\n"
+                        "sdot z25.s, z13.b, z6.b[1]\n"
+                        "sdot z29.s, z13.b, z7.b[1]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "sdot z22.s, z14.b, z5.b[1]\n"
+                        "sdot z26.s, z14.b, z6.b[1]\n"
+                        "sdot z30.s, z14.b, z7.b[1]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "sdot z23.s, z15.b, z5.b[1]\n"
+                        "sdot z27.s, z15.b, z6.b[1]\n"
+                        "sdot z31.s, z15.b, z7.b[1]\n"
+                        "b 9f\n"
+                        "22:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 34f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 35f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 36f\n"
+                        "35:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 36f\n"
+                        "34:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z10.b, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "36:\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "mov z8.b, #0\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "sdot z20.s, z8.b, z5.b[0]\n"
+                        "sdot z24.s, z8.b, z6.b[0]\n"
+                        "sdot z28.s, z8.b, z7.b[0]\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "sdot z21.s, z9.b, z5.b[0]\n"
+                        "sdot z25.s, z9.b, z6.b[0]\n"
+                        "sdot z29.s, z9.b, z7.b[0]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "sdot z26.s, z10.b, z6.b[0]\n"
+                        "sdot z30.s, z10.b, z7.b[0]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "sdot z23.s, z11.b, z5.b[0]\n"
+                        "sdot z27.s, z11.b, z6.b[0]\n"
+                        "sdot z31.s, z11.b, z7.b[0]\n"
+                        "9:\n"
+                        "st1w z16.s, p0, [%[c_ptr0]]\n"
+                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        "st1w z20.s, p0, [c_ptr1]\n"
+                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+                        "st1w z24.s, p0, [c_ptr2]\n"
+                        "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+                        "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+                        "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+                        "st1w z28.s, p0, [c_ptr3]\n"
+                        "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
+                        "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
+                        "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq a_ptr3\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        ".unreq c_ptr3\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+                    );
+                    break;
+            }
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
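The kernel above leans on two SVE idioms that are worth keeping in mind when reading the interleaving code: ld1rqb replicates one 16-byte quadword of A into every 128-bit segment of a Z register, and the indexed sdot accumulates a signed 8-bit, 4-way dot product into each 32-bit lane of the accumulator. The zip1/zip2 chains interleave four consecutive K-rows of B so that every 32-bit lane of the B operand holds the four bytes that pair with four consecutive A values along K. As a rough scalar model of one such sdot step (illustrative only, not part of the patch; all names are made up):

    #include <cstdint>
    #include <cstddef>

    // Models "sdot zAcc.s, zB.b, zA.b[g]" after the 4-way zip interleave:
    // each 32-bit lane of the accumulator gains the dot product of four
    // interleaved B bytes with the four A bytes selected by the group index.
    static void sdot_group_model(int32_t *acc, const int8_t *b_panel,
                                 const int8_t a_group[4], std::size_t lanes)
    {
        for (std::size_t lane = 0; lane < lanes; lane++)   // one int32 lane per output column
        {
            int32_t sum = 0;
            for (int j = 0; j < 4; j++)                    // four consecutive K values
            {
                sum += static_cast<int32_t>(b_panel[4 * lane + j]) *
                       static_cast<int32_t>(a_group[j]);
            }
            acc[lane] += sum;                              // accumulate, as sdot does
        }
    }

The u8u32 kernel added below follows the same structure with udot and unsigned operands.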
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp
new file mode 100644
index 0000000..bcbd3d3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+#include <cstdint>
+
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int ldb, uint32_t *C, int ldc, uint32_t beta, int M, int N, int K);
+
+class native_u8u32_dot_4VLx4
+{
+public:
+    typedef uint8_t operand_type;
+    typedef uint32_t result_type;
+
+    typedef void (*kern_type)(const uint8_t *A, int lda, const uint8_t *B, int ldb, uint32_t *C, int ldc, uint32_t beta, int M, int N, int K);
+
+    /* Kernel blocking parameters */
+    static unsigned int out_height()
+    {
+        return 4;
+    }
+
+    static unsigned int out_width()
+    {
+        return get_vector_length<uint32_t>() * 4;
+    }
+
+    static unsigned int k_unroll()
+    {
+        return 4;
+    }
+
+
+
+    // Default to the generic kernel
+    kern_type kernel=sve_native_u8u32_dot_4VLx4;
+
+    native_u8u32_dot_4VLx4(const CPUInfo *ci)
+    {
+
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
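The class above only publishes the kernel's blocking: each call produces a 4-row by 4-vector-length tile of C (out_height() x out_width()) and consumes K in 4-deep dot-product groups (k_unroll()), with any remainder handled inside the kernel itself. A minimal sketch of how a caller might size its loops from these parameters (hypothetical and simplified; plan_tiles is not arm_gemm API):

    // Hypothetical helper: derive tile counts from a strategy's blocking
    // parameters.  Strategy would be e.g. native_u8u32_dot_4VLx4.
    template <typename Strategy>
    static void plan_tiles(unsigned int M, unsigned int N, unsigned int K)
    {
        const unsigned int rows_per_tile = Strategy::out_height();   // 4 rows of C per call
        const unsigned int cols_per_tile = Strategy::out_width();    // 4 * VL uint32 columns per call
        const unsigned int row_tiles     = (M + rows_per_tile - 1) / rows_per_tile;
        const unsigned int col_tiles     = (N + cols_per_tile - 1) / cols_per_tile;
        const unsigned int full_k_groups = K / Strategy::k_unroll();  // complete 4-deep dot products
        const unsigned int k_tail        = K % Strategy::k_unroll();  // mopped up by the kernel's 'odds' path
        (void)row_tiles; (void)col_tiles; (void)full_k_groups; (void)k_tail;
    }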
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp
new file mode 100644
index 0000000..7d89948
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp
@@ -0,0 +1,4632 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int ldb, uint32_t *C, int ldc, uint32_t beta, int M, int N, int K) {
+    const long beta0 = (beta == 0u);
+    const long loops_count = ((K + 16) / 32) - 1;
+    K -= loops_count * 32;
+    const long regs_count = (K / 16) - 1;
+    K -= (regs_count + 1) * 16;
+    const long leftovers = K;
+    const long blocks_count = K / 4;
+    const long odds_count = K - (blocks_count * 4);
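+    // Illustrative reading of the decomposition above: K splits as
+    //     K = loops_count * 32 + (regs_count + 1) * 16 + leftovers
+    // where the main asm loop consumes 32 columns of K per iteration, the tail
+    // after it another 16 or 32 depending on regs_count, and the leftovers are
+    // finished as blocks_count complete 4-byte dot-product groups plus
+    // odds_count (0-3) trailing columns.  For example, K = 70 gives
+    // loops_count = 1, regs_count = 1, leftovers = 6, blocks_count = 1, odds_count = 2.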
+
+    for (int y=0; y<M; y+=4) {
+        const uint8_t * const a_ptr0_base = A + (y * lda);
+        const unsigned long ldab = lda * sizeof(uint8_t);
+
+        uint32_t *c_ptr0 = C + (y * ldc);
+        const unsigned long ldcb = ldc * sizeof(uint32_t);
+
+        for (int x0=0; x0<N; x0+=(4 * get_vector_length<uint32_t>())) {
+            const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<uint32_t>()));
+            const uint32_t *betaptr = &beta;
+            long loops = loops_count;
+            long regs = regs_count;
+            long temp = 0;
+            long blocks = blocks_count;
+            long odds = odds_count;
+            const uint8_t *a_ptr0 = a_ptr0_base;
+            const uint8_t *b_ptr0 = B + x0;
+            const uint8_t *b_ptr1 = b_ptr0 + ldb;
+            const uint8_t *b_ptr2 = b_ptr1 + ldb;
+            const uint8_t *b_ptr3 = b_ptr2 + ldb;
+            long ldbb = ldb * sizeof(uint8_t) * 4;
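+            // b_ptr0..b_ptr3 track four consecutive K-rows of B; the asm's zip1/zip2
+            // sequences interleave them into the 4-byte groups udot expects.  The
+            // asm operand [ldb] is bound to ldbb (four rows, as in the kernel above),
+            // so each pointer steps forward by four K-rows per dot-product group.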
+
+            switch(M-y) {
+                case 1:
+                    __asm __volatile (
+                        "whilelt p6.b, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "whilelt p4.b, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "ptrue p7.b\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p2.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p3.s, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.s, #0\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.s, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "mov z18.s, #0\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "mov z19.s, #0\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "b 2f\n"
+                        "1:\n"
+                        "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "mul z16.s, p7/m, z16.s, z15.s\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "mul z17.s, p7/m, z17.s, z15.s\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "mul z18.s, p7/m, z18.s, z15.s\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "mul z19.s, p7/m, z19.s, z15.s\n"
+                        "2:\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "cbz %[loops], 3f\n"
+                        "4:\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "b.ne 4b\n"
+                        "3:\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "cbz %[regs], 5f\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "cbz %[blocks], 6f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "b.eq 7f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "b.eq 8f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 10f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 11f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 12f\n"
+                        "11:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 12f\n"
+                        "10:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z14.b, #0\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "12:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "mov z12.b, #0\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "b 9f\n"
+                        "8:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 13f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 14f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 15f\n"
+                        "14:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 15f\n"
+                        "13:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z10.b, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "15:\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "mov z8.b, #0\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "b 9f\n"
+                        "7:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 16f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 17f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 18f\n"
+                        "17:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 18f\n"
+                        "16:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z14.b, #0\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "18:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "mov z12.b, #0\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "b 9f\n"
+                        "6:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 19f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 20f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 21f\n"
+                        "20:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 21f\n"
+                        "19:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z10.b, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "21:\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "mov z8.b, #0\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "b 9f\n"
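+                        // Label 5: no further full 16-byte A chunk remains; finish the lanes already held in z0
+                        // and load the final partial chunk into z4 under the leftover predicate p6.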
+                        "5:\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "cbz %[blocks], 22f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "b.eq 23f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "b.eq 24f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 25f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 26f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 27f\n"
+                        "26:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 27f\n"
+                        "25:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z14.b, #0\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "27:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "mov z12.b, #0\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "b 9f\n"
+                        "24:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 28f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 29f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 30f\n"
+                        "29:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 30f\n"
+                        "28:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z10.b, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "30:\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "mov z8.b, #0\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "b 9f\n"
+                        "23:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 31f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 32f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 33f\n"
+                        "32:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 33f\n"
+                        "31:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z14.b, #0\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "33:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "mov z12.b, #0\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "b 9f\n"
+                        "22:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 34f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 35f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 36f\n"
+                        "35:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 36f\n"
+                        "34:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z10.b, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "36:\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "mov z8.b, #0\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
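+                        // Label 9: store the four 32-bit accumulators to C under predicates p0-p3 and advance c_ptr0 by four vectors.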
+                        "9:\n"
+                        "st1w z16.s, p0, [%[c_ptr0]]\n"
+                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                    );
+                    break;
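+                // Two rows of A per pass: a_ptr1/c_ptr1 are offset by lda/ldc, row 1 accumulates into z20-z23,
+                // and its A data is kept in z1/z5.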
+                case 2:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "c_ptr1 .req X1\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "whilelt p6.b, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "whilelt p4.b, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "ptrue p7.b\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p2.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p3.s, %[temp], %[width]\n"
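+                        // p0-p3 predicate the four word-sized output vectors against width, p4 predicates the byte
+                        // loads of B, and p6 covers the leftover bytes of A.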
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.s, #0\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.s, #0\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "mov z18.s, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "mov z19.s, #0\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "mov z20.s, #0\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "mov z21.s, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "mov z22.s, #0\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "mov z23.s, #0\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 2f\n"
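+                        // Label 1: load the existing C tile for both rows and pre-scale it by beta (broadcast into z15) before accumulation.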
+                        "1:\n"
+                        "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "ld1w z20.s, p0/z, [c_ptr1]\n"
+                        "mul z16.s, p7/m, z16.s, z15.s\n"
+                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "mul z17.s, p7/m, z17.s, z15.s\n"
+                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "mul z18.s, p7/m, z18.s, z15.s\n"
+                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "mul z19.s, p7/m, z19.s, z15.s\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mul z20.s, p7/m, z20.s, z15.s\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "mul z21.s, p7/m, z21.s, z15.s\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "mul z22.s, p7/m, z22.s, z15.s\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "mul z23.s, p7/m, z23.s, z15.s\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "2:\n"
+                        "cbz %[loops], 3f\n"
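+                        // Label 4: main K loop; each iteration consumes 32 bytes from each A row (z0/z1 then z4/z5)
+                        // and the matching interleaved B rows.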
+                        "4:\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z21.s, z13.b, z1.b[3]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "udot z23.s, z15.b, z1.b[3]\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z20.s, z8.b, z5.b[0]\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z21.s, z9.b, z5.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "udot z23.s, z11.b, z5.b[0]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "udot z20.s, z12.b, z5.b[1]\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z21.s, z13.b, z5.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z22.s, z14.b, z5.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "udot z23.s, z15.b, z5.b[1]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "udot z20.s, z8.b, z5.b[2]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z21.s, z9.b, z5.b[2]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z22.s, z10.b, z5.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "udot z23.s, z11.b, z5.b[2]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "udot z20.s, z12.b, z5.b[3]\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z21.s, z13.b, z5.b[3]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z22.s, z14.b, z5.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "udot z23.s, z15.b, z5.b[3]\n"
+                        "b.ne 4b\n"
+                        "3:\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "cbz %[regs], 5f\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z21.s, z13.b, z1.b[3]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+                        "udot z23.s, z15.b, z1.b[3]\n"
+                        "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "udot z20.s, z8.b, z5.b[0]\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z21.s, z9.b, z5.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "udot z23.s, z11.b, z5.b[0]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "udot z20.s, z12.b, z5.b[1]\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z21.s, z13.b, z5.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z22.s, z14.b, z5.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "udot z23.s, z15.b, z5.b[1]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "udot z20.s, z8.b, z5.b[2]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z21.s, z9.b, z5.b[2]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "udot z22.s, z10.b, z5.b[2]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "udot z23.s, z11.b, z5.b[2]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "udot z20.s, z12.b, z5.b[3]\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "udot z21.s, z13.b, z5.b[3]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "udot z22.s, z14.b, z5.b[3]\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "udot z23.s, z15.b, z5.b[3]\n"
+                        "cbz %[blocks], 6f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "b.eq 7f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "b.eq 8f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 10f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 11f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 12f\n"
+                        "11:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 12f\n"
+                        "10:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z14.b, #0\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "12:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "mov z12.b, #0\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z21.s, z13.b, z1.b[3]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "udot z23.s, z15.b, z1.b[3]\n"
+                        "b 9f\n"
+                        "8:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 13f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 14f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 15f\n"
+                        "14:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 15f\n"
+                        "13:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z10.b, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "15:\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "mov z8.b, #0\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "b 9f\n"
+                        "7:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 16f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 17f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 18f\n"
+                        "17:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 18f\n"
+                        "16:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z14.b, #0\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "18:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "mov z12.b, #0\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "b 9f\n"
+                        "6:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 19f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 20f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 21f\n"
+                        "20:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 21f\n"
+                        "19:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z10.b, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "21:\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "mov z8.b, #0\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "b 9f\n"
+                        "5:\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z21.s, z13.b, z1.b[3]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "udot z23.s, z15.b, z1.b[3]\n"
+                        "cbz %[blocks], 22f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "udot z20.s, z8.b, z5.b[0]\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "udot z21.s, z9.b, z5.b[0]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "udot z23.s, z11.b, z5.b[0]\n"
+                        "b.eq 23f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "udot z20.s, z12.b, z5.b[1]\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "udot z21.s, z13.b, z5.b[1]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "udot z22.s, z14.b, z5.b[1]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "udot z23.s, z15.b, z5.b[1]\n"
+                        "b.eq 24f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "udot z20.s, z8.b, z5.b[2]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "udot z21.s, z9.b, z5.b[2]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "udot z22.s, z10.b, z5.b[2]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "udot z23.s, z11.b, z5.b[2]\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 25f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 26f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 27f\n"
+                        "26:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 27f\n"
+                        "25:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z14.b, #0\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "27:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "mov z12.b, #0\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "udot z20.s, z12.b, z5.b[3]\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "udot z21.s, z13.b, z5.b[3]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "udot z22.s, z14.b, z5.b[3]\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "udot z23.s, z15.b, z5.b[3]\n"
+                        "b 9f\n"
+                        "24:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 28f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 29f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 30f\n"
+                        "29:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 30f\n"
+                        "28:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z10.b, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "30:\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "mov z8.b, #0\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "udot z20.s, z8.b, z5.b[2]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "udot z21.s, z9.b, z5.b[2]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "udot z22.s, z10.b, z5.b[2]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "udot z23.s, z11.b, z5.b[2]\n"
+                        "b 9f\n"
+                        "23:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 31f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 32f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 33f\n"
+                        "32:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 33f\n"
+                        "31:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z14.b, #0\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "33:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "mov z12.b, #0\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "udot z20.s, z12.b, z5.b[1]\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "udot z21.s, z13.b, z5.b[1]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "udot z22.s, z14.b, z5.b[1]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "udot z23.s, z15.b, z5.b[1]\n"
+                        "b 9f\n"
+                        "22:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 34f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 35f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 36f\n"
+                        "35:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 36f\n"
+                        "34:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z10.b, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "36:\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "mov z8.b, #0\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "udot z20.s, z8.b, z5.b[0]\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "udot z21.s, z9.b, z5.b[0]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "udot z23.s, z11.b, z5.b[0]\n"
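+                        // Common exit: store the row-0 accumulators (z16-z19) and row-1
+                        // accumulators (z20-z23) back to C under the column predicates p0-p3.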
+                        "9:\n"
+                        "st1w z16.s, p0, [%[c_ptr0]]\n"
+                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        "st1w z20.s, p0, [c_ptr1]\n"
+                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq c_ptr1\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
+                    );
+                    break;
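+                // The case 3 body below is the same interleave-and-udot kernel, specialised
+                // for three A/C rows: a_ptr1/a_ptr2 and c_ptr1/c_ptr2 are derived from the
+                // row strides, and the accumulators extend to z16-z27.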
+                case 3:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "c_ptr1 .req X2\n"
+                        "c_ptr2 .req X3\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "whilelt p6.b, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "whilelt p4.b, %[temp], %[width]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "ptrue p7.b\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p2.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p3.s, %[temp], %[width]\n"
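+                        // Accumulator setup: either zero z16-z27, or (label 1) reload the
+                        // existing C tile and pre-multiply it by the beta value at [%[betaptr]].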
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.s, #0\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.s, #0\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "mov z18.s, #0\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                        "mov z19.s, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "mov z20.s, #0\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "mov z21.s, #0\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "mov z22.s, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "mov z23.s, #0\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "mov z24.s, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z25.s, #0\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "mov z26.s, #0\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "mov z27.s, #0\n"
+                        "b 2f\n"
+                        "1:\n"
+                        "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "ld1w z20.s, p0/z, [c_ptr1]\n"
+                        "mul z16.s, p7/m, z16.s, z15.s\n"
+                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "mul z17.s, p7/m, z17.s, z15.s\n"
+                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "mul z18.s, p7/m, z18.s, z15.s\n"
+                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "mul z19.s, p7/m, z19.s, z15.s\n"
+                        "ld1w z24.s, p0/z, [c_ptr2]\n"
+                        "mul z20.s, p7/m, z20.s, z15.s\n"
+                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "mul z21.s, p7/m, z21.s, z15.s\n"
+                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+                        "mul z22.s, p7/m, z22.s, z15.s\n"
+                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+                        "mul z23.s, p7/m, z23.s, z15.s\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mul z24.s, p7/m, z24.s, z15.s\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "mul z25.s, p7/m, z25.s, z15.s\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                        "mul z26.s, p7/m, z26.s, z15.s\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "mul z27.s, p7/m, z27.s, z15.s\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "2:\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "cbz %[loops], 3f\n"
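+                        // Label 4: main loop. B data streamed through the four b_ptr registers
+                        // is interleaved with zip1/zip2 and accumulated into z16-z27 via udot
+                        // against the A quadwords in z0-z2 (and z4-z6 for the second half).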
+                        "4:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "udot z24.s, z8.b, z2.b[0]\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z26.s, z10.b, z2.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "udot z27.s, z11.b, z2.b[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "udot z24.s, z12.b, z2.b[1]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "udot z25.s, z13.b, z2.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z26.s, z14.b, z2.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "udot z27.s, z15.b, z2.b[1]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z24.s, z8.b, z2.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "udot z25.s, z9.b, z2.b[2]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z26.s, z10.b, z2.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "udot z27.s, z11.b, z2.b[2]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "udot z24.s, z12.b, z2.b[3]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z21.s, z13.b, z1.b[3]\n"
+                        "udot z25.s, z13.b, z2.b[3]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z26.s, z14.b, z2.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "udot z23.s, z15.b, z1.b[3]\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+                        "udot z27.s, z15.b, z2.b[3]\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z8.b, z5.b[0]\n"
+                        "udot z24.s, z8.b, z6.b[0]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "udot z21.s, z9.b, z5.b[0]\n"
+                        "udot z25.s, z9.b, z6.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z26.s, z10.b, z6.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z23.s, z11.b, z5.b[0]\n"
+                        "udot z27.s, z11.b, z6.b[0]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z12.b, z5.b[1]\n"
+                        "udot z24.s, z12.b, z6.b[1]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "udot z21.s, z13.b, z5.b[1]\n"
+                        "udot z25.s, z13.b, z6.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z22.s, z14.b, z5.b[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z26.s, z14.b, z6.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z23.s, z15.b, z5.b[1]\n"
+                        "udot z27.s, z15.b, z6.b[1]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z8.b, z5.b[2]\n"
+                        "udot z24.s, z8.b, z6.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "udot z21.s, z9.b, z5.b[2]\n"
+                        "udot z25.s, z9.b, z6.b[2]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z22.s, z10.b, z5.b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z26.s, z10.b, z6.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z23.s, z11.b, z5.b[2]\n"
+                        "udot z27.s, z11.b, z6.b[2]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z12.b, z5.b[3]\n"
+                        "udot z24.s, z12.b, z6.b[3]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "udot z21.s, z13.b, z5.b[3]\n"
+                        "udot z25.s, z13.b, z6.b[3]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z22.s, z14.b, z5.b[3]\n"
+                        "udot z26.s, z14.b, z6.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "udot z23.s, z15.b, z5.b[3]\n"
+                        "udot z27.s, z15.b, z6.b[3]\n"
+                        "b.ne 4b\n"
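+                        // Label 3: loop tail. The remaining K elements are consumed under the
+                        // %[regs], %[blocks] and %[odds] counters, with predicated (p6) loads
+                        // for the partial A quadword.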
+                        "3:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "cbz %[regs], 5f\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "udot z24.s, z8.b, z2.b[0]\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z26.s, z10.b, z2.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "udot z27.s, z11.b, z2.b[0]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "udot z24.s, z12.b, z2.b[1]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "udot z25.s, z13.b, z2.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z26.s, z14.b, z2.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "udot z27.s, z15.b, z2.b[1]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z24.s, z8.b, z2.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "udot z25.s, z9.b, z2.b[2]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z26.s, z10.b, z2.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "udot z27.s, z11.b, z2.b[2]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "udot z24.s, z12.b, z2.b[3]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z21.s, z13.b, z1.b[3]\n"
+                        "udot z25.s, z13.b, z2.b[3]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z26.s, z14.b, z2.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+                        "udot z23.s, z15.b, z1.b[3]\n"
+                        "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+                        "udot z27.s, z15.b, z2.b[3]\n"
+                        "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z8.b, z5.b[0]\n"
+                        "udot z24.s, z8.b, z6.b[0]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "udot z21.s, z9.b, z5.b[0]\n"
+                        "udot z25.s, z9.b, z6.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z26.s, z10.b, z6.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z23.s, z11.b, z5.b[0]\n"
+                        "udot z27.s, z11.b, z6.b[0]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z12.b, z5.b[1]\n"
+                        "udot z24.s, z12.b, z6.b[1]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "udot z21.s, z13.b, z5.b[1]\n"
+                        "udot z25.s, z13.b, z6.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z22.s, z14.b, z5.b[1]\n"
+                        "udot z26.s, z14.b, z6.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "udot z23.s, z15.b, z5.b[1]\n"
+                        "udot z27.s, z15.b, z6.b[1]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "udot z20.s, z8.b, z5.b[2]\n"
+                        "udot z24.s, z8.b, z6.b[2]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "udot z21.s, z9.b, z5.b[2]\n"
+                        "udot z25.s, z9.b, z6.b[2]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "udot z22.s, z10.b, z5.b[2]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z26.s, z10.b, z6.b[2]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "udot z23.s, z11.b, z5.b[2]\n"
+                        "udot z27.s, z11.b, z6.b[2]\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "udot z20.s, z12.b, z5.b[3]\n"
+                        "udot z24.s, z12.b, z6.b[3]\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "udot z21.s, z13.b, z5.b[3]\n"
+                        "udot z25.s, z13.b, z6.b[3]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "udot z22.s, z14.b, z5.b[3]\n"
+                        "udot z26.s, z14.b, z6.b[3]\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "udot z23.s, z15.b, z5.b[3]\n"
+                        "udot z27.s, z15.b, z6.b[3]\n"
+                        "cbz %[blocks], 6f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "udot z24.s, z8.b, z2.b[0]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "udot z26.s, z10.b, z2.b[0]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "udot z27.s, z11.b, z2.b[0]\n"
+                        "b.eq 7f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "udot z24.s, z12.b, z2.b[1]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "udot z25.s, z13.b, z2.b[1]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "udot z26.s, z14.b, z2.b[1]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "udot z27.s, z15.b, z2.b[1]\n"
+                        "b.eq 8f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z24.s, z8.b, z2.b[2]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "udot z25.s, z9.b, z2.b[2]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "udot z26.s, z10.b, z2.b[2]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "udot z27.s, z11.b, z2.b[2]\n"
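+                        // Fewer than four B rows remain for this group: load the %[odds] rows that
+                        // exist, zero-pad the missing ones, then issue one final set of udot accumulations.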
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 10f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 11f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 12f\n"
+                        "11:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 12f\n"
+                        "10:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z14.b, #0\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "12:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "mov z12.b, #0\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "udot z24.s, z12.b, z2.b[3]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z21.s, z13.b, z1.b[3]\n"
+                        "udot z25.s, z13.b, z2.b[3]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "udot z26.s, z14.b, z2.b[3]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "udot z23.s, z15.b, z1.b[3]\n"
+                        "udot z27.s, z15.b, z2.b[3]\n"
+                        "b 9f\n"
+                        "8:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 13f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 14f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 15f\n"
+                        "14:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 15f\n"
+                        "13:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z10.b, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "15:\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "mov z8.b, #0\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z24.s, z8.b, z2.b[2]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "udot z25.s, z9.b, z2.b[2]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "udot z26.s, z10.b, z2.b[2]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "udot z27.s, z11.b, z2.b[2]\n"
+                        "b 9f\n"
+                        "7:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 16f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 17f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 18f\n"
+                        "17:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 18f\n"
+                        "16:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z14.b, #0\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "18:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "mov z12.b, #0\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "udot z24.s, z12.b, z2.b[1]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "udot z25.s, z13.b, z2.b[1]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "udot z26.s, z14.b, z2.b[1]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "udot z27.s, z15.b, z2.b[1]\n"
+                        "b 9f\n"
+                        "6:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 19f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 20f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 21f\n"
+                        "20:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 21f\n"
+                        "19:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z10.b, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "21:\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "mov z8.b, #0\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "udot z24.s, z8.b, z2.b[0]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "udot z26.s, z10.b, z2.b[0]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "udot z27.s, z11.b, z2.b[0]\n"
+                        "b 9f\n"
+                        "5:\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+                        "udot z24.s, z8.b, z2.b[0]\n"
+                        "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z6.b, p6/z, [a_ptr2]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z26.s, z10.b, z2.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "udot z27.s, z11.b, z2.b[0]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "udot z24.s, z12.b, z2.b[1]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "udot z25.s, z13.b, z2.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "udot z26.s, z14.b, z2.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "udot z27.s, z15.b, z2.b[1]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z24.s, z8.b, z2.b[2]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "udot z25.s, z9.b, z2.b[2]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z26.s, z10.b, z2.b[2]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "udot z27.s, z11.b, z2.b[2]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "udot z24.s, z12.b, z2.b[3]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z21.s, z13.b, z1.b[3]\n"
+                        "udot z25.s, z13.b, z2.b[3]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "udot z26.s, z14.b, z2.b[3]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "udot z23.s, z15.b, z1.b[3]\n"
+                        "udot z27.s, z15.b, z2.b[3]\n"
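+                        // Tail: %[blocks] counts the complete groups of four B rows still pending;
+                        // process them one group at a time before the odd leftover rows.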
+                        "cbz %[blocks], 22f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "udot z20.s, z8.b, z5.b[0]\n"
+                        "udot z24.s, z8.b, z6.b[0]\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "udot z21.s, z9.b, z5.b[0]\n"
+                        "udot z25.s, z9.b, z6.b[0]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "udot z26.s, z10.b, z6.b[0]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "udot z23.s, z11.b, z5.b[0]\n"
+                        "udot z27.s, z11.b, z6.b[0]\n"
+                        "b.eq 23f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "udot z20.s, z12.b, z5.b[1]\n"
+                        "udot z24.s, z12.b, z6.b[1]\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "udot z21.s, z13.b, z5.b[1]\n"
+                        "udot z25.s, z13.b, z6.b[1]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "udot z22.s, z14.b, z5.b[1]\n"
+                        "udot z26.s, z14.b, z6.b[1]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "udot z23.s, z15.b, z5.b[1]\n"
+                        "udot z27.s, z15.b, z6.b[1]\n"
+                        "b.eq 24f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "udot z20.s, z8.b, z5.b[2]\n"
+                        "udot z24.s, z8.b, z6.b[2]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "udot z21.s, z9.b, z5.b[2]\n"
+                        "udot z25.s, z9.b, z6.b[2]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "udot z22.s, z10.b, z5.b[2]\n"
+                        "udot z26.s, z10.b, z6.b[2]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "udot z23.s, z11.b, z5.b[2]\n"
+                        "udot z27.s, z11.b, z6.b[2]\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 25f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 26f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 27f\n"
+                        "26:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 27f\n"
+                        "25:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z14.b, #0\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "27:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "mov z12.b, #0\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "udot z20.s, z12.b, z5.b[3]\n"
+                        "udot z24.s, z12.b, z6.b[3]\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "udot z21.s, z13.b, z5.b[3]\n"
+                        "udot z25.s, z13.b, z6.b[3]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "udot z22.s, z14.b, z5.b[3]\n"
+                        "udot z26.s, z14.b, z6.b[3]\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "udot z23.s, z15.b, z5.b[3]\n"
+                        "udot z27.s, z15.b, z6.b[3]\n"
+                        "b 9f\n"
+                        "24:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 28f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 29f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 30f\n"
+                        "29:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 30f\n"
+                        "28:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z10.b, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "30:\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "mov z8.b, #0\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "udot z20.s, z8.b, z5.b[2]\n"
+                        "udot z24.s, z8.b, z6.b[2]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "udot z21.s, z9.b, z5.b[2]\n"
+                        "udot z25.s, z9.b, z6.b[2]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "udot z22.s, z10.b, z5.b[2]\n"
+                        "udot z26.s, z10.b, z6.b[2]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "udot z23.s, z11.b, z5.b[2]\n"
+                        "udot z27.s, z11.b, z6.b[2]\n"
+                        "b 9f\n"
+                        "23:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 31f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 32f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 33f\n"
+                        "32:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 33f\n"
+                        "31:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z14.b, #0\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "33:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "mov z12.b, #0\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "udot z20.s, z12.b, z5.b[1]\n"
+                        "udot z24.s, z12.b, z6.b[1]\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "udot z21.s, z13.b, z5.b[1]\n"
+                        "udot z25.s, z13.b, z6.b[1]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "udot z22.s, z14.b, z5.b[1]\n"
+                        "udot z26.s, z14.b, z6.b[1]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "udot z23.s, z15.b, z5.b[1]\n"
+                        "udot z27.s, z15.b, z6.b[1]\n"
+                        "b 9f\n"
+                        "22:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 34f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 35f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 36f\n"
+                        "35:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 36f\n"
+                        "34:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z10.b, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "36:\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "mov z8.b, #0\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "udot z20.s, z8.b, z5.b[0]\n"
+                        "udot z24.s, z8.b, z6.b[0]\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "udot z21.s, z9.b, z5.b[0]\n"
+                        "udot z25.s, z9.b, z6.b[0]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "udot z26.s, z10.b, z6.b[0]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "udot z23.s, z11.b, z5.b[0]\n"
+                        "udot z27.s, z11.b, z6.b[0]\n"
+                        "9:\n"
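+                        // Write the 32-bit accumulators back to the three C row pointers, one
+                        // predicated vector of output columns (p0-p3) at a time.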
+                        "st1w z16.s, p0, [%[c_ptr0]]\n"
+                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        "st1w z20.s, p0, [c_ptr1]\n"
+                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+                        "st1w z24.s, p0, [c_ptr2]\n"
+                        "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+                        "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+                        "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
+                    );
+                    break;
+                default:
+                case 4:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "a_ptr3 .req X2\n"
+                        "c_ptr1 .req X3\n"
+                        "c_ptr2 .req X4\n"
+                        "c_ptr3 .req X5\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
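+                        // Build the predicates: p0-p3 cover four successive vectors of output
+                        // columns, p4 covers the matching B-row bytes, p6 covers the leftover A bytes.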
+                        "whilelt p6.b, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "whilelt p4.b, %[temp], %[width]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "ptrue p7.b\n"
+                        "add a_ptr3, a_ptr2, %[lda]\n"
+                        "add c_ptr3, c_ptr2, %[ldc]\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p2.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p3.s, %[temp], %[width]\n"
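+                        // %[beta0] set: start from zeroed accumulators; %[beta0] clear: branch to
+                        // label 1, which reloads the existing C tile and pre-scales it by beta.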
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.s, #0\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.s, #0\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "mov z18.s, #0\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                        "mov z19.s, #0\n"
+                        "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                        "mov z20.s, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "mov z21.s, #0\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "mov z22.s, #0\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "mov z23.s, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "mov z24.s, #0\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "mov z25.s, #0\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "mov z26.s, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "mov z27.s, #0\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "mov z28.s, #0\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "mov z29.s, #0\n"
+                        "mov z30.s, #0\n"
+                        "mov z31.s, #0\n"
+                        "b 2f\n"
+                        "1:\n"
+                        "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "ld1w z20.s, p0/z, [c_ptr1]\n"
+                        "mul z16.s, p7/m, z16.s, z15.s\n"
+                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "mul z17.s, p7/m, z17.s, z15.s\n"
+                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "mul z18.s, p7/m, z18.s, z15.s\n"
+                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "mul z19.s, p7/m, z19.s, z15.s\n"
+                        "ld1w z24.s, p0/z, [c_ptr2]\n"
+                        "mul z20.s, p7/m, z20.s, z15.s\n"
+                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "mul z21.s, p7/m, z21.s, z15.s\n"
+                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+                        "mul z22.s, p7/m, z22.s, z15.s\n"
+                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+                        "mul z23.s, p7/m, z23.s, z15.s\n"
+                        "ld1w z28.s, p0/z, [c_ptr3]\n"
+                        "mul z24.s, p7/m, z24.s, z15.s\n"
+                        "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+                        "mul z25.s, p7/m, z25.s, z15.s\n"
+                        "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
+                        "mul z26.s, p7/m, z26.s, z15.s\n"
+                        "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
+                        "mul z27.s, p7/m, z27.s, z15.s\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mul z28.s, p7/m, z28.s, z15.s\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "mul z29.s, p7/m, z29.s, z15.s\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                        "mul z30.s, p7/m, z30.s, z15.s\n"
+                        "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                        "mul z31.s, p7/m, z31.s, z15.s\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "2:\n"
+                        "cbz %[loops], 3f\n"
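+                        // Main K loop: interleave four B rows at a time with zip1/zip2 so that each
+                        // udot dots four consecutive K bytes against the replicated A quadwords.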
+                        "4:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "udot z24.s, z8.b, z2.b[0]\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "udot z28.s, z8.b, z3.b[0]\n"
+                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z7.b, p7/z, [a_ptr3]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z29.s, z9.b, z3.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "udot z26.s, z10.b, z2.b[0]\n"
+                        "add a_ptr3, a_ptr3, #0x20\n"
+                        "udot z30.s, z10.b, z3.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "udot z27.s, z11.b, z2.b[0]\n"
+                        "udot z31.s, z11.b, z3.b[0]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "udot z24.s, z12.b, z2.b[1]\n"
+                        "udot z28.s, z12.b, z3.b[1]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "udot z25.s, z13.b, z2.b[1]\n"
+                        "udot z29.s, z13.b, z3.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "udot z26.s, z14.b, z2.b[1]\n"
+                        "udot z30.s, z14.b, z3.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "udot z27.s, z15.b, z2.b[1]\n"
+                        "udot z31.s, z15.b, z3.b[1]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z24.s, z8.b, z2.b[2]\n"
+                        "udot z28.s, z8.b, z3.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "udot z25.s, z9.b, z2.b[2]\n"
+                        "udot z29.s, z9.b, z3.b[2]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "udot z26.s, z10.b, z2.b[2]\n"
+                        "udot z30.s, z10.b, z3.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "udot z27.s, z11.b, z2.b[2]\n"
+                        "udot z31.s, z11.b, z3.b[2]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "udot z24.s, z12.b, z2.b[3]\n"
+                        "udot z28.s, z12.b, z3.b[3]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z21.s, z13.b, z1.b[3]\n"
+                        "udot z25.s, z13.b, z2.b[3]\n"
+                        "udot z29.s, z13.b, z3.b[3]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "udot z26.s, z14.b, z2.b[3]\n"
+                        "udot z30.s, z14.b, z3.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "udot z23.s, z15.b, z1.b[3]\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+                        "udot z27.s, z15.b, z2.b[3]\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
+                        "udot z31.s, z15.b, z3.b[3]\n"
+                        "ld1rqb z3.b, p7/z, [a_ptr3, #-0x10]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z8.b, z5.b[0]\n"
+                        "udot z24.s, z8.b, z6.b[0]\n"
+                        "udot z28.s, z8.b, z7.b[0]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "udot z21.s, z9.b, z5.b[0]\n"
+                        "udot z25.s, z9.b, z6.b[0]\n"
+                        "udot z29.s, z9.b, z7.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "udot z26.s, z10.b, z6.b[0]\n"
+                        "udot z30.s, z10.b, z7.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z23.s, z11.b, z5.b[0]\n"
+                        "udot z27.s, z11.b, z6.b[0]\n"
+                        "udot z31.s, z11.b, z7.b[0]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z12.b, z5.b[1]\n"
+                        "udot z24.s, z12.b, z6.b[1]\n"
+                        "udot z28.s, z12.b, z7.b[1]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "udot z21.s, z13.b, z5.b[1]\n"
+                        "udot z25.s, z13.b, z6.b[1]\n"
+                        "udot z29.s, z13.b, z7.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z22.s, z14.b, z5.b[1]\n"
+                        "udot z26.s, z14.b, z6.b[1]\n"
+                        "udot z30.s, z14.b, z7.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z23.s, z15.b, z5.b[1]\n"
+                        "udot z27.s, z15.b, z6.b[1]\n"
+                        "udot z31.s, z15.b, z7.b[1]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z8.b, z5.b[2]\n"
+                        "udot z24.s, z8.b, z6.b[2]\n"
+                        "udot z28.s, z8.b, z7.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "udot z21.s, z9.b, z5.b[2]\n"
+                        "udot z25.s, z9.b, z6.b[2]\n"
+                        "udot z29.s, z9.b, z7.b[2]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z22.s, z10.b, z5.b[2]\n"
+                        "udot z26.s, z10.b, z6.b[2]\n"
+                        "udot z30.s, z10.b, z7.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z23.s, z11.b, z5.b[2]\n"
+                        "udot z27.s, z11.b, z6.b[2]\n"
+                        "udot z31.s, z11.b, z7.b[2]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z12.b, z5.b[3]\n"
+                        "udot z24.s, z12.b, z6.b[3]\n"
+                        "udot z28.s, z12.b, z7.b[3]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "udot z21.s, z13.b, z5.b[3]\n"
+                        "udot z25.s, z13.b, z6.b[3]\n"
+                        "udot z29.s, z13.b, z7.b[3]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "udot z22.s, z14.b, z5.b[3]\n"
+                        "udot z26.s, z14.b, z6.b[3]\n"
+                        "udot z30.s, z14.b, z7.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "udot z23.s, z15.b, z5.b[3]\n"
+                        "udot z27.s, z15.b, z6.b[3]\n"
+                        "udot z31.s, z15.b, z7.b[3]\n"
+                        "b.ne 4b\n"
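+                        // Main loop finished: drain the B data already loaded, then handle any
+                        // remaining full groups and odd K rows before the results are stored.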
+                        "3:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "cbz %[regs], 5f\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "udot z24.s, z8.b, z2.b[0]\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "udot z28.s, z8.b, z3.b[0]\n"
+                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z7.b, p7/z, [a_ptr3]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z29.s, z9.b, z3.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "udot z26.s, z10.b, z2.b[0]\n"
+                        "udot z30.s, z10.b, z3.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "udot z27.s, z11.b, z2.b[0]\n"
+                        "udot z31.s, z11.b, z3.b[0]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "udot z24.s, z12.b, z2.b[1]\n"
+                        "udot z28.s, z12.b, z3.b[1]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "udot z25.s, z13.b, z2.b[1]\n"
+                        "udot z29.s, z13.b, z3.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "udot z26.s, z14.b, z2.b[1]\n"
+                        "udot z30.s, z14.b, z3.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "udot z27.s, z15.b, z2.b[1]\n"
+                        "udot z31.s, z15.b, z3.b[1]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z24.s, z8.b, z2.b[2]\n"
+                        "udot z28.s, z8.b, z3.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "udot z25.s, z9.b, z2.b[2]\n"
+                        "udot z29.s, z9.b, z3.b[2]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "udot z26.s, z10.b, z2.b[2]\n"
+                        "udot z30.s, z10.b, z3.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "udot z27.s, z11.b, z2.b[2]\n"
+                        "udot z31.s, z11.b, z3.b[2]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "udot z24.s, z12.b, z2.b[3]\n"
+                        "udot z28.s, z12.b, z3.b[3]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z21.s, z13.b, z1.b[3]\n"
+                        "udot z25.s, z13.b, z2.b[3]\n"
+                        "udot z29.s, z13.b, z3.b[3]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "udot z26.s, z14.b, z2.b[3]\n"
+                        "udot z30.s, z14.b, z3.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+                        "udot z23.s, z15.b, z1.b[3]\n"
+                        "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+                        "udot z27.s, z15.b, z2.b[3]\n"
+                        "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+                        "udot z31.s, z15.b, z3.b[3]\n"
+                        "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z8.b, z5.b[0]\n"
+                        "udot z24.s, z8.b, z6.b[0]\n"
+                        "udot z28.s, z8.b, z7.b[0]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "udot z21.s, z9.b, z5.b[0]\n"
+                        "udot z25.s, z9.b, z6.b[0]\n"
+                        "udot z29.s, z9.b, z7.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "udot z26.s, z10.b, z6.b[0]\n"
+                        "udot z30.s, z10.b, z7.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z23.s, z11.b, z5.b[0]\n"
+                        "udot z27.s, z11.b, z6.b[0]\n"
+                        "udot z31.s, z11.b, z7.b[0]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z12.b, z5.b[1]\n"
+                        "udot z24.s, z12.b, z6.b[1]\n"
+                        "udot z28.s, z12.b, z7.b[1]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "udot z21.s, z13.b, z5.b[1]\n"
+                        "udot z25.s, z13.b, z6.b[1]\n"
+                        "udot z29.s, z13.b, z7.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "udot z22.s, z14.b, z5.b[1]\n"
+                        "udot z26.s, z14.b, z6.b[1]\n"
+                        "udot z30.s, z14.b, z7.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "udot z23.s, z15.b, z5.b[1]\n"
+                        "udot z27.s, z15.b, z6.b[1]\n"
+                        "udot z31.s, z15.b, z7.b[1]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "udot z20.s, z8.b, z5.b[2]\n"
+                        "udot z24.s, z8.b, z6.b[2]\n"
+                        "udot z28.s, z8.b, z7.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "udot z21.s, z9.b, z5.b[2]\n"
+                        "udot z25.s, z9.b, z6.b[2]\n"
+                        "udot z29.s, z9.b, z7.b[2]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "udot z22.s, z10.b, z5.b[2]\n"
+                        "udot z26.s, z10.b, z6.b[2]\n"
+                        "udot z30.s, z10.b, z7.b[2]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "udot z23.s, z11.b, z5.b[2]\n"
+                        "udot z27.s, z11.b, z6.b[2]\n"
+                        "udot z31.s, z11.b, z7.b[2]\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "udot z20.s, z12.b, z5.b[3]\n"
+                        "udot z24.s, z12.b, z6.b[3]\n"
+                        "udot z28.s, z12.b, z7.b[3]\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "udot z21.s, z13.b, z5.b[3]\n"
+                        "udot z25.s, z13.b, z6.b[3]\n"
+                        "udot z29.s, z13.b, z7.b[3]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "udot z22.s, z14.b, z5.b[3]\n"
+                        "udot z26.s, z14.b, z6.b[3]\n"
+                        "udot z30.s, z14.b, z7.b[3]\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "udot z23.s, z15.b, z5.b[3]\n"
+                        "udot z27.s, z15.b, z6.b[3]\n"
+                        "udot z31.s, z15.b, z7.b[3]\n"
+                        "cbz %[blocks], 6f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "udot z24.s, z8.b, z2.b[0]\n"
+                        "udot z28.s, z8.b, z3.b[0]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
+                        "udot z29.s, z9.b, z3.b[0]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "udot z26.s, z10.b, z2.b[0]\n"
+                        "udot z30.s, z10.b, z3.b[0]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "udot z27.s, z11.b, z2.b[0]\n"
+                        "udot z31.s, z11.b, z3.b[0]\n"
+                        "b.eq 7f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "udot z24.s, z12.b, z2.b[1]\n"
+                        "udot z28.s, z12.b, z3.b[1]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "udot z25.s, z13.b, z2.b[1]\n"
+                        "udot z29.s, z13.b, z3.b[1]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "udot z26.s, z14.b, z2.b[1]\n"
+                        "udot z30.s, z14.b, z3.b[1]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "udot z27.s, z15.b, z2.b[1]\n"
+                        "udot z31.s, z15.b, z3.b[1]\n"
+                        "b.eq 8f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z24.s, z8.b, z2.b[2]\n"
+                        "udot z28.s, z8.b, z3.b[2]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "udot z25.s, z9.b, z2.b[2]\n"
+                        "udot z29.s, z9.b, z3.b[2]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "udot z26.s, z10.b, z2.b[2]\n"
+                        "udot z30.s, z10.b, z3.b[2]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "udot z27.s, z11.b, z2.b[2]\n"
+                        "udot z31.s, z11.b, z3.b[2]\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 10f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 11f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 12f\n"
+                        "11:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 12f\n"
+                        "10:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z14.b, #0\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "12:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "mov z12.b, #0\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "udot z24.s, z12.b, z2.b[3]\n"
+                        "udot z28.s, z12.b, z3.b[3]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z21.s, z13.b, z1.b[3]\n"
+                        "udot z25.s, z13.b, z2.b[3]\n"
+                        "udot z29.s, z13.b, z3.b[3]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "udot z26.s, z14.b, z2.b[3]\n"
+                        "udot z30.s, z14.b, z3.b[3]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "udot z23.s, z15.b, z1.b[3]\n"
+                        "udot z27.s, z15.b, z2.b[3]\n"
+                        "udot z31.s, z15.b, z3.b[3]\n"
+                        "b 9f\n"
+                        "8:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 13f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 14f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 15f\n"
+                        "14:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 15f\n"
+                        "13:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z10.b, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "15:\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "mov z8.b, #0\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z24.s, z8.b, z2.b[2]\n"
+                        "udot z28.s, z8.b, z3.b[2]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "udot z25.s, z9.b, z2.b[2]\n"
+                        "udot z29.s, z9.b, z3.b[2]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "udot z26.s, z10.b, z2.b[2]\n"
+                        "udot z30.s, z10.b, z3.b[2]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "udot z27.s, z11.b, z2.b[2]\n"
+                        "udot z31.s, z11.b, z3.b[2]\n"
+                        "b 9f\n"
+                        "7:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 16f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 17f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 18f\n"
+                        "17:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 18f\n"
+                        "16:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z14.b, #0\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "18:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "mov z12.b, #0\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "udot z24.s, z12.b, z2.b[1]\n"
+                        "udot z28.s, z12.b, z3.b[1]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "udot z25.s, z13.b, z2.b[1]\n"
+                        "udot z29.s, z13.b, z3.b[1]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "udot z26.s, z14.b, z2.b[1]\n"
+                        "udot z30.s, z14.b, z3.b[1]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "udot z27.s, z15.b, z2.b[1]\n"
+                        "udot z31.s, z15.b, z3.b[1]\n"
+                        "b 9f\n"
+                        "6:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 19f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 20f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 21f\n"
+                        "20:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 21f\n"
+                        "19:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z10.b, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "21:\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "mov z8.b, #0\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "udot z24.s, z8.b, z2.b[0]\n"
+                        "udot z28.s, z8.b, z3.b[0]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
+                        "udot z29.s, z9.b, z3.b[0]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "udot z26.s, z10.b, z2.b[0]\n"
+                        "udot z30.s, z10.b, z3.b[0]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "udot z27.s, z11.b, z2.b[0]\n"
+                        "udot z31.s, z11.b, z3.b[0]\n"
+                        "b 9f\n"
+                        "5:\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+                        "udot z24.s, z8.b, z2.b[0]\n"
+                        "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+                        "udot z28.s, z8.b, z3.b[0]\n"
+                        "ld1rqb z6.b, p6/z, [a_ptr2]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z7.b, p6/z, [a_ptr3]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z29.s, z9.b, z3.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "udot z26.s, z10.b, z2.b[0]\n"
+                        "udot z30.s, z10.b, z3.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "udot z27.s, z11.b, z2.b[0]\n"
+                        "udot z31.s, z11.b, z3.b[0]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "udot z24.s, z12.b, z2.b[1]\n"
+                        "udot z28.s, z12.b, z3.b[1]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "udot z25.s, z13.b, z2.b[1]\n"
+                        "udot z29.s, z13.b, z3.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "udot z26.s, z14.b, z2.b[1]\n"
+                        "udot z30.s, z14.b, z3.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "udot z27.s, z15.b, z2.b[1]\n"
+                        "udot z31.s, z15.b, z3.b[1]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z24.s, z8.b, z2.b[2]\n"
+                        "udot z28.s, z8.b, z3.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "udot z25.s, z9.b, z2.b[2]\n"
+                        "udot z29.s, z9.b, z3.b[2]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "udot z26.s, z10.b, z2.b[2]\n"
+                        "udot z30.s, z10.b, z3.b[2]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "udot z27.s, z11.b, z2.b[2]\n"
+                        "udot z31.s, z11.b, z3.b[2]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "udot z24.s, z12.b, z2.b[3]\n"
+                        "udot z28.s, z12.b, z3.b[3]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z21.s, z13.b, z1.b[3]\n"
+                        "udot z25.s, z13.b, z2.b[3]\n"
+                        "udot z29.s, z13.b, z3.b[3]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "udot z26.s, z14.b, z2.b[3]\n"
+                        "udot z30.s, z14.b, z3.b[3]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "udot z23.s, z15.b, z1.b[3]\n"
+                        "udot z27.s, z15.b, z2.b[3]\n"
+                        "udot z31.s, z15.b, z3.b[3]\n"
+                        "cbz %[blocks], 22f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "udot z20.s, z8.b, z5.b[0]\n"
+                        "udot z24.s, z8.b, z6.b[0]\n"
+                        "udot z28.s, z8.b, z7.b[0]\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "udot z21.s, z9.b, z5.b[0]\n"
+                        "udot z25.s, z9.b, z6.b[0]\n"
+                        "udot z29.s, z9.b, z7.b[0]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "udot z26.s, z10.b, z6.b[0]\n"
+                        "udot z30.s, z10.b, z7.b[0]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "udot z23.s, z11.b, z5.b[0]\n"
+                        "udot z27.s, z11.b, z6.b[0]\n"
+                        "udot z31.s, z11.b, z7.b[0]\n"
+                        "b.eq 23f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "udot z20.s, z12.b, z5.b[1]\n"
+                        "udot z24.s, z12.b, z6.b[1]\n"
+                        "udot z28.s, z12.b, z7.b[1]\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "udot z21.s, z13.b, z5.b[1]\n"
+                        "udot z25.s, z13.b, z6.b[1]\n"
+                        "udot z29.s, z13.b, z7.b[1]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "udot z22.s, z14.b, z5.b[1]\n"
+                        "udot z26.s, z14.b, z6.b[1]\n"
+                        "udot z30.s, z14.b, z7.b[1]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "udot z23.s, z15.b, z5.b[1]\n"
+                        "udot z27.s, z15.b, z6.b[1]\n"
+                        "udot z31.s, z15.b, z7.b[1]\n"
+                        "b.eq 24f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "udot z20.s, z8.b, z5.b[2]\n"
+                        "udot z24.s, z8.b, z6.b[2]\n"
+                        "udot z28.s, z8.b, z7.b[2]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "udot z21.s, z9.b, z5.b[2]\n"
+                        "udot z25.s, z9.b, z6.b[2]\n"
+                        "udot z29.s, z9.b, z7.b[2]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "udot z22.s, z10.b, z5.b[2]\n"
+                        "udot z26.s, z10.b, z6.b[2]\n"
+                        "udot z30.s, z10.b, z7.b[2]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "udot z23.s, z11.b, z5.b[2]\n"
+                        "udot z27.s, z11.b, z6.b[2]\n"
+                        "udot z31.s, z11.b, z7.b[2]\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 25f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 26f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 27f\n"
+                        "26:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 27f\n"
+                        "25:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z14.b, #0\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "27:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "mov z12.b, #0\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "udot z20.s, z12.b, z5.b[3]\n"
+                        "udot z24.s, z12.b, z6.b[3]\n"
+                        "udot z28.s, z12.b, z7.b[3]\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "udot z21.s, z13.b, z5.b[3]\n"
+                        "udot z25.s, z13.b, z6.b[3]\n"
+                        "udot z29.s, z13.b, z7.b[3]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "udot z22.s, z14.b, z5.b[3]\n"
+                        "udot z26.s, z14.b, z6.b[3]\n"
+                        "udot z30.s, z14.b, z7.b[3]\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "udot z23.s, z15.b, z5.b[3]\n"
+                        "udot z27.s, z15.b, z6.b[3]\n"
+                        "udot z31.s, z15.b, z7.b[3]\n"
+                        "b 9f\n"
+                        "24:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 28f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 29f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 30f\n"
+                        "29:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 30f\n"
+                        "28:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z10.b, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "30:\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "mov z8.b, #0\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "udot z20.s, z8.b, z5.b[2]\n"
+                        "udot z24.s, z8.b, z6.b[2]\n"
+                        "udot z28.s, z8.b, z7.b[2]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "udot z21.s, z9.b, z5.b[2]\n"
+                        "udot z25.s, z9.b, z6.b[2]\n"
+                        "udot z29.s, z9.b, z7.b[2]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "udot z22.s, z10.b, z5.b[2]\n"
+                        "udot z26.s, z10.b, z6.b[2]\n"
+                        "udot z30.s, z10.b, z7.b[2]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "udot z23.s, z11.b, z5.b[2]\n"
+                        "udot z27.s, z11.b, z6.b[2]\n"
+                        "udot z31.s, z11.b, z7.b[2]\n"
+                        "b 9f\n"
+                        "23:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 31f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 32f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 33f\n"
+                        "32:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "b 33f\n"
+                        "31:\n"
+                        "mov z13.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z14.b, #0\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "33:\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "mov z12.b, #0\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "zip1 z14.b, z15.b, z8.b\n"
+                        "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "udot z20.s, z12.b, z5.b[1]\n"
+                        "udot z24.s, z12.b, z6.b[1]\n"
+                        "udot z28.s, z12.b, z7.b[1]\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "udot z21.s, z13.b, z5.b[1]\n"
+                        "udot z25.s, z13.b, z6.b[1]\n"
+                        "udot z29.s, z13.b, z7.b[1]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "udot z22.s, z14.b, z5.b[1]\n"
+                        "udot z26.s, z14.b, z6.b[1]\n"
+                        "udot z30.s, z14.b, z7.b[1]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "udot z23.s, z15.b, z5.b[1]\n"
+                        "udot z27.s, z15.b, z6.b[1]\n"
+                        "udot z31.s, z15.b, z7.b[1]\n"
+                        "b 9f\n"
+                        "22:\n"
+                        "cbz %[odds], 9f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 34f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 35f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 36f\n"
+                        "35:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "b 36f\n"
+                        "34:\n"
+                        "mov z9.b, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z10.b, #0\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "36:\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "mov z8.b, #0\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "udot z20.s, z8.b, z5.b[0]\n"
+                        "udot z24.s, z8.b, z6.b[0]\n"
+                        "udot z28.s, z8.b, z7.b[0]\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "udot z21.s, z9.b, z5.b[0]\n"
+                        "udot z25.s, z9.b, z6.b[0]\n"
+                        "udot z29.s, z9.b, z7.b[0]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "udot z26.s, z10.b, z6.b[0]\n"
+                        "udot z30.s, z10.b, z7.b[0]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "udot z23.s, z11.b, z5.b[0]\n"
+                        "udot z27.s, z11.b, z6.b[0]\n"
+                        "udot z31.s, z11.b, z7.b[0]\n"
+                        "9:\n"
+                        "st1w z16.s, p0, [%[c_ptr0]]\n"
+                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        "st1w z20.s, p0, [c_ptr1]\n"
+                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+                        "st1w z24.s, p0, [c_ptr2]\n"
+                        "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+                        "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+                        "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+                        "st1w z28.s, p0, [c_ptr3]\n"
+                        "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
+                        "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
+                        "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq a_ptr3\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        ".unreq c_ptr3\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+                    );
+                    break;
+            }
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
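
Note on the udot-based kernel that ends here: four rows of B are fetched through b_ptr0..b_ptr3 and interleaved with zip1/zip2 so that every group of four consecutive bytes holds one output column's values for four consecutive K steps, while ld1rqb replicates a 16-byte chunk of each A row across the vector so the indexed udot operand supplies the matching four A bytes. Accumulation runs in 32-bit lanes (z16..z31, four vectors per output row). As a rough, hedged sketch of what a single indexed udot step accumulates (illustrative names, not part of the patch):

    #include <cstdint>

    // One 32-bit accumulator per output column; b_groups holds the zip-interleaved
    // B bytes (4 per column), a_group the 4 A bytes picked by the z?.b[idx] index.
    static void udot_step_ref(uint32_t *acc, const uint8_t *b_groups,
                              const uint8_t *a_group, int columns)
    {
        for (int n = 0; n < columns; n++) {
            uint32_t sum = 0;
            for (int k = 0; k < 4; k++) {
                sum += uint32_t(b_groups[4 * n + k]) * uint32_t(a_group[k]);
            }
            acc[n] += sum; // udot adds the 4-way dot product into the existing lane
        }
    }

The "mov z13.b, #0" style branches in the odd-depth tails appear to feed zero rows of B into the same interleave, so trailing K positions that do not exist contribute nothing to the accumulators.
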
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp
new file mode 100644
index 0000000..06622d6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void sve_smallK_fp32_mla_1VLx4(const float *, int, const float *, int ldb, float *, int, float, int, int, int);
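+// Hedged reading of the argument order above (it matches the definition in
+// generic.cpp): A, lda, B, ldb, C, ldc, beta, M, N, K, with the leading
+// dimensions given in elements. In the K == 1 path shown later in this patch,
+// beta only selects between zero-initialising the accumulators (beta == 0) and
+// loading the existing C block before accumulating.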
+
+class smallK_fp32_mla_1VLx4
+{
+public:
+    typedef float operand_type;
+    typedef float result_type;
+
+    typedef void (*kern_type)(const float *, int, const float *, int ldb, float *, int, float, int, int, int);
+
+    /* Kernel blocking parameters */
+    static unsigned int out_height()
+    {
+        return 4;
+    }
+
+    static unsigned int out_width()
+    {
+        return get_vector_length<float>() * 1;
+    }
+
+    static unsigned int k_unroll()
+    {
+        return 1;
+    }
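+
+    // Taken together, the blocking above means each call produces a 4-row by
+    // one-vector-length-of-floats tile of C, and the caller needs no extra K
+    // unrolling (k_unroll() == 1).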
+
+
+
+    // Default to the generic kernel
+    kern_type kernel=sve_smallK_fp32_mla_1VLx4;
+
+    smallK_fp32_mla_1VLx4(const CPUInfo *ci)
+    {
+
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4/generic.cpp
new file mode 100644
index 0000000..e2cc1d1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4/generic.cpp
@@ -0,0 +1,4264 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_smallK_fp32_mla_1VLx4(const float *A, int lda, const float *B, int ldb, float *C, int ldc, float beta, int M, int N, int K) {
+    const long beta0 = (beta == 0.0f);
+
+    const long loops_count = M / 4;
+    const long oddrow_count = M % 4;
+    const long ldab = lda * sizeof(float);
+    const long ldcb = ldc * sizeof(float);
+    const long odd_depth = K % 4;
+    const float *betaptr = &beta;
+    long ldbb = ldb * sizeof(float);
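+    // Notes on the setup above: M is split into loops_count blocks of 4 rows plus
+    // oddrow_count leftover rows handled one at a time; ldab/ldcb/ldbb are the same
+    // leading dimensions converted to byte strides for the inline assembly, and
+    // odd_depth (K % 4) sizes the predicated tail loads of A.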
+
+    for (int x0=0; x0<N; x0+=(get_vector_length<float>() * 1)) {
+        const long width = std::min((unsigned long)N-x0, (get_vector_length<float>() * 1));
+        long loops = loops_count;
+        long oddrows = oddrow_count;
+        long temp = 0;
+        const float *b_ptr0 = B + x0;
+
+        const float *a_ptr0 = A;
+
+        float *c_ptr0 = C + x0;
+
+        switch(K) {
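+            // Hedged reading: the small-K strategy specialises the body per exact
+            // depth, so each case below is a fully unrolled kernel for that K.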
+            case 1:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "ld1rqw z1.s, p6/z, [a_ptr1]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "ld1rqw z2.s, p6/z, [a_ptr2]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "ld1rqw z3.s, p6/z, [a_ptr3]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+            case 2:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "ld1rqw z1.s, p6/z, [a_ptr1]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "ld1rqw z2.s, p6/z, [a_ptr2]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "ld1rqw z3.s, p6/z, [a_ptr3]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+            case 3:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "ld1rqw z1.s, p6/z, [a_ptr1]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "ld1rqw z2.s, p6/z, [a_ptr2]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "ld1rqw z3.s, p6/z, [a_ptr3]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+            case 4:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+            case 5:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x10]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+            case 6:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x10]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+            case 7:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "fmla z28.s, z10.s, z0.s[2]\n"
+                    "fmla z29.s, z10.s, z1.s[2]\n"
+                    "fmla z30.s, z10.s, z2.s[2]\n"
+                    "fmla z31.s, z10.s, z3.s[2]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x10]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "fmla z28.s, z10.s, z1.s[2]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+            case 8:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "fmla z28.s, z10.s, z0.s[2]\n"
+                    "fmla z29.s, z10.s, z1.s[2]\n"
+                    "fmla z30.s, z10.s, z2.s[2]\n"
+                    "fmla z31.s, z10.s, z3.s[2]\n"
+                    "fmla z28.s, z11.s, z0.s[3]\n"
+                    "fmla z29.s, z11.s, z1.s[3]\n"
+                    "fmla z30.s, z11.s, z2.s[3]\n"
+                    "fmla z31.s, z11.s, z3.s[3]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "fmla z28.s, z10.s, z1.s[2]\n"
+                    "fmla z28.s, z11.s, z1.s[3]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+            case 9:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "fmla z28.s, z10.s, z0.s[2]\n"
+                    "fmla z29.s, z10.s, z1.s[2]\n"
+                    "fmla z30.s, z10.s, z2.s[2]\n"
+                    "fmla z31.s, z10.s, z3.s[2]\n"
+                    "fmla z28.s, z11.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
+                    "fmla z30.s, z11.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
+                    "fmla z31.s, z11.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
+                    "fmla z28.s, z12.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z12.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z12.s, z2.s[0]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "fmla z31.s, z12.s, z3.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "ld1rqw z2.s, p6/z, [%[a_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "fmla z28.s, z10.s, z1.s[2]\n"
+                    "fmla z28.s, z11.s, z1.s[3]\n"
+                    "fmla z28.s, z12.s, z2.s[0]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+            case 10:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "fmla z28.s, z10.s, z0.s[2]\n"
+                    "fmla z29.s, z10.s, z1.s[2]\n"
+                    "fmla z30.s, z10.s, z2.s[2]\n"
+                    "fmla z31.s, z10.s, z3.s[2]\n"
+                    "fmla z28.s, z11.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
+                    "fmla z30.s, z11.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
+                    "fmla z31.s, z11.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
+                    "fmla z28.s, z12.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z12.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z12.s, z2.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z31.s, z12.s, z3.s[0]\n"
+                    "fmla z28.s, z13.s, z0.s[1]\n"
+                    "fmla z29.s, z13.s, z1.s[1]\n"
+                    "fmla z30.s, z13.s, z2.s[1]\n"
+                    "fmla z31.s, z13.s, z3.s[1]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "ld1rqw z2.s, p6/z, [%[a_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "fmla z28.s, z10.s, z1.s[2]\n"
+                    "fmla z28.s, z11.s, z1.s[3]\n"
+                    "fmla z28.s, z12.s, z2.s[0]\n"
+                    "fmla z28.s, z13.s, z2.s[1]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
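+            // The cases below share one structure: the K rows of B are loaded once into
+            // z4..z(K+3) under predicate p0 (masked to the output width), the main loop
+            // accumulates four output rows (z28..z31) with indexed FMLAs against
+            // quadword-broadcast loads of A (p7 is all-true, p6 masks the final partial
+            // quadword when K is not a multiple of 4), and the loop after label 1 handles
+            // any leftover single rows using z28 only.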
+            case 11:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "fmla z28.s, z10.s, z0.s[2]\n"
+                    "fmla z29.s, z10.s, z1.s[2]\n"
+                    "fmla z30.s, z10.s, z2.s[2]\n"
+                    "fmla z31.s, z10.s, z3.s[2]\n"
+                    "fmla z28.s, z11.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
+                    "fmla z30.s, z11.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
+                    "fmla z31.s, z11.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
+                    "fmla z28.s, z12.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z12.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z12.s, z2.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z31.s, z12.s, z3.s[0]\n"
+                    "fmla z28.s, z13.s, z0.s[1]\n"
+                    "fmla z29.s, z13.s, z1.s[1]\n"
+                    "fmla z30.s, z13.s, z2.s[1]\n"
+                    "fmla z31.s, z13.s, z3.s[1]\n"
+                    "fmla z28.s, z14.s, z0.s[2]\n"
+                    "fmla z29.s, z14.s, z1.s[2]\n"
+                    "fmla z30.s, z14.s, z2.s[2]\n"
+                    "fmla z31.s, z14.s, z3.s[2]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "ld1rqw z2.s, p6/z, [%[a_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "fmla z28.s, z10.s, z1.s[2]\n"
+                    "fmla z28.s, z11.s, z1.s[3]\n"
+                    "fmla z28.s, z12.s, z2.s[0]\n"
+                    "fmla z28.s, z13.s, z2.s[1]\n"
+                    "fmla z28.s, z14.s, z2.s[2]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+            case 12:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "fmla z28.s, z10.s, z0.s[2]\n"
+                    "fmla z29.s, z10.s, z1.s[2]\n"
+                    "fmla z30.s, z10.s, z2.s[2]\n"
+                    "fmla z31.s, z10.s, z3.s[2]\n"
+                    "fmla z28.s, z11.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+                    "fmla z30.s, z11.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+                    "fmla z31.s, z11.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+                    "fmla z28.s, z12.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z12.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z12.s, z2.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z31.s, z12.s, z3.s[0]\n"
+                    "fmla z28.s, z13.s, z0.s[1]\n"
+                    "fmla z29.s, z13.s, z1.s[1]\n"
+                    "fmla z30.s, z13.s, z2.s[1]\n"
+                    "fmla z31.s, z13.s, z3.s[1]\n"
+                    "fmla z28.s, z14.s, z0.s[2]\n"
+                    "fmla z29.s, z14.s, z1.s[2]\n"
+                    "fmla z30.s, z14.s, z2.s[2]\n"
+                    "fmla z31.s, z14.s, z3.s[2]\n"
+                    "fmla z28.s, z15.s, z0.s[3]\n"
+                    "fmla z29.s, z15.s, z1.s[3]\n"
+                    "fmla z30.s, z15.s, z2.s[3]\n"
+                    "fmla z31.s, z15.s, z3.s[3]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "fmla z28.s, z10.s, z1.s[2]\n"
+                    "fmla z28.s, z11.s, z1.s[3]\n"
+                    "fmla z28.s, z12.s, z2.s[0]\n"
+                    "fmla z28.s, z13.s, z2.s[1]\n"
+                    "fmla z28.s, z14.s, z2.s[2]\n"
+                    "fmla z28.s, z15.s, z2.s[3]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
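+            // For K > 12 an A row no longer fits in three quadword loads, so these cases
+            // add a fourth ld1rqw at offset #0x30; p6 masks that last partial quadword for
+            // K = 13..15, while K = 16 uses the all-true predicate p7 throughout.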
+            case 13:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "fmla z28.s, z10.s, z0.s[2]\n"
+                    "fmla z29.s, z10.s, z1.s[2]\n"
+                    "fmla z30.s, z10.s, z2.s[2]\n"
+                    "fmla z31.s, z10.s, z3.s[2]\n"
+                    "fmla z28.s, z11.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "fmla z29.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+                    "fmla z30.s, z11.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+                    "fmla z31.s, z11.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+                    "fmla z28.s, z12.s, z0.s[0]\n"
+                    "fmla z29.s, z12.s, z1.s[0]\n"
+                    "fmla z30.s, z12.s, z2.s[0]\n"
+                    "fmla z31.s, z12.s, z3.s[0]\n"
+                    "fmla z28.s, z13.s, z0.s[1]\n"
+                    "fmla z29.s, z13.s, z1.s[1]\n"
+                    "fmla z30.s, z13.s, z2.s[1]\n"
+                    "fmla z31.s, z13.s, z3.s[1]\n"
+                    "fmla z28.s, z14.s, z0.s[2]\n"
+                    "fmla z29.s, z14.s, z1.s[2]\n"
+                    "fmla z30.s, z14.s, z2.s[2]\n"
+                    "fmla z31.s, z14.s, z3.s[2]\n"
+                    "fmla z28.s, z15.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z15.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
+                    "fmla z30.s, z15.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
+                    "fmla z31.s, z15.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
+                    "fmla z28.s, z16.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z16.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z16.s, z2.s[0]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "fmla z31.s, z16.s, z3.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "ld1rqw z3.s, p6/z, [%[a_ptr0], #0x30]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "fmla z28.s, z10.s, z1.s[2]\n"
+                    "fmla z28.s, z11.s, z1.s[3]\n"
+                    "fmla z28.s, z12.s, z2.s[0]\n"
+                    "fmla z28.s, z13.s, z2.s[1]\n"
+                    "fmla z28.s, z14.s, z2.s[2]\n"
+                    "fmla z28.s, z15.s, z2.s[3]\n"
+                    "fmla z28.s, z16.s, z3.s[0]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+            case 14:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "fmla z28.s, z10.s, z0.s[2]\n"
+                    "fmla z29.s, z10.s, z1.s[2]\n"
+                    "fmla z30.s, z10.s, z2.s[2]\n"
+                    "fmla z31.s, z10.s, z3.s[2]\n"
+                    "fmla z28.s, z11.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "fmla z29.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+                    "fmla z30.s, z11.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+                    "fmla z31.s, z11.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+                    "fmla z28.s, z12.s, z0.s[0]\n"
+                    "fmla z29.s, z12.s, z1.s[0]\n"
+                    "fmla z30.s, z12.s, z2.s[0]\n"
+                    "fmla z31.s, z12.s, z3.s[0]\n"
+                    "fmla z28.s, z13.s, z0.s[1]\n"
+                    "fmla z29.s, z13.s, z1.s[1]\n"
+                    "fmla z30.s, z13.s, z2.s[1]\n"
+                    "fmla z31.s, z13.s, z3.s[1]\n"
+                    "fmla z28.s, z14.s, z0.s[2]\n"
+                    "fmla z29.s, z14.s, z1.s[2]\n"
+                    "fmla z30.s, z14.s, z2.s[2]\n"
+                    "fmla z31.s, z14.s, z3.s[2]\n"
+                    "fmla z28.s, z15.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z15.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
+                    "fmla z30.s, z15.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
+                    "fmla z31.s, z15.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
+                    "fmla z28.s, z16.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z16.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z16.s, z2.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z31.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z0.s[1]\n"
+                    "fmla z29.s, z17.s, z1.s[1]\n"
+                    "fmla z30.s, z17.s, z2.s[1]\n"
+                    "fmla z31.s, z17.s, z3.s[1]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "ld1rqw z3.s, p6/z, [%[a_ptr0], #0x30]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "fmla z28.s, z10.s, z1.s[2]\n"
+                    "fmla z28.s, z11.s, z1.s[3]\n"
+                    "fmla z28.s, z12.s, z2.s[0]\n"
+                    "fmla z28.s, z13.s, z2.s[1]\n"
+                    "fmla z28.s, z14.s, z2.s[2]\n"
+                    "fmla z28.s, z15.s, z2.s[3]\n"
+                    "fmla z28.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z3.s[1]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+            case 15:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "fmla z28.s, z10.s, z0.s[2]\n"
+                    "fmla z29.s, z10.s, z1.s[2]\n"
+                    "fmla z30.s, z10.s, z2.s[2]\n"
+                    "fmla z31.s, z10.s, z3.s[2]\n"
+                    "fmla z28.s, z11.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "fmla z29.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+                    "fmla z30.s, z11.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+                    "fmla z31.s, z11.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+                    "fmla z28.s, z12.s, z0.s[0]\n"
+                    "fmla z29.s, z12.s, z1.s[0]\n"
+                    "fmla z30.s, z12.s, z2.s[0]\n"
+                    "fmla z31.s, z12.s, z3.s[0]\n"
+                    "fmla z28.s, z13.s, z0.s[1]\n"
+                    "fmla z29.s, z13.s, z1.s[1]\n"
+                    "fmla z30.s, z13.s, z2.s[1]\n"
+                    "fmla z31.s, z13.s, z3.s[1]\n"
+                    "fmla z28.s, z14.s, z0.s[2]\n"
+                    "fmla z29.s, z14.s, z1.s[2]\n"
+                    "fmla z30.s, z14.s, z2.s[2]\n"
+                    "fmla z31.s, z14.s, z3.s[2]\n"
+                    "fmla z28.s, z15.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z15.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
+                    "fmla z30.s, z15.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
+                    "fmla z31.s, z15.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
+                    "fmla z28.s, z16.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z16.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z16.s, z2.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z31.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z0.s[1]\n"
+                    "fmla z29.s, z17.s, z1.s[1]\n"
+                    "fmla z30.s, z17.s, z2.s[1]\n"
+                    "fmla z31.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z0.s[2]\n"
+                    "fmla z29.s, z18.s, z1.s[2]\n"
+                    "fmla z30.s, z18.s, z2.s[2]\n"
+                    "fmla z31.s, z18.s, z3.s[2]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "ld1rqw z3.s, p6/z, [%[a_ptr0], #0x30]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "fmla z28.s, z10.s, z1.s[2]\n"
+                    "fmla z28.s, z11.s, z1.s[3]\n"
+                    "fmla z28.s, z12.s, z2.s[0]\n"
+                    "fmla z28.s, z13.s, z2.s[1]\n"
+                    "fmla z28.s, z14.s, z2.s[2]\n"
+                    "fmla z28.s, z15.s, z2.s[3]\n"
+                    "fmla z28.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z3.s[2]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+            case 16:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z19.s, p0/z, [%[b_ptr0]]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "fmla z28.s, z10.s, z0.s[2]\n"
+                    "fmla z29.s, z10.s, z1.s[2]\n"
+                    "fmla z30.s, z10.s, z2.s[2]\n"
+                    "fmla z31.s, z10.s, z3.s[2]\n"
+                    "fmla z28.s, z11.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "fmla z29.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+                    "fmla z30.s, z11.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+                    "fmla z31.s, z11.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+                    "fmla z28.s, z12.s, z0.s[0]\n"
+                    "fmla z29.s, z12.s, z1.s[0]\n"
+                    "fmla z30.s, z12.s, z2.s[0]\n"
+                    "fmla z31.s, z12.s, z3.s[0]\n"
+                    "fmla z28.s, z13.s, z0.s[1]\n"
+                    "fmla z29.s, z13.s, z1.s[1]\n"
+                    "fmla z30.s, z13.s, z2.s[1]\n"
+                    "fmla z31.s, z13.s, z3.s[1]\n"
+                    "fmla z28.s, z14.s, z0.s[2]\n"
+                    "fmla z29.s, z14.s, z1.s[2]\n"
+                    "fmla z30.s, z14.s, z2.s[2]\n"
+                    "fmla z31.s, z14.s, z3.s[2]\n"
+                    "fmla z28.s, z15.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z15.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+                    "fmla z30.s, z15.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+                    "fmla z31.s, z15.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+                    "fmla z28.s, z16.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z16.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z16.s, z2.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z31.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z0.s[1]\n"
+                    "fmla z29.s, z17.s, z1.s[1]\n"
+                    "fmla z30.s, z17.s, z2.s[1]\n"
+                    "fmla z31.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z0.s[2]\n"
+                    "fmla z29.s, z18.s, z1.s[2]\n"
+                    "fmla z30.s, z18.s, z2.s[2]\n"
+                    "fmla z31.s, z18.s, z3.s[2]\n"
+                    "fmla z28.s, z19.s, z0.s[3]\n"
+                    "fmla z29.s, z19.s, z1.s[3]\n"
+                    "fmla z30.s, z19.s, z2.s[3]\n"
+                    "fmla z31.s, z19.s, z3.s[3]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "fmla z28.s, z10.s, z1.s[2]\n"
+                    "fmla z28.s, z11.s, z1.s[3]\n"
+                    "fmla z28.s, z12.s, z2.s[0]\n"
+                    "fmla z28.s, z13.s, z2.s[1]\n"
+                    "fmla z28.s, z14.s, z2.s[2]\n"
+                    "fmla z28.s, z15.s, z2.s[3]\n"
+                    "fmla z28.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z3.s[2]\n"
+                    "fmla z28.s, z19.s, z3.s[3]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+            case 17:
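+                // Unrolled variant for 17 accumulation steps: rows 0-16 of B are preloaded into
+                // z4-z20, the main loop accumulates four C rows (z28-z31) per iteration, and the
+                // final partial quadword of each A row is loaded under predicate p6.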
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z19.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z20.s, p0/z, [%[b_ptr0]]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "fmla z28.s, z10.s, z0.s[2]\n"
+                    "fmla z29.s, z10.s, z1.s[2]\n"
+                    "fmla z30.s, z10.s, z2.s[2]\n"
+                    "fmla z31.s, z10.s, z3.s[2]\n"
+                    "fmla z28.s, z11.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "fmla z29.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+                    "fmla z30.s, z11.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+                    "fmla z31.s, z11.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+                    "fmla z28.s, z12.s, z0.s[0]\n"
+                    "fmla z29.s, z12.s, z1.s[0]\n"
+                    "fmla z30.s, z12.s, z2.s[0]\n"
+                    "fmla z31.s, z12.s, z3.s[0]\n"
+                    "fmla z28.s, z13.s, z0.s[1]\n"
+                    "fmla z29.s, z13.s, z1.s[1]\n"
+                    "fmla z30.s, z13.s, z2.s[1]\n"
+                    "fmla z31.s, z13.s, z3.s[1]\n"
+                    "fmla z28.s, z14.s, z0.s[2]\n"
+                    "fmla z29.s, z14.s, z1.s[2]\n"
+                    "fmla z30.s, z14.s, z2.s[2]\n"
+                    "fmla z31.s, z14.s, z3.s[2]\n"
+                    "fmla z28.s, z15.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+                    "fmla z29.s, z15.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+                    "fmla z30.s, z15.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+                    "fmla z31.s, z15.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+                    "fmla z28.s, z16.s, z0.s[0]\n"
+                    "fmla z29.s, z16.s, z1.s[0]\n"
+                    "fmla z30.s, z16.s, z2.s[0]\n"
+                    "fmla z31.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z0.s[1]\n"
+                    "fmla z29.s, z17.s, z1.s[1]\n"
+                    "fmla z30.s, z17.s, z2.s[1]\n"
+                    "fmla z31.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z0.s[2]\n"
+                    "fmla z29.s, z18.s, z1.s[2]\n"
+                    "fmla z30.s, z18.s, z2.s[2]\n"
+                    "fmla z31.s, z18.s, z3.s[2]\n"
+                    "fmla z28.s, z19.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z19.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
+                    "fmla z30.s, z19.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
+                    "fmla z31.s, z19.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
+                    "fmla z28.s, z20.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z20.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z20.s, z2.s[0]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "fmla z31.s, z20.s, z3.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "fmla z28.s, z10.s, z1.s[2]\n"
+                    "fmla z28.s, z11.s, z1.s[3]\n"
+                    "fmla z28.s, z12.s, z2.s[0]\n"
+                    "fmla z28.s, z13.s, z2.s[1]\n"
+                    "fmla z28.s, z14.s, z2.s[2]\n"
+                    "fmla z28.s, z15.s, z2.s[3]\n"
+                    "fmla z28.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z3.s[2]\n"
+                    "fmla z28.s, z19.s, z3.s[3]\n"
+                    "fmla z28.s, z20.s, z0.s[0]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+            case 18:
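+                // Unrolled variant for 18 accumulation steps: B rows are preloaded into z4-z21;
+                // otherwise the structure mirrors the previous case (four-row main loop,
+                // p6-guarded tail loads, single-row odd-rows loop).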
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z19.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z20.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z21.s, p0/z, [%[b_ptr0]]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "fmla z28.s, z10.s, z0.s[2]\n"
+                    "fmla z29.s, z10.s, z1.s[2]\n"
+                    "fmla z30.s, z10.s, z2.s[2]\n"
+                    "fmla z31.s, z10.s, z3.s[2]\n"
+                    "fmla z28.s, z11.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "fmla z29.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+                    "fmla z30.s, z11.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+                    "fmla z31.s, z11.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+                    "fmla z28.s, z12.s, z0.s[0]\n"
+                    "fmla z29.s, z12.s, z1.s[0]\n"
+                    "fmla z30.s, z12.s, z2.s[0]\n"
+                    "fmla z31.s, z12.s, z3.s[0]\n"
+                    "fmla z28.s, z13.s, z0.s[1]\n"
+                    "fmla z29.s, z13.s, z1.s[1]\n"
+                    "fmla z30.s, z13.s, z2.s[1]\n"
+                    "fmla z31.s, z13.s, z3.s[1]\n"
+                    "fmla z28.s, z14.s, z0.s[2]\n"
+                    "fmla z29.s, z14.s, z1.s[2]\n"
+                    "fmla z30.s, z14.s, z2.s[2]\n"
+                    "fmla z31.s, z14.s, z3.s[2]\n"
+                    "fmla z28.s, z15.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+                    "fmla z29.s, z15.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+                    "fmla z30.s, z15.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+                    "fmla z31.s, z15.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+                    "fmla z28.s, z16.s, z0.s[0]\n"
+                    "fmla z29.s, z16.s, z1.s[0]\n"
+                    "fmla z30.s, z16.s, z2.s[0]\n"
+                    "fmla z31.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z0.s[1]\n"
+                    "fmla z29.s, z17.s, z1.s[1]\n"
+                    "fmla z30.s, z17.s, z2.s[1]\n"
+                    "fmla z31.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z0.s[2]\n"
+                    "fmla z29.s, z18.s, z1.s[2]\n"
+                    "fmla z30.s, z18.s, z2.s[2]\n"
+                    "fmla z31.s, z18.s, z3.s[2]\n"
+                    "fmla z28.s, z19.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z19.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
+                    "fmla z30.s, z19.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
+                    "fmla z31.s, z19.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
+                    "fmla z28.s, z20.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z20.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z20.s, z2.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z31.s, z20.s, z3.s[0]\n"
+                    "fmla z28.s, z21.s, z0.s[1]\n"
+                    "fmla z29.s, z21.s, z1.s[1]\n"
+                    "fmla z30.s, z21.s, z2.s[1]\n"
+                    "fmla z31.s, z21.s, z3.s[1]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "fmla z28.s, z10.s, z1.s[2]\n"
+                    "fmla z28.s, z11.s, z1.s[3]\n"
+                    "fmla z28.s, z12.s, z2.s[0]\n"
+                    "fmla z28.s, z13.s, z2.s[1]\n"
+                    "fmla z28.s, z14.s, z2.s[2]\n"
+                    "fmla z28.s, z15.s, z2.s[3]\n"
+                    "fmla z28.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z3.s[2]\n"
+                    "fmla z28.s, z19.s, z3.s[3]\n"
+                    "fmla z28.s, z20.s, z0.s[0]\n"
+                    "fmla z28.s, z21.s, z0.s[1]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+            case 19:
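+                // Unrolled variant for 19 accumulation steps: B rows are preloaded into z4-z22,
+                // giving 19 FMLAs per accumulator register in the main loop.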
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z19.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z20.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z21.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z22.s, p0/z, [%[b_ptr0]]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "fmla z28.s, z10.s, z0.s[2]\n"
+                    "fmla z29.s, z10.s, z1.s[2]\n"
+                    "fmla z30.s, z10.s, z2.s[2]\n"
+                    "fmla z31.s, z10.s, z3.s[2]\n"
+                    "fmla z28.s, z11.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "fmla z29.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+                    "fmla z30.s, z11.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+                    "fmla z31.s, z11.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+                    "fmla z28.s, z12.s, z0.s[0]\n"
+                    "fmla z29.s, z12.s, z1.s[0]\n"
+                    "fmla z30.s, z12.s, z2.s[0]\n"
+                    "fmla z31.s, z12.s, z3.s[0]\n"
+                    "fmla z28.s, z13.s, z0.s[1]\n"
+                    "fmla z29.s, z13.s, z1.s[1]\n"
+                    "fmla z30.s, z13.s, z2.s[1]\n"
+                    "fmla z31.s, z13.s, z3.s[1]\n"
+                    "fmla z28.s, z14.s, z0.s[2]\n"
+                    "fmla z29.s, z14.s, z1.s[2]\n"
+                    "fmla z30.s, z14.s, z2.s[2]\n"
+                    "fmla z31.s, z14.s, z3.s[2]\n"
+                    "fmla z28.s, z15.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+                    "fmla z29.s, z15.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+                    "fmla z30.s, z15.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+                    "fmla z31.s, z15.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+                    "fmla z28.s, z16.s, z0.s[0]\n"
+                    "fmla z29.s, z16.s, z1.s[0]\n"
+                    "fmla z30.s, z16.s, z2.s[0]\n"
+                    "fmla z31.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z0.s[1]\n"
+                    "fmla z29.s, z17.s, z1.s[1]\n"
+                    "fmla z30.s, z17.s, z2.s[1]\n"
+                    "fmla z31.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z0.s[2]\n"
+                    "fmla z29.s, z18.s, z1.s[2]\n"
+                    "fmla z30.s, z18.s, z2.s[2]\n"
+                    "fmla z31.s, z18.s, z3.s[2]\n"
+                    "fmla z28.s, z19.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z19.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
+                    "fmla z30.s, z19.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
+                    "fmla z31.s, z19.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
+                    "fmla z28.s, z20.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z20.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z20.s, z2.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z31.s, z20.s, z3.s[0]\n"
+                    "fmla z28.s, z21.s, z0.s[1]\n"
+                    "fmla z29.s, z21.s, z1.s[1]\n"
+                    "fmla z30.s, z21.s, z2.s[1]\n"
+                    "fmla z31.s, z21.s, z3.s[1]\n"
+                    "fmla z28.s, z22.s, z0.s[2]\n"
+                    "fmla z29.s, z22.s, z1.s[2]\n"
+                    "fmla z30.s, z22.s, z2.s[2]\n"
+                    "fmla z31.s, z22.s, z3.s[2]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "fmla z28.s, z10.s, z1.s[2]\n"
+                    "fmla z28.s, z11.s, z1.s[3]\n"
+                    "fmla z28.s, z12.s, z2.s[0]\n"
+                    "fmla z28.s, z13.s, z2.s[1]\n"
+                    "fmla z28.s, z14.s, z2.s[2]\n"
+                    "fmla z28.s, z15.s, z2.s[3]\n"
+                    "fmla z28.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z3.s[2]\n"
+                    "fmla z28.s, z19.s, z3.s[3]\n"
+                    "fmla z28.s, z20.s, z0.s[0]\n"
+                    "fmla z28.s, z21.s, z0.s[1]\n"
+                    "fmla z28.s, z22.s, z0.s[2]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+            case 20:
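+                // Unrolled variant for 20 accumulation steps: B rows are preloaded into z4-z23.
+                // Every A quadword is full here, so all A loads use the all-true predicate p7.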
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z19.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z20.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z21.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z22.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z23.s, p0/z, [%[b_ptr0]]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "fmla z28.s, z10.s, z0.s[2]\n"
+                    "fmla z29.s, z10.s, z1.s[2]\n"
+                    "fmla z30.s, z10.s, z2.s[2]\n"
+                    "fmla z31.s, z10.s, z3.s[2]\n"
+                    "fmla z28.s, z11.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "fmla z29.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+                    "fmla z30.s, z11.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+                    "fmla z31.s, z11.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+                    "fmla z28.s, z12.s, z0.s[0]\n"
+                    "fmla z29.s, z12.s, z1.s[0]\n"
+                    "fmla z30.s, z12.s, z2.s[0]\n"
+                    "fmla z31.s, z12.s, z3.s[0]\n"
+                    "fmla z28.s, z13.s, z0.s[1]\n"
+                    "fmla z29.s, z13.s, z1.s[1]\n"
+                    "fmla z30.s, z13.s, z2.s[1]\n"
+                    "fmla z31.s, z13.s, z3.s[1]\n"
+                    "fmla z28.s, z14.s, z0.s[2]\n"
+                    "fmla z29.s, z14.s, z1.s[2]\n"
+                    "fmla z30.s, z14.s, z2.s[2]\n"
+                    "fmla z31.s, z14.s, z3.s[2]\n"
+                    "fmla z28.s, z15.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+                    "fmla z29.s, z15.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+                    "fmla z30.s, z15.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+                    "fmla z31.s, z15.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+                    "fmla z28.s, z16.s, z0.s[0]\n"
+                    "fmla z29.s, z16.s, z1.s[0]\n"
+                    "fmla z30.s, z16.s, z2.s[0]\n"
+                    "fmla z31.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z0.s[1]\n"
+                    "fmla z29.s, z17.s, z1.s[1]\n"
+                    "fmla z30.s, z17.s, z2.s[1]\n"
+                    "fmla z31.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z0.s[2]\n"
+                    "fmla z29.s, z18.s, z1.s[2]\n"
+                    "fmla z30.s, z18.s, z2.s[2]\n"
+                    "fmla z31.s, z18.s, z3.s[2]\n"
+                    "fmla z28.s, z19.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z19.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+                    "fmla z30.s, z19.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+                    "fmla z31.s, z19.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+                    "fmla z28.s, z20.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z20.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z20.s, z2.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z31.s, z20.s, z3.s[0]\n"
+                    "fmla z28.s, z21.s, z0.s[1]\n"
+                    "fmla z29.s, z21.s, z1.s[1]\n"
+                    "fmla z30.s, z21.s, z2.s[1]\n"
+                    "fmla z31.s, z21.s, z3.s[1]\n"
+                    "fmla z28.s, z22.s, z0.s[2]\n"
+                    "fmla z29.s, z22.s, z1.s[2]\n"
+                    "fmla z30.s, z22.s, z2.s[2]\n"
+                    "fmla z31.s, z22.s, z3.s[2]\n"
+                    "fmla z28.s, z23.s, z0.s[3]\n"
+                    "fmla z29.s, z23.s, z1.s[3]\n"
+                    "fmla z30.s, z23.s, z2.s[3]\n"
+                    "fmla z31.s, z23.s, z3.s[3]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "fmla z28.s, z10.s, z1.s[2]\n"
+                    "fmla z28.s, z11.s, z1.s[3]\n"
+                    "fmla z28.s, z12.s, z2.s[0]\n"
+                    "fmla z28.s, z13.s, z2.s[1]\n"
+                    "fmla z28.s, z14.s, z2.s[2]\n"
+                    "fmla z28.s, z15.s, z2.s[3]\n"
+                    "fmla z28.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z3.s[2]\n"
+                    "fmla z28.s, z19.s, z3.s[3]\n"
+                    "fmla z28.s, z20.s, z0.s[0]\n"
+                    "fmla z28.s, z21.s, z0.s[1]\n"
+                    "fmla z28.s, z22.s, z0.s[2]\n"
+                    "fmla z28.s, z23.s, z0.s[3]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+            case 21:
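+                // Unrolled variant for 21 accumulation steps: B rows are preloaded into z4-z24;
+                // the loop body again accumulates four C rows per iteration.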
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z19.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z20.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z21.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z22.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z23.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z24.s, p0/z, [%[b_ptr0]]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "fmla z28.s, z10.s, z0.s[2]\n"
+                    "fmla z29.s, z10.s, z1.s[2]\n"
+                    "fmla z30.s, z10.s, z2.s[2]\n"
+                    "fmla z31.s, z10.s, z3.s[2]\n"
+                    "fmla z28.s, z11.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "fmla z29.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+                    "fmla z30.s, z11.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+                    "fmla z31.s, z11.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+                    "fmla z28.s, z12.s, z0.s[0]\n"
+                    "fmla z29.s, z12.s, z1.s[0]\n"
+                    "fmla z30.s, z12.s, z2.s[0]\n"
+                    "fmla z31.s, z12.s, z3.s[0]\n"
+                    "fmla z28.s, z13.s, z0.s[1]\n"
+                    "fmla z29.s, z13.s, z1.s[1]\n"
+                    "fmla z30.s, z13.s, z2.s[1]\n"
+                    "fmla z31.s, z13.s, z3.s[1]\n"
+                    "fmla z28.s, z14.s, z0.s[2]\n"
+                    "fmla z29.s, z14.s, z1.s[2]\n"
+                    "fmla z30.s, z14.s, z2.s[2]\n"
+                    "fmla z31.s, z14.s, z3.s[2]\n"
+                    "fmla z28.s, z15.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+                    "fmla z29.s, z15.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+                    "fmla z30.s, z15.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+                    "fmla z31.s, z15.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+                    "fmla z28.s, z16.s, z0.s[0]\n"
+                    "fmla z29.s, z16.s, z1.s[0]\n"
+                    "fmla z30.s, z16.s, z2.s[0]\n"
+                    "fmla z31.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z0.s[1]\n"
+                    "fmla z29.s, z17.s, z1.s[1]\n"
+                    "fmla z30.s, z17.s, z2.s[1]\n"
+                    "fmla z31.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z0.s[2]\n"
+                    "fmla z29.s, z18.s, z1.s[2]\n"
+                    "fmla z30.s, z18.s, z2.s[2]\n"
+                    "fmla z31.s, z18.s, z3.s[2]\n"
+                    "fmla z28.s, z19.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+                    "fmla z29.s, z19.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+                    "fmla z30.s, z19.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+                    "fmla z31.s, z19.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+                    "fmla z28.s, z20.s, z0.s[0]\n"
+                    "fmla z29.s, z20.s, z1.s[0]\n"
+                    "fmla z30.s, z20.s, z2.s[0]\n"
+                    "fmla z31.s, z20.s, z3.s[0]\n"
+                    "fmla z28.s, z21.s, z0.s[1]\n"
+                    "fmla z29.s, z21.s, z1.s[1]\n"
+                    "fmla z30.s, z21.s, z2.s[1]\n"
+                    "fmla z31.s, z21.s, z3.s[1]\n"
+                    "fmla z28.s, z22.s, z0.s[2]\n"
+                    "fmla z29.s, z22.s, z1.s[2]\n"
+                    "fmla z30.s, z22.s, z2.s[2]\n"
+                    "fmla z31.s, z22.s, z3.s[2]\n"
+                    "fmla z28.s, z23.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z23.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
+                    "fmla z30.s, z23.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
+                    "fmla z31.s, z23.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
+                    "fmla z28.s, z24.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z24.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z24.s, z2.s[0]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "fmla z31.s, z24.s, z3.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "fmla z28.s, z10.s, z1.s[2]\n"
+                    "fmla z28.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x50]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z12.s, z2.s[0]\n"
+                    "fmla z28.s, z13.s, z2.s[1]\n"
+                    "fmla z28.s, z14.s, z2.s[2]\n"
+                    "fmla z28.s, z15.s, z2.s[3]\n"
+                    "fmla z28.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z3.s[2]\n"
+                    "fmla z28.s, z19.s, z3.s[3]\n"
+                    "fmla z28.s, z20.s, z0.s[0]\n"
+                    "fmla z28.s, z21.s, z0.s[1]\n"
+                    "fmla z28.s, z22.s, z0.s[2]\n"
+                    "fmla z28.s, z23.s, z0.s[3]\n"
+                    "fmla z28.s, z24.s, z1.s[0]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+            case 22:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z19.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z20.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z21.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z22.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z23.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z24.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z25.s, p0/z, [%[b_ptr0]]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "fmla z28.s, z10.s, z0.s[2]\n"
+                    "fmla z29.s, z10.s, z1.s[2]\n"
+                    "fmla z30.s, z10.s, z2.s[2]\n"
+                    "fmla z31.s, z10.s, z3.s[2]\n"
+                    "fmla z28.s, z11.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "fmla z29.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+                    "fmla z30.s, z11.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+                    "fmla z31.s, z11.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+                    "fmla z28.s, z12.s, z0.s[0]\n"
+                    "fmla z29.s, z12.s, z1.s[0]\n"
+                    "fmla z30.s, z12.s, z2.s[0]\n"
+                    "fmla z31.s, z12.s, z3.s[0]\n"
+                    "fmla z28.s, z13.s, z0.s[1]\n"
+                    "fmla z29.s, z13.s, z1.s[1]\n"
+                    "fmla z30.s, z13.s, z2.s[1]\n"
+                    "fmla z31.s, z13.s, z3.s[1]\n"
+                    "fmla z28.s, z14.s, z0.s[2]\n"
+                    "fmla z29.s, z14.s, z1.s[2]\n"
+                    "fmla z30.s, z14.s, z2.s[2]\n"
+                    "fmla z31.s, z14.s, z3.s[2]\n"
+                    "fmla z28.s, z15.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+                    "fmla z29.s, z15.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+                    "fmla z30.s, z15.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+                    "fmla z31.s, z15.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+                    "fmla z28.s, z16.s, z0.s[0]\n"
+                    "fmla z29.s, z16.s, z1.s[0]\n"
+                    "fmla z30.s, z16.s, z2.s[0]\n"
+                    "fmla z31.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z0.s[1]\n"
+                    "fmla z29.s, z17.s, z1.s[1]\n"
+                    "fmla z30.s, z17.s, z2.s[1]\n"
+                    "fmla z31.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z0.s[2]\n"
+                    "fmla z29.s, z18.s, z1.s[2]\n"
+                    "fmla z30.s, z18.s, z2.s[2]\n"
+                    "fmla z31.s, z18.s, z3.s[2]\n"
+                    "fmla z28.s, z19.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+                    "fmla z29.s, z19.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+                    "fmla z30.s, z19.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+                    "fmla z31.s, z19.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+                    "fmla z28.s, z20.s, z0.s[0]\n"
+                    "fmla z29.s, z20.s, z1.s[0]\n"
+                    "fmla z30.s, z20.s, z2.s[0]\n"
+                    "fmla z31.s, z20.s, z3.s[0]\n"
+                    "fmla z28.s, z21.s, z0.s[1]\n"
+                    "fmla z29.s, z21.s, z1.s[1]\n"
+                    "fmla z30.s, z21.s, z2.s[1]\n"
+                    "fmla z31.s, z21.s, z3.s[1]\n"
+                    "fmla z28.s, z22.s, z0.s[2]\n"
+                    "fmla z29.s, z22.s, z1.s[2]\n"
+                    "fmla z30.s, z22.s, z2.s[2]\n"
+                    "fmla z31.s, z22.s, z3.s[2]\n"
+                    "fmla z28.s, z23.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z23.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
+                    "fmla z30.s, z23.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
+                    "fmla z31.s, z23.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
+                    "fmla z28.s, z24.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z24.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z24.s, z2.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z31.s, z24.s, z3.s[0]\n"
+                    "fmla z28.s, z25.s, z0.s[1]\n"
+                    "fmla z29.s, z25.s, z1.s[1]\n"
+                    "fmla z30.s, z25.s, z2.s[1]\n"
+                    "fmla z31.s, z25.s, z3.s[1]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "fmla z28.s, z10.s, z1.s[2]\n"
+                    "fmla z28.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x50]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z12.s, z2.s[0]\n"
+                    "fmla z28.s, z13.s, z2.s[1]\n"
+                    "fmla z28.s, z14.s, z2.s[2]\n"
+                    "fmla z28.s, z15.s, z2.s[3]\n"
+                    "fmla z28.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z3.s[2]\n"
+                    "fmla z28.s, z19.s, z3.s[3]\n"
+                    "fmla z28.s, z20.s, z0.s[0]\n"
+                    "fmla z28.s, z21.s, z0.s[1]\n"
+                    "fmla z28.s, z22.s, z0.s[2]\n"
+                    "fmla z28.s, z23.s, z0.s[3]\n"
+                    "fmla z28.s, z24.s, z1.s[0]\n"
+                    "fmla z28.s, z25.s, z1.s[1]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+            case 23:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z19.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z20.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z21.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z22.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z23.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z24.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z25.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z26.s, p0/z, [%[b_ptr0]]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "fmla z28.s, z10.s, z0.s[2]\n"
+                    "fmla z29.s, z10.s, z1.s[2]\n"
+                    "fmla z30.s, z10.s, z2.s[2]\n"
+                    "fmla z31.s, z10.s, z3.s[2]\n"
+                    "fmla z28.s, z11.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "fmla z29.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+                    "fmla z30.s, z11.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+                    "fmla z31.s, z11.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+                    "fmla z28.s, z12.s, z0.s[0]\n"
+                    "fmla z29.s, z12.s, z1.s[0]\n"
+                    "fmla z30.s, z12.s, z2.s[0]\n"
+                    "fmla z31.s, z12.s, z3.s[0]\n"
+                    "fmla z28.s, z13.s, z0.s[1]\n"
+                    "fmla z29.s, z13.s, z1.s[1]\n"
+                    "fmla z30.s, z13.s, z2.s[1]\n"
+                    "fmla z31.s, z13.s, z3.s[1]\n"
+                    "fmla z28.s, z14.s, z0.s[2]\n"
+                    "fmla z29.s, z14.s, z1.s[2]\n"
+                    "fmla z30.s, z14.s, z2.s[2]\n"
+                    "fmla z31.s, z14.s, z3.s[2]\n"
+                    "fmla z28.s, z15.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+                    "fmla z29.s, z15.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+                    "fmla z30.s, z15.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+                    "fmla z31.s, z15.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+                    "fmla z28.s, z16.s, z0.s[0]\n"
+                    "fmla z29.s, z16.s, z1.s[0]\n"
+                    "fmla z30.s, z16.s, z2.s[0]\n"
+                    "fmla z31.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z0.s[1]\n"
+                    "fmla z29.s, z17.s, z1.s[1]\n"
+                    "fmla z30.s, z17.s, z2.s[1]\n"
+                    "fmla z31.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z0.s[2]\n"
+                    "fmla z29.s, z18.s, z1.s[2]\n"
+                    "fmla z30.s, z18.s, z2.s[2]\n"
+                    "fmla z31.s, z18.s, z3.s[2]\n"
+                    "fmla z28.s, z19.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+                    "fmla z29.s, z19.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+                    "fmla z30.s, z19.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+                    "fmla z31.s, z19.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+                    "fmla z28.s, z20.s, z0.s[0]\n"
+                    "fmla z29.s, z20.s, z1.s[0]\n"
+                    "fmla z30.s, z20.s, z2.s[0]\n"
+                    "fmla z31.s, z20.s, z3.s[0]\n"
+                    "fmla z28.s, z21.s, z0.s[1]\n"
+                    "fmla z29.s, z21.s, z1.s[1]\n"
+                    "fmla z30.s, z21.s, z2.s[1]\n"
+                    "fmla z31.s, z21.s, z3.s[1]\n"
+                    "fmla z28.s, z22.s, z0.s[2]\n"
+                    "fmla z29.s, z22.s, z1.s[2]\n"
+                    "fmla z30.s, z22.s, z2.s[2]\n"
+                    "fmla z31.s, z22.s, z3.s[2]\n"
+                    "fmla z28.s, z23.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z23.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
+                    "fmla z30.s, z23.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
+                    "fmla z31.s, z23.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
+                    "fmla z28.s, z24.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z24.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z24.s, z2.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z31.s, z24.s, z3.s[0]\n"
+                    "fmla z28.s, z25.s, z0.s[1]\n"
+                    "fmla z29.s, z25.s, z1.s[1]\n"
+                    "fmla z30.s, z25.s, z2.s[1]\n"
+                    "fmla z31.s, z25.s, z3.s[1]\n"
+                    "fmla z28.s, z26.s, z0.s[2]\n"
+                    "fmla z29.s, z26.s, z1.s[2]\n"
+                    "fmla z30.s, z26.s, z2.s[2]\n"
+                    "fmla z31.s, z26.s, z3.s[2]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "fmla z28.s, z10.s, z1.s[2]\n"
+                    "fmla z28.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x50]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z12.s, z2.s[0]\n"
+                    "fmla z28.s, z13.s, z2.s[1]\n"
+                    "fmla z28.s, z14.s, z2.s[2]\n"
+                    "fmla z28.s, z15.s, z2.s[3]\n"
+                    "fmla z28.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z3.s[2]\n"
+                    "fmla z28.s, z19.s, z3.s[3]\n"
+                    "fmla z28.s, z20.s, z0.s[0]\n"
+                    "fmla z28.s, z21.s, z0.s[1]\n"
+                    "fmla z28.s, z22.s, z0.s[2]\n"
+                    "fmla z28.s, z23.s, z0.s[3]\n"
+                    "fmla z28.s, z24.s, z1.s[0]\n"
+                    "fmla z28.s, z25.s, z1.s[1]\n"
+                    "fmla z28.s, z26.s, z1.s[2]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+            default:
+            case 24:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z19.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z20.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z21.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z22.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z23.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z24.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z25.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z26.s, p0/z, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                    "ld1w z27.s, p0/z, [%[b_ptr0]]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "fmla z28.s, z10.s, z0.s[2]\n"
+                    "fmla z29.s, z10.s, z1.s[2]\n"
+                    "fmla z30.s, z10.s, z2.s[2]\n"
+                    "fmla z31.s, z10.s, z3.s[2]\n"
+                    "fmla z28.s, z11.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "fmla z29.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+                    "fmla z30.s, z11.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+                    "fmla z31.s, z11.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+                    "fmla z28.s, z12.s, z0.s[0]\n"
+                    "fmla z29.s, z12.s, z1.s[0]\n"
+                    "fmla z30.s, z12.s, z2.s[0]\n"
+                    "fmla z31.s, z12.s, z3.s[0]\n"
+                    "fmla z28.s, z13.s, z0.s[1]\n"
+                    "fmla z29.s, z13.s, z1.s[1]\n"
+                    "fmla z30.s, z13.s, z2.s[1]\n"
+                    "fmla z31.s, z13.s, z3.s[1]\n"
+                    "fmla z28.s, z14.s, z0.s[2]\n"
+                    "fmla z29.s, z14.s, z1.s[2]\n"
+                    "fmla z30.s, z14.s, z2.s[2]\n"
+                    "fmla z31.s, z14.s, z3.s[2]\n"
+                    "fmla z28.s, z15.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+                    "fmla z29.s, z15.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+                    "fmla z30.s, z15.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+                    "fmla z31.s, z15.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+                    "fmla z28.s, z16.s, z0.s[0]\n"
+                    "fmla z29.s, z16.s, z1.s[0]\n"
+                    "fmla z30.s, z16.s, z2.s[0]\n"
+                    "fmla z31.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z0.s[1]\n"
+                    "fmla z29.s, z17.s, z1.s[1]\n"
+                    "fmla z30.s, z17.s, z2.s[1]\n"
+                    "fmla z31.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z0.s[2]\n"
+                    "fmla z29.s, z18.s, z1.s[2]\n"
+                    "fmla z30.s, z18.s, z2.s[2]\n"
+                    "fmla z31.s, z18.s, z3.s[2]\n"
+                    "fmla z28.s, z19.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+                    "fmla z29.s, z19.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+                    "fmla z30.s, z19.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+                    "fmla z31.s, z19.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+                    "fmla z28.s, z20.s, z0.s[0]\n"
+                    "fmla z29.s, z20.s, z1.s[0]\n"
+                    "fmla z30.s, z20.s, z2.s[0]\n"
+                    "fmla z31.s, z20.s, z3.s[0]\n"
+                    "fmla z28.s, z21.s, z0.s[1]\n"
+                    "fmla z29.s, z21.s, z1.s[1]\n"
+                    "fmla z30.s, z21.s, z2.s[1]\n"
+                    "fmla z31.s, z21.s, z3.s[1]\n"
+                    "fmla z28.s, z22.s, z0.s[2]\n"
+                    "fmla z29.s, z22.s, z1.s[2]\n"
+                    "fmla z30.s, z22.s, z2.s[2]\n"
+                    "fmla z31.s, z22.s, z3.s[2]\n"
+                    "fmla z28.s, z23.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x50]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z23.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x50]\n"
+                    "fmla z30.s, z23.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x50]\n"
+                    "fmla z31.s, z23.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x50]\n"
+                    "fmla z28.s, z24.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z24.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z24.s, z2.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z31.s, z24.s, z3.s[0]\n"
+                    "fmla z28.s, z25.s, z0.s[1]\n"
+                    "fmla z29.s, z25.s, z1.s[1]\n"
+                    "fmla z30.s, z25.s, z2.s[1]\n"
+                    "fmla z31.s, z25.s, z3.s[1]\n"
+                    "fmla z28.s, z26.s, z0.s[2]\n"
+                    "fmla z29.s, z26.s, z1.s[2]\n"
+                    "fmla z30.s, z26.s, z2.s[2]\n"
+                    "fmla z31.s, z26.s, z3.s[2]\n"
+                    "fmla z28.s, z27.s, z0.s[3]\n"
+                    "fmla z29.s, z27.s, z1.s[3]\n"
+                    "fmla z30.s, z27.s, z2.s[3]\n"
+                    "fmla z31.s, z27.s, z3.s[3]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "fmla z28.s, z10.s, z1.s[2]\n"
+                    "fmla z28.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x50]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z12.s, z2.s[0]\n"
+                    "fmla z28.s, z13.s, z2.s[1]\n"
+                    "fmla z28.s, z14.s, z2.s[2]\n"
+                    "fmla z28.s, z15.s, z2.s[3]\n"
+                    "fmla z28.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z3.s[2]\n"
+                    "fmla z28.s, z19.s, z3.s[3]\n"
+                    "fmla z28.s, z20.s, z0.s[0]\n"
+                    "fmla z28.s, z21.s, z0.s[1]\n"
+                    "fmla z28.s, z22.s, z0.s[2]\n"
+                    "fmla z28.s, z23.s, z0.s[3]\n"
+                    "fmla z28.s, z24.s, z1.s[0]\n"
+                    "fmla z28.s, z25.s, z1.s[1]\n"
+                    "fmla z28.s, z26.s, z1.s[2]\n"
+                    "fmla z28.s, z27.s, z1.s[3]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
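
Editor's note: the K = 21 through K = 24 cases above (and the smaller-K cases earlier in this file) all share one shape: every B row needed for the chosen K is preloaded into its own vector register (z4 upwards), the main loop then processes four A rows at a time with indexed fmla accumulation into z28-z31, and a tail loop handles leftover rows one at a time. As a plain scalar reference for what one such block computes, the sketch below may help; it is illustrative only (names are hypothetical, strides are element strides rather than the byte strides the assembly receives), and it mirrors the fact that in the cases shown beta only selects between overwriting C (beta == 0) and accumulating onto it.

    // Hypothetical scalar reference for one output block of the kernel above.
    // rows <= 4, cols <= one SVE vector of floats; the cases shown run up to K = 24.
    static void reference_block(const float *A, int lda, const float *B, int ldb,
                                float *C, int ldc, float beta,
                                int rows, int cols, int K) {
        for (int r = 0; r < rows; r++) {
            for (int c = 0; c < cols; c++) {
                // beta == 0 zeroes the accumulator (the "beta0" branch in the assembly);
                // otherwise the existing C value is loaded and accumulated onto.
                float acc = (beta == 0.0f) ? 0.0f : C[r * ldc + c];
                for (int k = 0; k < K; k++) {
                    acc += A[r * lda + k] * B[k * ldb + c];
                }
                C[r * ldc + c] = acc;
            }
        }
    }
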
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp
new file mode 100644
index 0000000..022efdf
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void sve_smallK_hybrid_fp32_mla_1VLx4(const float *, int, const float *, float *, int, float, int, int, int);
+
+class smallK_hybrid_fp32_mla_1VLx4
+{
+public:
+    typedef float operand_type;
+    typedef float result_type;
+
+    typedef void (*kern_type)(const float *, int, const float *, float *, int, float, int, int, int);
+
+    /* Kernel blocking parameters */
+    static unsigned int out_height()
+    {
+        return 4;
+    }
+
+    static unsigned int out_width()
+    {
+        return get_vector_length<float>() * 1;
+    }
+
+    static unsigned int k_unroll()
+    {
+        return 1;
+    }
+
+    StdTransformsSVE<operand_type, result_type, 4, 1, 1> transforms = {};
+
+    // Default to the generic kernel
+    kern_type kernel=sve_smallK_hybrid_fp32_mla_1VLx4;
+
+    smallK_hybrid_fp32_mla_1VLx4(const CPUInfo *ci)
+    {
+
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
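
Editor's note: the class above publishes the blocking geometry the assembly assumes: a 4-row by one-SVE-vector-of-floats output tile with no extra K unrolling. A caller sizing its tile grid from these hooks could, for instance, use the hypothetical helper below (not part of the patch):

    // Ceiling division over the advertised tile sizes.
    static inline unsigned int tiles_needed(unsigned int dim, unsigned int tile) {
        return (dim + tile - 1) / tile;
    }
    // e.g.  row_tiles = tiles_needed(M, smallK_hybrid_fp32_mla_1VLx4::out_height());
    //       col_tiles = tiles_needed(N, smallK_hybrid_fp32_mla_1VLx4::out_width());
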
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4/generic.cpp
new file mode 100644
index 0000000..3e7e713
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4/generic.cpp
@@ -0,0 +1,4004 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_smallK_hybrid_fp32_mla_1VLx4(const float *A, int lda, const float *B, float *C, int ldc, float beta, int M, int N, int K) {
+    const long beta0 = (beta == 0.0f);
+
+    const long loops_count = M / 4;
+    const long oddrow_count = M % 4;
+    const long ldab = lda * sizeof(float);
+    const long ldcb = ldc * sizeof(float);
+    const int K_stride = K;
+    const long odd_depth = K % 4;
+    const float *betaptr = &beta;
+
+    for (int x0=0; x0<N; x0+=(get_vector_length<float>() * 1)) {
+        const long width = std::min((unsigned long)N-x0, (get_vector_length<float>() * 1));
+        long loops = loops_count;
+        long oddrows = oddrow_count;
+        long temp = 0;
+        const float *b_ptr0 = B + (K_stride * x0);
+
+        const float *a_ptr0 = A;
+
+        float *c_ptr0 = C + x0;
+
+        switch(K) {
+            case 1:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "ld1rqw z1.s, p6/z, [a_ptr1]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "ld1rqw z2.s, p6/z, [a_ptr2]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "ld1rqw z3.s, p6/z, [a_ptr3]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+            case 2:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "ld1rqw z1.s, p6/z, [a_ptr1]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "ld1rqw z2.s, p6/z, [a_ptr2]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "ld1rqw z3.s, p6/z, [a_ptr3]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+            case 3:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "ld1rqw z1.s, p6/z, [a_ptr1]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "ld1rqw z2.s, p6/z, [a_ptr2]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "ld1rqw z3.s, p6/z, [a_ptr3]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+            case 4:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+            case 5:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x10]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+            case 6:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x10]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+            case 7:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "fmla z28.s, z10.s, z0.s[2]\n"
+                    "fmla z29.s, z10.s, z1.s[2]\n"
+                    "fmla z30.s, z10.s, z2.s[2]\n"
+                    "fmla z31.s, z10.s, z3.s[2]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x10]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "fmla z28.s, z10.s, z1.s[2]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+            case 8:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "fmla z28.s, z10.s, z0.s[2]\n"
+                    "fmla z29.s, z10.s, z1.s[2]\n"
+                    "fmla z30.s, z10.s, z2.s[2]\n"
+                    "fmla z31.s, z10.s, z3.s[2]\n"
+                    "fmla z28.s, z11.s, z0.s[3]\n"
+                    "fmla z29.s, z11.s, z1.s[3]\n"
+                    "fmla z30.s, z11.s, z2.s[3]\n"
+                    "fmla z31.s, z11.s, z3.s[3]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "fmla z28.s, z10.s, z1.s[2]\n"
+                    "fmla z28.s, z11.s, z1.s[3]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+            case 9:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                    "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "fmla z28.s, z10.s, z0.s[2]\n"
+                    "fmla z29.s, z10.s, z1.s[2]\n"
+                    "fmla z30.s, z10.s, z2.s[2]\n"
+                    "fmla z31.s, z10.s, z3.s[2]\n"
+                    "fmla z28.s, z11.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
+                    "fmla z30.s, z11.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
+                    "fmla z31.s, z11.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
+                    "fmla z28.s, z12.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z12.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z12.s, z2.s[0]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "fmla z31.s, z12.s, z3.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "ld1rqw z2.s, p6/z, [%[a_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "fmla z28.s, z10.s, z1.s[2]\n"
+                    "fmla z28.s, z11.s, z1.s[3]\n"
+                    "fmla z28.s, z12.s, z2.s[0]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+            case 10:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                    "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                    "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "fmla z28.s, z10.s, z0.s[2]\n"
+                    "fmla z29.s, z10.s, z1.s[2]\n"
+                    "fmla z30.s, z10.s, z2.s[2]\n"
+                    "fmla z31.s, z10.s, z3.s[2]\n"
+                    "fmla z28.s, z11.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
+                    "fmla z30.s, z11.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
+                    "fmla z31.s, z11.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
+                    "fmla z28.s, z12.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z12.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z12.s, z2.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z31.s, z12.s, z3.s[0]\n"
+                    "fmla z28.s, z13.s, z0.s[1]\n"
+                    "fmla z29.s, z13.s, z1.s[1]\n"
+                    "fmla z30.s, z13.s, z2.s[1]\n"
+                    "fmla z31.s, z13.s, z3.s[1]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "ld1rqw z2.s, p6/z, [%[a_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "fmla z28.s, z10.s, z1.s[2]\n"
+                    "fmla z28.s, z11.s, z1.s[3]\n"
+                    "fmla z28.s, z12.s, z2.s[0]\n"
+                    "fmla z28.s, z13.s, z2.s[1]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
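+            // Each case below is the same four-row SVE block, specialised for one accumulation
+            // depth (the case index). In rough scalar terms, with illustrative names rather
+            // than the kernel's operands:
+            //   for (r = 0; r < rows; ++r)
+            //     for (j = 0; j < width; ++j)      // one predicated vector of outputs per row
+            //       for (k = 0; k < depth; ++k)
+            //         C[r][j] += A[r][k] * B[k][j];
+            // z28-z31 hold the four row accumulators, p0 masks the output width, p6 masks the
+            // final partial A quadword, and a non-zero beta0 zero-initialises the accumulators
+            // instead of loading the existing C values; leftover rows run one at a time in the
+            // oddrows tail loop.
+            // Case 11: eleven B vectors (z4-z14); the third A quadword is loaded under p6.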
+            case 11:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                    "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                    "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                    "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "fmla z28.s, z10.s, z0.s[2]\n"
+                    "fmla z29.s, z10.s, z1.s[2]\n"
+                    "fmla z30.s, z10.s, z2.s[2]\n"
+                    "fmla z31.s, z10.s, z3.s[2]\n"
+                    "fmla z28.s, z11.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
+                    "fmla z30.s, z11.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
+                    "fmla z31.s, z11.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
+                    "fmla z28.s, z12.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z12.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z12.s, z2.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z31.s, z12.s, z3.s[0]\n"
+                    "fmla z28.s, z13.s, z0.s[1]\n"
+                    "fmla z29.s, z13.s, z1.s[1]\n"
+                    "fmla z30.s, z13.s, z2.s[1]\n"
+                    "fmla z31.s, z13.s, z3.s[1]\n"
+                    "fmla z28.s, z14.s, z0.s[2]\n"
+                    "fmla z29.s, z14.s, z1.s[2]\n"
+                    "fmla z30.s, z14.s, z2.s[2]\n"
+                    "fmla z31.s, z14.s, z3.s[2]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "ld1rqw z2.s, p6/z, [%[a_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "fmla z28.s, z10.s, z1.s[2]\n"
+                    "fmla z28.s, z11.s, z1.s[3]\n"
+                    "fmla z28.s, z12.s, z2.s[0]\n"
+                    "fmla z28.s, z13.s, z2.s[1]\n"
+                    "fmla z28.s, z14.s, z2.s[2]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
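+            // Case 12: twelve B vectors (z4-z15), twelve FMLA terms per row; the depth is a
+            // whole number of A quadwords, so every ld1rqw uses the all-true predicate p7.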
+            case 12:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                    "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                    "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                    "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                    "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "fmla z28.s, z10.s, z0.s[2]\n"
+                    "fmla z29.s, z10.s, z1.s[2]\n"
+                    "fmla z30.s, z10.s, z2.s[2]\n"
+                    "fmla z31.s, z10.s, z3.s[2]\n"
+                    "fmla z28.s, z11.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+                    "fmla z30.s, z11.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+                    "fmla z31.s, z11.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+                    "fmla z28.s, z12.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z12.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z12.s, z2.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z31.s, z12.s, z3.s[0]\n"
+                    "fmla z28.s, z13.s, z0.s[1]\n"
+                    "fmla z29.s, z13.s, z1.s[1]\n"
+                    "fmla z30.s, z13.s, z2.s[1]\n"
+                    "fmla z31.s, z13.s, z3.s[1]\n"
+                    "fmla z28.s, z14.s, z0.s[2]\n"
+                    "fmla z29.s, z14.s, z1.s[2]\n"
+                    "fmla z30.s, z14.s, z2.s[2]\n"
+                    "fmla z31.s, z14.s, z3.s[2]\n"
+                    "fmla z28.s, z15.s, z0.s[3]\n"
+                    "fmla z29.s, z15.s, z1.s[3]\n"
+                    "fmla z30.s, z15.s, z2.s[3]\n"
+                    "fmla z31.s, z15.s, z3.s[3]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "fmla z28.s, z10.s, z1.s[2]\n"
+                    "fmla z28.s, z11.s, z1.s[3]\n"
+                    "fmla z28.s, z12.s, z2.s[0]\n"
+                    "fmla z28.s, z13.s, z2.s[1]\n"
+                    "fmla z28.s, z14.s, z2.s[2]\n"
+                    "fmla z28.s, z15.s, z2.s[3]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
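+            // Case 13: thirteen B vectors (z4-z16); the fourth A quadword is loaded under the
+            // odd_depth predicate p6.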
+            case 13:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                    "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                    "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                    "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                    "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                    "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "fmla z28.s, z10.s, z0.s[2]\n"
+                    "fmla z29.s, z10.s, z1.s[2]\n"
+                    "fmla z30.s, z10.s, z2.s[2]\n"
+                    "fmla z31.s, z10.s, z3.s[2]\n"
+                    "fmla z28.s, z11.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "fmla z29.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+                    "fmla z30.s, z11.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+                    "fmla z31.s, z11.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+                    "fmla z28.s, z12.s, z0.s[0]\n"
+                    "fmla z29.s, z12.s, z1.s[0]\n"
+                    "fmla z30.s, z12.s, z2.s[0]\n"
+                    "fmla z31.s, z12.s, z3.s[0]\n"
+                    "fmla z28.s, z13.s, z0.s[1]\n"
+                    "fmla z29.s, z13.s, z1.s[1]\n"
+                    "fmla z30.s, z13.s, z2.s[1]\n"
+                    "fmla z31.s, z13.s, z3.s[1]\n"
+                    "fmla z28.s, z14.s, z0.s[2]\n"
+                    "fmla z29.s, z14.s, z1.s[2]\n"
+                    "fmla z30.s, z14.s, z2.s[2]\n"
+                    "fmla z31.s, z14.s, z3.s[2]\n"
+                    "fmla z28.s, z15.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z15.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
+                    "fmla z30.s, z15.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
+                    "fmla z31.s, z15.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
+                    "fmla z28.s, z16.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z16.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z16.s, z2.s[0]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "fmla z31.s, z16.s, z3.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "ld1rqw z3.s, p6/z, [%[a_ptr0], #0x30]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "fmla z28.s, z10.s, z1.s[2]\n"
+                    "fmla z28.s, z11.s, z1.s[3]\n"
+                    "fmla z28.s, z12.s, z2.s[0]\n"
+                    "fmla z28.s, z13.s, z2.s[1]\n"
+                    "fmla z28.s, z14.s, z2.s[2]\n"
+                    "fmla z28.s, z15.s, z2.s[3]\n"
+                    "fmla z28.s, z16.s, z3.s[0]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
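+            // Case 14: fourteen B vectors (z4-z17); the fourth A quadword is loaded under p6.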
+            case 14:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                    "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                    "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                    "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                    "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                    "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                    "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "fmla z28.s, z10.s, z0.s[2]\n"
+                    "fmla z29.s, z10.s, z1.s[2]\n"
+                    "fmla z30.s, z10.s, z2.s[2]\n"
+                    "fmla z31.s, z10.s, z3.s[2]\n"
+                    "fmla z28.s, z11.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "fmla z29.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+                    "fmla z30.s, z11.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+                    "fmla z31.s, z11.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+                    "fmla z28.s, z12.s, z0.s[0]\n"
+                    "fmla z29.s, z12.s, z1.s[0]\n"
+                    "fmla z30.s, z12.s, z2.s[0]\n"
+                    "fmla z31.s, z12.s, z3.s[0]\n"
+                    "fmla z28.s, z13.s, z0.s[1]\n"
+                    "fmla z29.s, z13.s, z1.s[1]\n"
+                    "fmla z30.s, z13.s, z2.s[1]\n"
+                    "fmla z31.s, z13.s, z3.s[1]\n"
+                    "fmla z28.s, z14.s, z0.s[2]\n"
+                    "fmla z29.s, z14.s, z1.s[2]\n"
+                    "fmla z30.s, z14.s, z2.s[2]\n"
+                    "fmla z31.s, z14.s, z3.s[2]\n"
+                    "fmla z28.s, z15.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z15.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
+                    "fmla z30.s, z15.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
+                    "fmla z31.s, z15.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
+                    "fmla z28.s, z16.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z16.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z16.s, z2.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z31.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z0.s[1]\n"
+                    "fmla z29.s, z17.s, z1.s[1]\n"
+                    "fmla z30.s, z17.s, z2.s[1]\n"
+                    "fmla z31.s, z17.s, z3.s[1]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "ld1rqw z3.s, p6/z, [%[a_ptr0], #0x30]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "fmla z28.s, z10.s, z1.s[2]\n"
+                    "fmla z28.s, z11.s, z1.s[3]\n"
+                    "fmla z28.s, z12.s, z2.s[0]\n"
+                    "fmla z28.s, z13.s, z2.s[1]\n"
+                    "fmla z28.s, z14.s, z2.s[2]\n"
+                    "fmla z28.s, z15.s, z2.s[3]\n"
+                    "fmla z28.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z3.s[1]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
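+            // Case 15: fifteen B vectors (z4-z18); the fourth A quadword is loaded under p6.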
+            case 15:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                    "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                    "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                    "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                    "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                    "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                    "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                    "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "fmla z28.s, z10.s, z0.s[2]\n"
+                    "fmla z29.s, z10.s, z1.s[2]\n"
+                    "fmla z30.s, z10.s, z2.s[2]\n"
+                    "fmla z31.s, z10.s, z3.s[2]\n"
+                    "fmla z28.s, z11.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "fmla z29.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+                    "fmla z30.s, z11.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+                    "fmla z31.s, z11.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+                    "fmla z28.s, z12.s, z0.s[0]\n"
+                    "fmla z29.s, z12.s, z1.s[0]\n"
+                    "fmla z30.s, z12.s, z2.s[0]\n"
+                    "fmla z31.s, z12.s, z3.s[0]\n"
+                    "fmla z28.s, z13.s, z0.s[1]\n"
+                    "fmla z29.s, z13.s, z1.s[1]\n"
+                    "fmla z30.s, z13.s, z2.s[1]\n"
+                    "fmla z31.s, z13.s, z3.s[1]\n"
+                    "fmla z28.s, z14.s, z0.s[2]\n"
+                    "fmla z29.s, z14.s, z1.s[2]\n"
+                    "fmla z30.s, z14.s, z2.s[2]\n"
+                    "fmla z31.s, z14.s, z3.s[2]\n"
+                    "fmla z28.s, z15.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z15.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
+                    "fmla z30.s, z15.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
+                    "fmla z31.s, z15.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
+                    "fmla z28.s, z16.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z16.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z16.s, z2.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z31.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z0.s[1]\n"
+                    "fmla z29.s, z17.s, z1.s[1]\n"
+                    "fmla z30.s, z17.s, z2.s[1]\n"
+                    "fmla z31.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z0.s[2]\n"
+                    "fmla z29.s, z18.s, z1.s[2]\n"
+                    "fmla z30.s, z18.s, z2.s[2]\n"
+                    "fmla z31.s, z18.s, z3.s[2]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "ld1rqw z3.s, p6/z, [%[a_ptr0], #0x30]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "fmla z28.s, z10.s, z1.s[2]\n"
+                    "fmla z28.s, z11.s, z1.s[3]\n"
+                    "fmla z28.s, z12.s, z2.s[0]\n"
+                    "fmla z28.s, z13.s, z2.s[1]\n"
+                    "fmla z28.s, z14.s, z2.s[2]\n"
+                    "fmla z28.s, z15.s, z2.s[3]\n"
+                    "fmla z28.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z3.s[2]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
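+            // Case 16: sixteen B vectors (z4-z19); four full A quadwords, so only p7 is used
+            // for the A loads.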
+            case 16:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                    "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                    "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                    "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                    "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                    "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                    "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                    "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                    "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "fmla z28.s, z10.s, z0.s[2]\n"
+                    "fmla z29.s, z10.s, z1.s[2]\n"
+                    "fmla z30.s, z10.s, z2.s[2]\n"
+                    "fmla z31.s, z10.s, z3.s[2]\n"
+                    "fmla z28.s, z11.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "fmla z29.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+                    "fmla z30.s, z11.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+                    "fmla z31.s, z11.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+                    "fmla z28.s, z12.s, z0.s[0]\n"
+                    "fmla z29.s, z12.s, z1.s[0]\n"
+                    "fmla z30.s, z12.s, z2.s[0]\n"
+                    "fmla z31.s, z12.s, z3.s[0]\n"
+                    "fmla z28.s, z13.s, z0.s[1]\n"
+                    "fmla z29.s, z13.s, z1.s[1]\n"
+                    "fmla z30.s, z13.s, z2.s[1]\n"
+                    "fmla z31.s, z13.s, z3.s[1]\n"
+                    "fmla z28.s, z14.s, z0.s[2]\n"
+                    "fmla z29.s, z14.s, z1.s[2]\n"
+                    "fmla z30.s, z14.s, z2.s[2]\n"
+                    "fmla z31.s, z14.s, z3.s[2]\n"
+                    "fmla z28.s, z15.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z15.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+                    "fmla z30.s, z15.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+                    "fmla z31.s, z15.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+                    "fmla z28.s, z16.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z16.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z16.s, z2.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z31.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z0.s[1]\n"
+                    "fmla z29.s, z17.s, z1.s[1]\n"
+                    "fmla z30.s, z17.s, z2.s[1]\n"
+                    "fmla z31.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z0.s[2]\n"
+                    "fmla z29.s, z18.s, z1.s[2]\n"
+                    "fmla z30.s, z18.s, z2.s[2]\n"
+                    "fmla z31.s, z18.s, z3.s[2]\n"
+                    "fmla z28.s, z19.s, z0.s[3]\n"
+                    "fmla z29.s, z19.s, z1.s[3]\n"
+                    "fmla z30.s, z19.s, z2.s[3]\n"
+                    "fmla z31.s, z19.s, z3.s[3]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "fmla z28.s, z10.s, z1.s[2]\n"
+                    "fmla z28.s, z11.s, z1.s[3]\n"
+                    "fmla z28.s, z12.s, z2.s[0]\n"
+                    "fmla z28.s, z13.s, z2.s[1]\n"
+                    "fmla z28.s, z14.s, z2.s[2]\n"
+                    "fmla z28.s, z15.s, z2.s[3]\n"
+                    "fmla z28.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z3.s[2]\n"
+                    "fmla z28.s, z19.s, z3.s[3]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
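+            // Case 17: seventeen B vectors (z4-z20); z20 is the seventeenth vector-length
+            // block of B, addressed from the base after the addvl adjustment.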
+            case 17:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                    "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                    "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                    "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                    "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                    "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                    "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                    "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                    "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                    "ld1w z20.s, p7/z, [%[b_ptr0]]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "fmla z28.s, z10.s, z0.s[2]\n"
+                    "fmla z29.s, z10.s, z1.s[2]\n"
+                    "fmla z30.s, z10.s, z2.s[2]\n"
+                    "fmla z31.s, z10.s, z3.s[2]\n"
+                    "fmla z28.s, z11.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "fmla z29.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+                    "fmla z30.s, z11.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+                    "fmla z31.s, z11.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+                    "fmla z28.s, z12.s, z0.s[0]\n"
+                    "fmla z29.s, z12.s, z1.s[0]\n"
+                    "fmla z30.s, z12.s, z2.s[0]\n"
+                    "fmla z31.s, z12.s, z3.s[0]\n"
+                    "fmla z28.s, z13.s, z0.s[1]\n"
+                    "fmla z29.s, z13.s, z1.s[1]\n"
+                    "fmla z30.s, z13.s, z2.s[1]\n"
+                    "fmla z31.s, z13.s, z3.s[1]\n"
+                    "fmla z28.s, z14.s, z0.s[2]\n"
+                    "fmla z29.s, z14.s, z1.s[2]\n"
+                    "fmla z30.s, z14.s, z2.s[2]\n"
+                    "fmla z31.s, z14.s, z3.s[2]\n"
+                    "fmla z28.s, z15.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+                    "fmla z29.s, z15.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+                    "fmla z30.s, z15.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+                    "fmla z31.s, z15.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+                    "fmla z28.s, z16.s, z0.s[0]\n"
+                    "fmla z29.s, z16.s, z1.s[0]\n"
+                    "fmla z30.s, z16.s, z2.s[0]\n"
+                    "fmla z31.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z0.s[1]\n"
+                    "fmla z29.s, z17.s, z1.s[1]\n"
+                    "fmla z30.s, z17.s, z2.s[1]\n"
+                    "fmla z31.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z0.s[2]\n"
+                    "fmla z29.s, z18.s, z1.s[2]\n"
+                    "fmla z30.s, z18.s, z2.s[2]\n"
+                    "fmla z31.s, z18.s, z3.s[2]\n"
+                    "fmla z28.s, z19.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z19.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
+                    "fmla z30.s, z19.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
+                    "fmla z31.s, z19.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
+                    "fmla z28.s, z20.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z20.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z20.s, z2.s[0]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "fmla z31.s, z20.s, z3.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "fmla z28.s, z10.s, z1.s[2]\n"
+                    "fmla z28.s, z11.s, z1.s[3]\n"
+                    "fmla z28.s, z12.s, z2.s[0]\n"
+                    "fmla z28.s, z13.s, z2.s[1]\n"
+                    "fmla z28.s, z14.s, z2.s[2]\n"
+                    "fmla z28.s, z15.s, z2.s[3]\n"
+                    "fmla z28.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z3.s[2]\n"
+                    "fmla z28.s, z19.s, z3.s[3]\n"
+                    "fmla z28.s, z20.s, z0.s[0]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+            case 18:
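+                // 18-deep variant: same structure as above, with the B panel in z4-z21.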
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                    "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                    "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                    "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                    "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                    "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                    "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                    "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                    "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                    "ld1w z20.s, p7/z, [%[b_ptr0]]\n"
+                    "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "fmla z28.s, z10.s, z0.s[2]\n"
+                    "fmla z29.s, z10.s, z1.s[2]\n"
+                    "fmla z30.s, z10.s, z2.s[2]\n"
+                    "fmla z31.s, z10.s, z3.s[2]\n"
+                    "fmla z28.s, z11.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "fmla z29.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+                    "fmla z30.s, z11.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+                    "fmla z31.s, z11.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+                    "fmla z28.s, z12.s, z0.s[0]\n"
+                    "fmla z29.s, z12.s, z1.s[0]\n"
+                    "fmla z30.s, z12.s, z2.s[0]\n"
+                    "fmla z31.s, z12.s, z3.s[0]\n"
+                    "fmla z28.s, z13.s, z0.s[1]\n"
+                    "fmla z29.s, z13.s, z1.s[1]\n"
+                    "fmla z30.s, z13.s, z2.s[1]\n"
+                    "fmla z31.s, z13.s, z3.s[1]\n"
+                    "fmla z28.s, z14.s, z0.s[2]\n"
+                    "fmla z29.s, z14.s, z1.s[2]\n"
+                    "fmla z30.s, z14.s, z2.s[2]\n"
+                    "fmla z31.s, z14.s, z3.s[2]\n"
+                    "fmla z28.s, z15.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+                    "fmla z29.s, z15.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+                    "fmla z30.s, z15.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+                    "fmla z31.s, z15.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+                    "fmla z28.s, z16.s, z0.s[0]\n"
+                    "fmla z29.s, z16.s, z1.s[0]\n"
+                    "fmla z30.s, z16.s, z2.s[0]\n"
+                    "fmla z31.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z0.s[1]\n"
+                    "fmla z29.s, z17.s, z1.s[1]\n"
+                    "fmla z30.s, z17.s, z2.s[1]\n"
+                    "fmla z31.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z0.s[2]\n"
+                    "fmla z29.s, z18.s, z1.s[2]\n"
+                    "fmla z30.s, z18.s, z2.s[2]\n"
+                    "fmla z31.s, z18.s, z3.s[2]\n"
+                    "fmla z28.s, z19.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z19.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
+                    "fmla z30.s, z19.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
+                    "fmla z31.s, z19.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
+                    "fmla z28.s, z20.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z20.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z20.s, z2.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z31.s, z20.s, z3.s[0]\n"
+                    "fmla z28.s, z21.s, z0.s[1]\n"
+                    "fmla z29.s, z21.s, z1.s[1]\n"
+                    "fmla z30.s, z21.s, z2.s[1]\n"
+                    "fmla z31.s, z21.s, z3.s[1]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "fmla z28.s, z10.s, z1.s[2]\n"
+                    "fmla z28.s, z11.s, z1.s[3]\n"
+                    "fmla z28.s, z12.s, z2.s[0]\n"
+                    "fmla z28.s, z13.s, z2.s[1]\n"
+                    "fmla z28.s, z14.s, z2.s[2]\n"
+                    "fmla z28.s, z15.s, z2.s[3]\n"
+                    "fmla z28.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z3.s[2]\n"
+                    "fmla z28.s, z19.s, z3.s[3]\n"
+                    "fmla z28.s, z20.s, z0.s[0]\n"
+                    "fmla z28.s, z21.s, z0.s[1]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+            case 19:
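+                // 19-deep variant: B panel in z4-z22.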
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                    "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                    "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                    "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                    "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                    "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                    "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                    "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                    "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                    "ld1w z20.s, p7/z, [%[b_ptr0]]\n"
+                    "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "ld1w z22.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "fmla z28.s, z10.s, z0.s[2]\n"
+                    "fmla z29.s, z10.s, z1.s[2]\n"
+                    "fmla z30.s, z10.s, z2.s[2]\n"
+                    "fmla z31.s, z10.s, z3.s[2]\n"
+                    "fmla z28.s, z11.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "fmla z29.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+                    "fmla z30.s, z11.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+                    "fmla z31.s, z11.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+                    "fmla z28.s, z12.s, z0.s[0]\n"
+                    "fmla z29.s, z12.s, z1.s[0]\n"
+                    "fmla z30.s, z12.s, z2.s[0]\n"
+                    "fmla z31.s, z12.s, z3.s[0]\n"
+                    "fmla z28.s, z13.s, z0.s[1]\n"
+                    "fmla z29.s, z13.s, z1.s[1]\n"
+                    "fmla z30.s, z13.s, z2.s[1]\n"
+                    "fmla z31.s, z13.s, z3.s[1]\n"
+                    "fmla z28.s, z14.s, z0.s[2]\n"
+                    "fmla z29.s, z14.s, z1.s[2]\n"
+                    "fmla z30.s, z14.s, z2.s[2]\n"
+                    "fmla z31.s, z14.s, z3.s[2]\n"
+                    "fmla z28.s, z15.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+                    "fmla z29.s, z15.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+                    "fmla z30.s, z15.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+                    "fmla z31.s, z15.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+                    "fmla z28.s, z16.s, z0.s[0]\n"
+                    "fmla z29.s, z16.s, z1.s[0]\n"
+                    "fmla z30.s, z16.s, z2.s[0]\n"
+                    "fmla z31.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z0.s[1]\n"
+                    "fmla z29.s, z17.s, z1.s[1]\n"
+                    "fmla z30.s, z17.s, z2.s[1]\n"
+                    "fmla z31.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z0.s[2]\n"
+                    "fmla z29.s, z18.s, z1.s[2]\n"
+                    "fmla z30.s, z18.s, z2.s[2]\n"
+                    "fmla z31.s, z18.s, z3.s[2]\n"
+                    "fmla z28.s, z19.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z19.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
+                    "fmla z30.s, z19.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
+                    "fmla z31.s, z19.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
+                    "fmla z28.s, z20.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z20.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z20.s, z2.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z31.s, z20.s, z3.s[0]\n"
+                    "fmla z28.s, z21.s, z0.s[1]\n"
+                    "fmla z29.s, z21.s, z1.s[1]\n"
+                    "fmla z30.s, z21.s, z2.s[1]\n"
+                    "fmla z31.s, z21.s, z3.s[1]\n"
+                    "fmla z28.s, z22.s, z0.s[2]\n"
+                    "fmla z29.s, z22.s, z1.s[2]\n"
+                    "fmla z30.s, z22.s, z2.s[2]\n"
+                    "fmla z31.s, z22.s, z3.s[2]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "fmla z28.s, z10.s, z1.s[2]\n"
+                    "fmla z28.s, z11.s, z1.s[3]\n"
+                    "fmla z28.s, z12.s, z2.s[0]\n"
+                    "fmla z28.s, z13.s, z2.s[1]\n"
+                    "fmla z28.s, z14.s, z2.s[2]\n"
+                    "fmla z28.s, z15.s, z2.s[3]\n"
+                    "fmla z28.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z3.s[2]\n"
+                    "fmla z28.s, z19.s, z3.s[3]\n"
+                    "fmla z28.s, z20.s, z0.s[0]\n"
+                    "fmla z28.s, z21.s, z0.s[1]\n"
+                    "fmla z28.s, z22.s, z0.s[2]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+            case 20:
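+                // 20-deep variant: B panel in z4-z23; each A row is exactly five full
+                // quadwords, so no partially-predicated A load is needed.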
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                    "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                    "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                    "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                    "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                    "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                    "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                    "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                    "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                    "ld1w z20.s, p7/z, [%[b_ptr0]]\n"
+                    "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "ld1w z22.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1w z23.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "fmla z28.s, z10.s, z0.s[2]\n"
+                    "fmla z29.s, z10.s, z1.s[2]\n"
+                    "fmla z30.s, z10.s, z2.s[2]\n"
+                    "fmla z31.s, z10.s, z3.s[2]\n"
+                    "fmla z28.s, z11.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "fmla z29.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+                    "fmla z30.s, z11.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+                    "fmla z31.s, z11.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+                    "fmla z28.s, z12.s, z0.s[0]\n"
+                    "fmla z29.s, z12.s, z1.s[0]\n"
+                    "fmla z30.s, z12.s, z2.s[0]\n"
+                    "fmla z31.s, z12.s, z3.s[0]\n"
+                    "fmla z28.s, z13.s, z0.s[1]\n"
+                    "fmla z29.s, z13.s, z1.s[1]\n"
+                    "fmla z30.s, z13.s, z2.s[1]\n"
+                    "fmla z31.s, z13.s, z3.s[1]\n"
+                    "fmla z28.s, z14.s, z0.s[2]\n"
+                    "fmla z29.s, z14.s, z1.s[2]\n"
+                    "fmla z30.s, z14.s, z2.s[2]\n"
+                    "fmla z31.s, z14.s, z3.s[2]\n"
+                    "fmla z28.s, z15.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+                    "fmla z29.s, z15.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+                    "fmla z30.s, z15.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+                    "fmla z31.s, z15.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+                    "fmla z28.s, z16.s, z0.s[0]\n"
+                    "fmla z29.s, z16.s, z1.s[0]\n"
+                    "fmla z30.s, z16.s, z2.s[0]\n"
+                    "fmla z31.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z0.s[1]\n"
+                    "fmla z29.s, z17.s, z1.s[1]\n"
+                    "fmla z30.s, z17.s, z2.s[1]\n"
+                    "fmla z31.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z0.s[2]\n"
+                    "fmla z29.s, z18.s, z1.s[2]\n"
+                    "fmla z30.s, z18.s, z2.s[2]\n"
+                    "fmla z31.s, z18.s, z3.s[2]\n"
+                    "fmla z28.s, z19.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z19.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+                    "fmla z30.s, z19.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+                    "fmla z31.s, z19.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+                    "fmla z28.s, z20.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z20.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z20.s, z2.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z31.s, z20.s, z3.s[0]\n"
+                    "fmla z28.s, z21.s, z0.s[1]\n"
+                    "fmla z29.s, z21.s, z1.s[1]\n"
+                    "fmla z30.s, z21.s, z2.s[1]\n"
+                    "fmla z31.s, z21.s, z3.s[1]\n"
+                    "fmla z28.s, z22.s, z0.s[2]\n"
+                    "fmla z29.s, z22.s, z1.s[2]\n"
+                    "fmla z30.s, z22.s, z2.s[2]\n"
+                    "fmla z31.s, z22.s, z3.s[2]\n"
+                    "fmla z28.s, z23.s, z0.s[3]\n"
+                    "fmla z29.s, z23.s, z1.s[3]\n"
+                    "fmla z30.s, z23.s, z2.s[3]\n"
+                    "fmla z31.s, z23.s, z3.s[3]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "fmla z28.s, z10.s, z1.s[2]\n"
+                    "fmla z28.s, z11.s, z1.s[3]\n"
+                    "fmla z28.s, z12.s, z2.s[0]\n"
+                    "fmla z28.s, z13.s, z2.s[1]\n"
+                    "fmla z28.s, z14.s, z2.s[2]\n"
+                    "fmla z28.s, z15.s, z2.s[3]\n"
+                    "fmla z28.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z3.s[2]\n"
+                    "fmla z28.s, z19.s, z3.s[3]\n"
+                    "fmla z28.s, z20.s, z0.s[0]\n"
+                    "fmla z28.s, z21.s, z0.s[1]\n"
+                    "fmla z28.s, z22.s, z0.s[2]\n"
+                    "fmla z28.s, z23.s, z0.s[3]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+            case 21:
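+                // 21-deep variant: B panel in z4-z24; each A row spills into a sixth,
+                // partial quadword (loaded under p6).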
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                    "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                    "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                    "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                    "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                    "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                    "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                    "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                    "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                    "ld1w z20.s, p7/z, [%[b_ptr0]]\n"
+                    "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "ld1w z22.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1w z23.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "ld1w z24.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "fmla z28.s, z10.s, z0.s[2]\n"
+                    "fmla z29.s, z10.s, z1.s[2]\n"
+                    "fmla z30.s, z10.s, z2.s[2]\n"
+                    "fmla z31.s, z10.s, z3.s[2]\n"
+                    "fmla z28.s, z11.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "fmla z29.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+                    "fmla z30.s, z11.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+                    "fmla z31.s, z11.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+                    "fmla z28.s, z12.s, z0.s[0]\n"
+                    "fmla z29.s, z12.s, z1.s[0]\n"
+                    "fmla z30.s, z12.s, z2.s[0]\n"
+                    "fmla z31.s, z12.s, z3.s[0]\n"
+                    "fmla z28.s, z13.s, z0.s[1]\n"
+                    "fmla z29.s, z13.s, z1.s[1]\n"
+                    "fmla z30.s, z13.s, z2.s[1]\n"
+                    "fmla z31.s, z13.s, z3.s[1]\n"
+                    "fmla z28.s, z14.s, z0.s[2]\n"
+                    "fmla z29.s, z14.s, z1.s[2]\n"
+                    "fmla z30.s, z14.s, z2.s[2]\n"
+                    "fmla z31.s, z14.s, z3.s[2]\n"
+                    "fmla z28.s, z15.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+                    "fmla z29.s, z15.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+                    "fmla z30.s, z15.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+                    "fmla z31.s, z15.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+                    "fmla z28.s, z16.s, z0.s[0]\n"
+                    "fmla z29.s, z16.s, z1.s[0]\n"
+                    "fmla z30.s, z16.s, z2.s[0]\n"
+                    "fmla z31.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z0.s[1]\n"
+                    "fmla z29.s, z17.s, z1.s[1]\n"
+                    "fmla z30.s, z17.s, z2.s[1]\n"
+                    "fmla z31.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z0.s[2]\n"
+                    "fmla z29.s, z18.s, z1.s[2]\n"
+                    "fmla z30.s, z18.s, z2.s[2]\n"
+                    "fmla z31.s, z18.s, z3.s[2]\n"
+                    "fmla z28.s, z19.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+                    "fmla z29.s, z19.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+                    "fmla z30.s, z19.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+                    "fmla z31.s, z19.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+                    "fmla z28.s, z20.s, z0.s[0]\n"
+                    "fmla z29.s, z20.s, z1.s[0]\n"
+                    "fmla z30.s, z20.s, z2.s[0]\n"
+                    "fmla z31.s, z20.s, z3.s[0]\n"
+                    "fmla z28.s, z21.s, z0.s[1]\n"
+                    "fmla z29.s, z21.s, z1.s[1]\n"
+                    "fmla z30.s, z21.s, z2.s[1]\n"
+                    "fmla z31.s, z21.s, z3.s[1]\n"
+                    "fmla z28.s, z22.s, z0.s[2]\n"
+                    "fmla z29.s, z22.s, z1.s[2]\n"
+                    "fmla z30.s, z22.s, z2.s[2]\n"
+                    "fmla z31.s, z22.s, z3.s[2]\n"
+                    "fmla z28.s, z23.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z23.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
+                    "fmla z30.s, z23.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
+                    "fmla z31.s, z23.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
+                    "fmla z28.s, z24.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z24.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z24.s, z2.s[0]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "fmla z31.s, z24.s, z3.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "fmla z28.s, z10.s, z1.s[2]\n"
+                    "fmla z28.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x50]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z12.s, z2.s[0]\n"
+                    "fmla z28.s, z13.s, z2.s[1]\n"
+                    "fmla z28.s, z14.s, z2.s[2]\n"
+                    "fmla z28.s, z15.s, z2.s[3]\n"
+                    "fmla z28.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z3.s[2]\n"
+                    "fmla z28.s, z19.s, z3.s[3]\n"
+                    "fmla z28.s, z20.s, z0.s[0]\n"
+                    "fmla z28.s, z21.s, z0.s[1]\n"
+                    "fmla z28.s, z22.s, z0.s[2]\n"
+                    "fmla z28.s, z23.s, z0.s[3]\n"
+                    "fmla z28.s, z24.s, z1.s[0]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+            case 22:
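+                // 22-deep variant: B panel in z4-z25.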
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                    "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                    "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                    "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                    "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                    "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                    "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                    "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                    "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                    "ld1w z20.s, p7/z, [%[b_ptr0]]\n"
+                    "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "ld1w z22.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1w z23.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "ld1w z24.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1w z25.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "fmla z28.s, z10.s, z0.s[2]\n"
+                    "fmla z29.s, z10.s, z1.s[2]\n"
+                    "fmla z30.s, z10.s, z2.s[2]\n"
+                    "fmla z31.s, z10.s, z3.s[2]\n"
+                    "fmla z28.s, z11.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "fmla z29.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+                    "fmla z30.s, z11.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+                    "fmla z31.s, z11.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+                    "fmla z28.s, z12.s, z0.s[0]\n"
+                    "fmla z29.s, z12.s, z1.s[0]\n"
+                    "fmla z30.s, z12.s, z2.s[0]\n"
+                    "fmla z31.s, z12.s, z3.s[0]\n"
+                    "fmla z28.s, z13.s, z0.s[1]\n"
+                    "fmla z29.s, z13.s, z1.s[1]\n"
+                    "fmla z30.s, z13.s, z2.s[1]\n"
+                    "fmla z31.s, z13.s, z3.s[1]\n"
+                    "fmla z28.s, z14.s, z0.s[2]\n"
+                    "fmla z29.s, z14.s, z1.s[2]\n"
+                    "fmla z30.s, z14.s, z2.s[2]\n"
+                    "fmla z31.s, z14.s, z3.s[2]\n"
+                    "fmla z28.s, z15.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+                    "fmla z29.s, z15.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+                    "fmla z30.s, z15.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+                    "fmla z31.s, z15.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+                    "fmla z28.s, z16.s, z0.s[0]\n"
+                    "fmla z29.s, z16.s, z1.s[0]\n"
+                    "fmla z30.s, z16.s, z2.s[0]\n"
+                    "fmla z31.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z0.s[1]\n"
+                    "fmla z29.s, z17.s, z1.s[1]\n"
+                    "fmla z30.s, z17.s, z2.s[1]\n"
+                    "fmla z31.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z0.s[2]\n"
+                    "fmla z29.s, z18.s, z1.s[2]\n"
+                    "fmla z30.s, z18.s, z2.s[2]\n"
+                    "fmla z31.s, z18.s, z3.s[2]\n"
+                    "fmla z28.s, z19.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+                    "fmla z29.s, z19.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+                    "fmla z30.s, z19.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+                    "fmla z31.s, z19.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+                    "fmla z28.s, z20.s, z0.s[0]\n"
+                    "fmla z29.s, z20.s, z1.s[0]\n"
+                    "fmla z30.s, z20.s, z2.s[0]\n"
+                    "fmla z31.s, z20.s, z3.s[0]\n"
+                    "fmla z28.s, z21.s, z0.s[1]\n"
+                    "fmla z29.s, z21.s, z1.s[1]\n"
+                    "fmla z30.s, z21.s, z2.s[1]\n"
+                    "fmla z31.s, z21.s, z3.s[1]\n"
+                    "fmla z28.s, z22.s, z0.s[2]\n"
+                    "fmla z29.s, z22.s, z1.s[2]\n"
+                    "fmla z30.s, z22.s, z2.s[2]\n"
+                    "fmla z31.s, z22.s, z3.s[2]\n"
+                    "fmla z28.s, z23.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z23.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
+                    "fmla z30.s, z23.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
+                    "fmla z31.s, z23.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
+                    "fmla z28.s, z24.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z24.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z24.s, z2.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z31.s, z24.s, z3.s[0]\n"
+                    "fmla z28.s, z25.s, z0.s[1]\n"
+                    "fmla z29.s, z25.s, z1.s[1]\n"
+                    "fmla z30.s, z25.s, z2.s[1]\n"
+                    "fmla z31.s, z25.s, z3.s[1]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "fmla z28.s, z10.s, z1.s[2]\n"
+                    "fmla z28.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x50]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z12.s, z2.s[0]\n"
+                    "fmla z28.s, z13.s, z2.s[1]\n"
+                    "fmla z28.s, z14.s, z2.s[2]\n"
+                    "fmla z28.s, z15.s, z2.s[3]\n"
+                    "fmla z28.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z3.s[2]\n"
+                    "fmla z28.s, z19.s, z3.s[3]\n"
+                    "fmla z28.s, z20.s, z0.s[0]\n"
+                    "fmla z28.s, z21.s, z0.s[1]\n"
+                    "fmla z28.s, z22.s, z0.s[2]\n"
+                    "fmla z28.s, z23.s, z0.s[3]\n"
+                    "fmla z28.s, z24.s, z1.s[0]\n"
+                    "fmla z28.s, z25.s, z1.s[1]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+            case 23:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                    "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                    "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                    "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                    "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                    "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                    "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                    "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                    "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                    "ld1w z20.s, p7/z, [%[b_ptr0]]\n"
+                    "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "ld1w z22.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1w z23.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "ld1w z24.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1w z25.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "ld1w z26.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "fmla z28.s, z10.s, z0.s[2]\n"
+                    "fmla z29.s, z10.s, z1.s[2]\n"
+                    "fmla z30.s, z10.s, z2.s[2]\n"
+                    "fmla z31.s, z10.s, z3.s[2]\n"
+                    "fmla z28.s, z11.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "fmla z29.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+                    "fmla z30.s, z11.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+                    "fmla z31.s, z11.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+                    "fmla z28.s, z12.s, z0.s[0]\n"
+                    "fmla z29.s, z12.s, z1.s[0]\n"
+                    "fmla z30.s, z12.s, z2.s[0]\n"
+                    "fmla z31.s, z12.s, z3.s[0]\n"
+                    "fmla z28.s, z13.s, z0.s[1]\n"
+                    "fmla z29.s, z13.s, z1.s[1]\n"
+                    "fmla z30.s, z13.s, z2.s[1]\n"
+                    "fmla z31.s, z13.s, z3.s[1]\n"
+                    "fmla z28.s, z14.s, z0.s[2]\n"
+                    "fmla z29.s, z14.s, z1.s[2]\n"
+                    "fmla z30.s, z14.s, z2.s[2]\n"
+                    "fmla z31.s, z14.s, z3.s[2]\n"
+                    "fmla z28.s, z15.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+                    "fmla z29.s, z15.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+                    "fmla z30.s, z15.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+                    "fmla z31.s, z15.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+                    "fmla z28.s, z16.s, z0.s[0]\n"
+                    "fmla z29.s, z16.s, z1.s[0]\n"
+                    "fmla z30.s, z16.s, z2.s[0]\n"
+                    "fmla z31.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z0.s[1]\n"
+                    "fmla z29.s, z17.s, z1.s[1]\n"
+                    "fmla z30.s, z17.s, z2.s[1]\n"
+                    "fmla z31.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z0.s[2]\n"
+                    "fmla z29.s, z18.s, z1.s[2]\n"
+                    "fmla z30.s, z18.s, z2.s[2]\n"
+                    "fmla z31.s, z18.s, z3.s[2]\n"
+                    "fmla z28.s, z19.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+                    "fmla z29.s, z19.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+                    "fmla z30.s, z19.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+                    "fmla z31.s, z19.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+                    "fmla z28.s, z20.s, z0.s[0]\n"
+                    "fmla z29.s, z20.s, z1.s[0]\n"
+                    "fmla z30.s, z20.s, z2.s[0]\n"
+                    "fmla z31.s, z20.s, z3.s[0]\n"
+                    "fmla z28.s, z21.s, z0.s[1]\n"
+                    "fmla z29.s, z21.s, z1.s[1]\n"
+                    "fmla z30.s, z21.s, z2.s[1]\n"
+                    "fmla z31.s, z21.s, z3.s[1]\n"
+                    "fmla z28.s, z22.s, z0.s[2]\n"
+                    "fmla z29.s, z22.s, z1.s[2]\n"
+                    "fmla z30.s, z22.s, z2.s[2]\n"
+                    "fmla z31.s, z22.s, z3.s[2]\n"
+                    "fmla z28.s, z23.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z23.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
+                    "fmla z30.s, z23.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
+                    "fmla z31.s, z23.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
+                    "fmla z28.s, z24.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z24.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z24.s, z2.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z31.s, z24.s, z3.s[0]\n"
+                    "fmla z28.s, z25.s, z0.s[1]\n"
+                    "fmla z29.s, z25.s, z1.s[1]\n"
+                    "fmla z30.s, z25.s, z2.s[1]\n"
+                    "fmla z31.s, z25.s, z3.s[1]\n"
+                    "fmla z28.s, z26.s, z0.s[2]\n"
+                    "fmla z29.s, z26.s, z1.s[2]\n"
+                    "fmla z30.s, z26.s, z2.s[2]\n"
+                    "fmla z31.s, z26.s, z3.s[2]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "fmla z28.s, z10.s, z1.s[2]\n"
+                    "fmla z28.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x50]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z12.s, z2.s[0]\n"
+                    "fmla z28.s, z13.s, z2.s[1]\n"
+                    "fmla z28.s, z14.s, z2.s[2]\n"
+                    "fmla z28.s, z15.s, z2.s[3]\n"
+                    "fmla z28.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z3.s[2]\n"
+                    "fmla z28.s, z19.s, z3.s[3]\n"
+                    "fmla z28.s, z20.s, z0.s[0]\n"
+                    "fmla z28.s, z21.s, z0.s[1]\n"
+                    "fmla z28.s, z22.s, z0.s[2]\n"
+                    "fmla z28.s, z23.s, z0.s[3]\n"
+                    "fmla z28.s, z24.s, z1.s[0]\n"
+                    "fmla z28.s, z25.s, z1.s[1]\n"
+                    "fmla z28.s, z26.s, z1.s[2]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+            default:
+            case 24:
+                __asm __volatile (
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "whilelt p6.s, %[temp], %[odd_depth]\n"
+                    "whilelt p0.s, %[temp], %[width]\n"
+                    "ptrue p7.s\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                    "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                    "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                    "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                    "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                    "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                    "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                    "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                    "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                    "ld1w z20.s, p7/z, [%[b_ptr0]]\n"
+                    "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "ld1w z22.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1w z23.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "ld1w z24.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1w z25.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "ld1w z26.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1w z27.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "cbz %[loops], 1f\n"
+                    "2:\n"
+                    "cbz %[beta0], 3f\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "b 4f\n"
+                    "3:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "ld1w z29.s, p0/z, [c_ptr1]\n"
+                    "ld1w z30.s, p0/z, [c_ptr2]\n"
+                    "ld1w z31.s, p0/z, [c_ptr3]\n"
+                    "4:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z29.s, z4.s, z1.s[0]\n"
+                    "fmla z30.s, z4.s, z2.s[0]\n"
+                    "fmla z31.s, z4.s, z3.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z29.s, z5.s, z1.s[1]\n"
+                    "fmla z30.s, z5.s, z2.s[1]\n"
+                    "fmla z31.s, z5.s, z3.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z29.s, z6.s, z1.s[2]\n"
+                    "fmla z30.s, z6.s, z2.s[2]\n"
+                    "fmla z31.s, z6.s, z3.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "fmla z29.s, z7.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+                    "fmla z30.s, z7.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+                    "fmla z31.s, z7.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+                    "fmla z28.s, z8.s, z0.s[0]\n"
+                    "fmla z29.s, z8.s, z1.s[0]\n"
+                    "fmla z30.s, z8.s, z2.s[0]\n"
+                    "fmla z31.s, z8.s, z3.s[0]\n"
+                    "fmla z28.s, z9.s, z0.s[1]\n"
+                    "fmla z29.s, z9.s, z1.s[1]\n"
+                    "fmla z30.s, z9.s, z2.s[1]\n"
+                    "fmla z31.s, z9.s, z3.s[1]\n"
+                    "fmla z28.s, z10.s, z0.s[2]\n"
+                    "fmla z29.s, z10.s, z1.s[2]\n"
+                    "fmla z30.s, z10.s, z2.s[2]\n"
+                    "fmla z31.s, z10.s, z3.s[2]\n"
+                    "fmla z28.s, z11.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "fmla z29.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+                    "fmla z30.s, z11.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+                    "fmla z31.s, z11.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+                    "fmla z28.s, z12.s, z0.s[0]\n"
+                    "fmla z29.s, z12.s, z1.s[0]\n"
+                    "fmla z30.s, z12.s, z2.s[0]\n"
+                    "fmla z31.s, z12.s, z3.s[0]\n"
+                    "fmla z28.s, z13.s, z0.s[1]\n"
+                    "fmla z29.s, z13.s, z1.s[1]\n"
+                    "fmla z30.s, z13.s, z2.s[1]\n"
+                    "fmla z31.s, z13.s, z3.s[1]\n"
+                    "fmla z28.s, z14.s, z0.s[2]\n"
+                    "fmla z29.s, z14.s, z1.s[2]\n"
+                    "fmla z30.s, z14.s, z2.s[2]\n"
+                    "fmla z31.s, z14.s, z3.s[2]\n"
+                    "fmla z28.s, z15.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+                    "fmla z29.s, z15.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+                    "fmla z30.s, z15.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+                    "fmla z31.s, z15.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+                    "fmla z28.s, z16.s, z0.s[0]\n"
+                    "fmla z29.s, z16.s, z1.s[0]\n"
+                    "fmla z30.s, z16.s, z2.s[0]\n"
+                    "fmla z31.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z0.s[1]\n"
+                    "fmla z29.s, z17.s, z1.s[1]\n"
+                    "fmla z30.s, z17.s, z2.s[1]\n"
+                    "fmla z31.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z0.s[2]\n"
+                    "fmla z29.s, z18.s, z1.s[2]\n"
+                    "fmla z30.s, z18.s, z2.s[2]\n"
+                    "fmla z31.s, z18.s, z3.s[2]\n"
+                    "fmla z28.s, z19.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+                    "fmla z29.s, z19.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+                    "fmla z30.s, z19.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+                    "fmla z31.s, z19.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+                    "fmla z28.s, z20.s, z0.s[0]\n"
+                    "fmla z29.s, z20.s, z1.s[0]\n"
+                    "fmla z30.s, z20.s, z2.s[0]\n"
+                    "fmla z31.s, z20.s, z3.s[0]\n"
+                    "fmla z28.s, z21.s, z0.s[1]\n"
+                    "fmla z29.s, z21.s, z1.s[1]\n"
+                    "fmla z30.s, z21.s, z2.s[1]\n"
+                    "fmla z31.s, z21.s, z3.s[1]\n"
+                    "fmla z28.s, z22.s, z0.s[2]\n"
+                    "fmla z29.s, z22.s, z1.s[2]\n"
+                    "fmla z30.s, z22.s, z2.s[2]\n"
+                    "fmla z31.s, z22.s, z3.s[2]\n"
+                    "fmla z28.s, z23.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x50]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+                    "fmla z29.s, z23.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [a_ptr1, #0x50]\n"
+                    "fmla z30.s, z23.s, z2.s[3]\n"
+                    "ld1rqw z2.s, p7/z, [a_ptr2, #0x50]\n"
+                    "fmla z31.s, z23.s, z3.s[3]\n"
+                    "ld1rqw z3.s, p7/z, [a_ptr3, #0x50]\n"
+                    "fmla z28.s, z24.s, z0.s[0]\n"
+                    "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+                    "fmla z29.s, z24.s, z1.s[0]\n"
+                    "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+                    "fmla z30.s, z24.s, z2.s[0]\n"
+                    "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+                    "fmla z31.s, z24.s, z3.s[0]\n"
+                    "fmla z28.s, z25.s, z0.s[1]\n"
+                    "fmla z29.s, z25.s, z1.s[1]\n"
+                    "fmla z30.s, z25.s, z2.s[1]\n"
+                    "fmla z31.s, z25.s, z3.s[1]\n"
+                    "fmla z28.s, z26.s, z0.s[2]\n"
+                    "fmla z29.s, z26.s, z1.s[2]\n"
+                    "fmla z30.s, z26.s, z2.s[2]\n"
+                    "fmla z31.s, z26.s, z3.s[2]\n"
+                    "fmla z28.s, z27.s, z0.s[3]\n"
+                    "fmla z29.s, z27.s, z1.s[3]\n"
+                    "fmla z30.s, z27.s, z2.s[3]\n"
+                    "fmla z31.s, z27.s, z3.s[3]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+                    "st1w z29.s, p0, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+                    "st1w z30.s, p0, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+                    "st1w z31.s, p0, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+                    "b.ne 2b\n"
+                    "1:\n"
+                    "cbz %[oddrows], 5f\n"
+                    "6:\n"
+                    "cbz %[beta0], 7f\n"
+                    "mov z28.s, #0\n"
+                    "b 8f\n"
+                    "7:\n"
+                    "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+                    "8:\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                    "subs %[oddrows], %[oddrows], #0x1\n"
+                    "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+                    "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+                    "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+                    "fmla z28.s, z4.s, z0.s[0]\n"
+                    "fmla z28.s, z5.s, z0.s[1]\n"
+                    "fmla z28.s, z6.s, z0.s[2]\n"
+                    "fmla z28.s, z7.s, z0.s[3]\n"
+                    "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+                    "fmla z28.s, z8.s, z1.s[0]\n"
+                    "fmla z28.s, z9.s, z1.s[1]\n"
+                    "fmla z28.s, z10.s, z1.s[2]\n"
+                    "fmla z28.s, z11.s, z1.s[3]\n"
+                    "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x50]\n"
+                    "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+                    "fmla z28.s, z12.s, z2.s[0]\n"
+                    "fmla z28.s, z13.s, z2.s[1]\n"
+                    "fmla z28.s, z14.s, z2.s[2]\n"
+                    "fmla z28.s, z15.s, z2.s[3]\n"
+                    "fmla z28.s, z16.s, z3.s[0]\n"
+                    "fmla z28.s, z17.s, z3.s[1]\n"
+                    "fmla z28.s, z18.s, z3.s[2]\n"
+                    "fmla z28.s, z19.s, z3.s[3]\n"
+                    "fmla z28.s, z20.s, z0.s[0]\n"
+                    "fmla z28.s, z21.s, z0.s[1]\n"
+                    "fmla z28.s, z22.s, z0.s[2]\n"
+                    "fmla z28.s, z23.s, z0.s[3]\n"
+                    "fmla z28.s, z24.s, z1.s[0]\n"
+                    "fmla z28.s, z25.s, z1.s[1]\n"
+                    "fmla z28.s, z26.s, z1.s[2]\n"
+                    "fmla z28.s, z27.s, z1.s[3]\n"
+                    "st1w z28.s, p0, [%[c_ptr0]]\n"
+                    "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+                    "b.ne 6b\n"
+                    "5:\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                );
+                break;
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
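
(Note on the generated kernel above: each `case N:` block is a variant of the same SVE routine, differing only in how many rows of the B panel (22, 23 or 24 here) are held resident in z4 upwards. Four rows of C are accumulated in z28..z31 per main-loop pass, the `oddrows` path repeats the work one row at a time, and the accumulators start at zero when `beta0` is set but are preloaded from C otherwise. The predicated loads built with `whilelt` (p6 against `odd_depth`, p0 against `width`) mask the final quadword of A and the partial vector of C. As a minimal scalar sketch of what one accumulator column group computes; `c_row`, `a_row`, `b_panel`, `depth` and `vec_len` are illustrative names, not identifiers from the diff:

    // Illustrative scalar model of the fmla-by-element accumulation above.
    static void sketch_accumulate(float *c_row, const float *a_row,
                                  const float *b_panel, int depth, int vec_len)
    {
        for (int k = 0; k < depth; k++)           // depth matches the case label (22/23/24)
            for (int j = 0; j < vec_len; j++)     // vec_len: one SVE vector of floats
                c_row[j] += a_row[k] * b_panel[k * vec_len + j];
    }

)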
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp32_12x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp32_12x8.hpp
new file mode 100644
index 0000000..fcdca59
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp32_12x8.hpp
@@ -0,0 +1,1660 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
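+// Merges one 12x8 accumulator block back into the row-major output array:
+// each element is written as alpha * accumulator, plus beta * the previous
+// output value when beta is non-zero (the beta == 0 case, shown below, writes
+// alpha * accumulator directly). Full 12-wide column blocks take the
+// hand-written asm fast path; the ragged right edge ((i + 11) >= xmax) falls
+// back to the scalar loops, with one variant per remaining row count (height).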
+template<>
+inline void MergeResults<12, 8, false>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float alpha, const float beta)
+{
+    const float *inptr = in;
+
+    for (int y=y0; y<ymax; y+=8) {
+        float *outptr0 = out + (y * ldout) + x0;
+        float *outptr1 = outptr0 + ldout;
+        float *outptr2 = outptr1 + ldout;
+        float *outptr3 = outptr2 + ldout;
+        float *outptr4 = outptr3 + ldout;
+        float *outptr5 = outptr4 + ldout;
+        float *outptr6 = outptr5 + ldout;
+        float *outptr7 = outptr6 + ldout;
+
+        const int height = ymax - y;
+
+        for (int i=x0; i<xmax; i+=12) {
+            if (beta==0.0f)
+            {
+                switch(height) {
+                case 1:
+                    {
+                        if ((i+11) >= xmax)
+                        {
+                            for (int xi=0; xi<12; xi++)
+                            {
+                                if ((i+xi) < xmax)
+                                {
+                                    *outptr0 = (alpha * inptr[xi]);
+                                    outptr0++;
+                                }
+                            }
+                            inptr += 96;
+                        } else {
+                            /* Optimized routine to copy an entire block */
+                            __asm __volatile (
+                                "ldr q4, [%[inptr]]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                                "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr0]]\n"
+                                "ldr q5, [%[inptr], #0x10]\n"
+                                "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+                                "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr0], #0x10]\n"
+                                "ldr q6, [%[inptr], #0x20]\n"
+                                "add %[inptr], %[inptr], #0x180\n"
+                                "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr0], #0x20]\n"
+                                "add %[outptr0], %[outptr0], #0x30\n"
+                            : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                              [inptr] "+r" (inptr)
+                            : [alpha] "w" (alpha), [beta] "w" (beta)
+                            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+                            );
+                        }
+                    }
+                    break;
+
+                case 2:
+                    {
+                        if ((i+11) >= xmax)
+                        {
+                            for (int xi=0; xi<12; xi++)
+                            {
+                                if ((i+xi) < xmax)
+                                {
+                                    *outptr0 = (alpha * inptr[xi]);
+                                    outptr0++;
+                                    *outptr1 = (alpha * inptr[xi + 12]);
+                                    outptr1++;
+                                }
+                            }
+                            inptr += 96;
+                        } else {
+                            /* Optimized routine to copy an entire block */
+                            __asm __volatile (
+                                "ldr q4, [%[inptr]]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                                "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr0]]\n"
+                                "ldr q5, [%[inptr], #0x30]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+                                "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr1]]\n"
+                                "ldr q6, [%[inptr], #0x10]\n"
+                                "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+                                "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr0], #0x10]\n"
+                                "ldr q7, [%[inptr], #0x40]\n"
+                                "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
+                                "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr1], #0x10]\n"
+                                "ldr q4, [%[inptr], #0x20]\n"
+                                "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr0], #0x20]\n"
+                                "ldr q5, [%[inptr], #0x50]\n"
+                                "add %[outptr0], %[outptr0], #0x30\n"
+                                "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr1], #0x20]\n"
+                                "add %[outptr1], %[outptr1], #0x30\n"
+                                "add %[inptr], %[inptr], #0x180\n"
+                            : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                              [inptr] "+r" (inptr)
+                            : [alpha] "w" (alpha), [beta] "w" (beta)
+                            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+                            );
+                        }
+                    }
+                    break;
+
+                case 3:
+                    {
+                        if ((i+11) >= xmax)
+                        {
+                            for (int xi=0; xi<12; xi++)
+                            {
+                                if ((i+xi) < xmax)
+                                {
+                                    *outptr0 = (alpha * inptr[xi]);
+                                    outptr0++;
+                                    *outptr1 = (alpha * inptr[xi + 12]);
+                                    outptr1++;
+                                    *outptr2 = (alpha * inptr[xi + 24]);
+                                    outptr2++;
+                                }
+                            }
+                            inptr += 96;
+                        } else {
+                            /* Optimized routine to copy an entire block */
+                            __asm __volatile (
+                                "ldr q4, [%[inptr]]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                                "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr0]]\n"
+                                "ldr q5, [%[inptr], #0x30]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+                                "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr1]]\n"
+                                "ldr q6, [%[inptr], #0x60]\n"
+                                "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+                                "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr2]]\n"
+                                "ldr q7, [%[inptr], #0x10]\n"
+                                "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
+                                "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr0], #0x10]\n"
+                                "ldr q4, [%[inptr], #0x40]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+                                "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr1], #0x10]\n"
+                                "ldr q5, [%[inptr], #0x70]\n"
+                                "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+                                "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr2], #0x10]\n"
+                                "ldr q6, [%[inptr], #0x20]\n"
+                                "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr0], #0x20]\n"
+                                "ldr q7, [%[inptr], #0x50]\n"
+                                "add %[outptr0], %[outptr0], #0x30\n"
+                                "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr1], #0x20]\n"
+                                "ldr q4, [%[inptr], #0x80]\n"
+                                "add %[outptr1], %[outptr1], #0x30\n"
+                                "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr2], #0x20]\n"
+                                "add %[outptr2], %[outptr2], #0x30\n"
+                                "add %[inptr], %[inptr], #0x180\n"
+                            : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                              [inptr] "+r" (inptr)
+                            : [alpha] "w" (alpha), [beta] "w" (beta)
+                            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+                            );
+                        }
+                    }
+                    break;
+
+                case 4:
+                    {
+                        if ((i+11) >= xmax)
+                        {
+                            for (int xi=0; xi<12; xi++)
+                            {
+                                if ((i+xi) < xmax)
+                                {
+                                    *outptr0 = (alpha * inptr[xi]);
+                                    outptr0++;
+                                    *outptr1 = (alpha * inptr[xi + 12]);
+                                    outptr1++;
+                                    *outptr2 = (alpha * inptr[xi + 24]);
+                                    outptr2++;
+                                    *outptr3 = (alpha * inptr[xi + 36]);
+                                    outptr3++;
+                                }
+                            }
+                            inptr += 96;
+                        } else {
+                            /* Optimized routine to copy an entire block */
+                            __asm __volatile (
+                                "ldr q4, [%[inptr]]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                                "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr0]]\n"
+                                "ldr q5, [%[inptr], #0x30]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+                                "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr1]]\n"
+                                "ldr q6, [%[inptr], #0x60]\n"
+                                "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+                                "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr2]]\n"
+                                "ldr q7, [%[inptr], #0x90]\n"
+                                "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
+                                "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr3]]\n"
+                                "ldr q4, [%[inptr], #0x10]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+                                "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr0], #0x10]\n"
+                                "ldr q5, [%[inptr], #0x40]\n"
+                                "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+                                "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr1], #0x10]\n"
+                                "ldr q6, [%[inptr], #0x70]\n"
+                                "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+                                "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr2], #0x10]\n"
+                                "ldr q7, [%[inptr], #0xa0]\n"
+                                "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr3], #0x10]\n"
+                                "ldr q4, [%[inptr], #0x20]\n"
+                                "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr0], #0x20]\n"
+                                "ldr q5, [%[inptr], #0x50]\n"
+                                "add %[outptr0], %[outptr0], #0x30\n"
+                                "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr1], #0x20]\n"
+                                "ldr q6, [%[inptr], #0x80]\n"
+                                "add %[outptr1], %[outptr1], #0x30\n"
+                                "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr2], #0x20]\n"
+                                "ldr q7, [%[inptr], #0xb0]\n"
+                                "add %[outptr2], %[outptr2], #0x30\n"
+                                "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr3], #0x20]\n"
+                                "add %[outptr3], %[outptr3], #0x30\n"
+                                "add %[inptr], %[inptr], #0x180\n"
+                            : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                              [inptr] "+r" (inptr)
+                            : [alpha] "w" (alpha), [beta] "w" (beta)
+                            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+                            );
+                        }
+                    }
+                    break;
+
+                case 5:
+                    {
+                        if ((i+11) >= xmax)
+                        {
+                            for (int xi=0; xi<12; xi++)
+                            {
+                                if ((i+xi) < xmax)
+                                {
+                                    *outptr0 = (alpha * inptr[xi]);
+                                    outptr0++;
+                                    *outptr1 = (alpha * inptr[xi + 12]);
+                                    outptr1++;
+                                    *outptr2 = (alpha * inptr[xi + 24]);
+                                    outptr2++;
+                                    *outptr3 = (alpha * inptr[xi + 36]);
+                                    outptr3++;
+                                    *outptr4 = (alpha * inptr[xi + 48]);
+                                    outptr4++;
+                                }
+                            }
+                            inptr += 96;
+                        } else {
+                            /* Optimized routine to copy an entire block */
+                            __asm __volatile (
+                                "ldr q4, [%[inptr]]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                                "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr0]]\n"
+                                "ldr q5, [%[inptr], #0x30]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+                                "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr1]]\n"
+                                "ldr q6, [%[inptr], #0x60]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+                                "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr2]]\n"
+                                "ldr q7, [%[inptr], #0x90]\n"
+                                "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+                                "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr3]]\n"
+                                "ldr q4, [%[inptr], #0xc0]\n"
+                                "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
+                                "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr4]]\n"
+                                "ldr q5, [%[inptr], #0x10]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+                                "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr0], #0x10]\n"
+                                "ldr q6, [%[inptr], #0x40]\n"
+                                "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+                                "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr1], #0x10]\n"
+                                "ldr q7, [%[inptr], #0x70]\n"
+                                "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+                                "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr2], #0x10]\n"
+                                "ldr q4, [%[inptr], #0xa0]\n"
+                                "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+                                "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr3], #0x10]\n"
+                                "ldr q5, [%[inptr], #0xd0]\n"
+                                "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr4], #0x10]\n"
+                                "ldr q6, [%[inptr], #0x20]\n"
+                                "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr0], #0x20]\n"
+                                "ldr q7, [%[inptr], #0x50]\n"
+                                "add %[outptr0], %[outptr0], #0x30\n"
+                                "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr1], #0x20]\n"
+                                "ldr q4, [%[inptr], #0x80]\n"
+                                "add %[outptr1], %[outptr1], #0x30\n"
+                                "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr2], #0x20]\n"
+                                "ldr q5, [%[inptr], #0xb0]\n"
+                                "add %[outptr2], %[outptr2], #0x30\n"
+                                "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr3], #0x20]\n"
+                                "ldr q6, [%[inptr], #0xe0]\n"
+                                "add %[outptr3], %[outptr3], #0x30\n"
+                                "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr4], #0x20]\n"
+                                "add %[outptr4], %[outptr4], #0x30\n"
+                                "add %[inptr], %[inptr], #0x180\n"
+                            : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                              [inptr] "+r" (inptr)
+                            : [alpha] "w" (alpha), [beta] "w" (beta)
+                            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+                            );
+                        }
+                    }
+                    break;
+
+                case 6:
+                    {
+                        if ((i+11) >= xmax)
+                        {
+                            for (int xi=0; xi<12; xi++)
+                            {
+                                if ((i+xi) < xmax)
+                                {
+                                    *outptr0 = (alpha * inptr[xi]);
+                                    outptr0++;
+                                    *outptr1 = (alpha * inptr[xi + 12]);
+                                    outptr1++;
+                                    *outptr2 = (alpha * inptr[xi + 24]);
+                                    outptr2++;
+                                    *outptr3 = (alpha * inptr[xi + 36]);
+                                    outptr3++;
+                                    *outptr4 = (alpha * inptr[xi + 48]);
+                                    outptr4++;
+                                    *outptr5 = (alpha * inptr[xi + 60]);
+                                    outptr5++;
+                                }
+                            }
+                            inptr += 96;
+                        } else {
+                            /* Optimized routine to copy an entire block */
+                            __asm __volatile (
+                                "ldr q4, [%[inptr]]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                                "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr0]]\n"
+                                "ldr q5, [%[inptr], #0x30]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+                                "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr1]]\n"
+                                "ldr q6, [%[inptr], #0x60]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+                                "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr2]]\n"
+                                "ldr q7, [%[inptr], #0x90]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+                                "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr3]]\n"
+                                "ldr q4, [%[inptr], #0xc0]\n"
+                                "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+                                "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr4]]\n"
+                                "ldr q5, [%[inptr], #0xf0]\n"
+                                "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
+                                "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr5]]\n"
+                                "ldr q6, [%[inptr], #0x10]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+                                "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr0], #0x10]\n"
+                                "ldr q7, [%[inptr], #0x40]\n"
+                                "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+                                "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr1], #0x10]\n"
+                                "ldr q4, [%[inptr], #0x70]\n"
+                                "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+                                "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr2], #0x10]\n"
+                                "ldr q5, [%[inptr], #0xa0]\n"
+                                "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+                                "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr3], #0x10]\n"
+                                "ldr q6, [%[inptr], #0xd0]\n"
+                                "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+                                "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr4], #0x10]\n"
+                                "ldr q7, [%[inptr], #0x100]\n"
+                                "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr5], #0x10]\n"
+                                "ldr q4, [%[inptr], #0x20]\n"
+                                "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr0], #0x20]\n"
+                                "ldr q5, [%[inptr], #0x50]\n"
+                                "add %[outptr0], %[outptr0], #0x30\n"
+                                "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr1], #0x20]\n"
+                                "ldr q6, [%[inptr], #0x80]\n"
+                                "add %[outptr1], %[outptr1], #0x30\n"
+                                "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr2], #0x20]\n"
+                                "ldr q7, [%[inptr], #0xb0]\n"
+                                "add %[outptr2], %[outptr2], #0x30\n"
+                                "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr3], #0x20]\n"
+                                "ldr q4, [%[inptr], #0xe0]\n"
+                                "add %[outptr3], %[outptr3], #0x30\n"
+                                "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr4], #0x20]\n"
+                                "ldr q5, [%[inptr], #0x110]\n"
+                                "add %[outptr4], %[outptr4], #0x30\n"
+                                "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr5], #0x20]\n"
+                                "add %[outptr5], %[outptr5], #0x30\n"
+                                "add %[inptr], %[inptr], #0x180\n"
+                            : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                              [inptr] "+r" (inptr)
+                            : [alpha] "w" (alpha), [beta] "w" (beta)
+                            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+                            );
+                        }
+                    }
+                    break;
+
+                case 7:
+                    {
+                        if ((i+11) >= xmax)
+                        {
+                            for (int xi=0; xi<12; xi++)
+                            {
+                                if ((i+xi) < xmax)
+                                {
+                                    *outptr0 = (alpha * inptr[xi]);
+                                    outptr0++;
+                                    *outptr1 = (alpha * inptr[xi + 12]);
+                                    outptr1++;
+                                    *outptr2 = (alpha * inptr[xi + 24]);
+                                    outptr2++;
+                                    *outptr3 = (alpha * inptr[xi + 36]);
+                                    outptr3++;
+                                    *outptr4 = (alpha * inptr[xi + 48]);
+                                    outptr4++;
+                                    *outptr5 = (alpha * inptr[xi + 60]);
+                                    outptr5++;
+                                    *outptr6 = (alpha * inptr[xi + 72]);
+                                    outptr6++;
+                                }
+                            }
+                            inptr += 96;
+                        } else {
+                            /* Optimized routine to copy an entire block */
+                            __asm __volatile (
+                                "ldr q4, [%[inptr]]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                                "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr0]]\n"
+                                "ldr q5, [%[inptr], #0x30]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+                                "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr1]]\n"
+                                "ldr q6, [%[inptr], #0x60]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+                                "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr2]]\n"
+                                "ldr q7, [%[inptr], #0x90]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+                                "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr3]]\n"
+                                "ldr q4, [%[inptr], #0xc0]\n"
+                                "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+                                "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr4]]\n"
+                                "ldr q5, [%[inptr], #0xf0]\n"
+                                "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
+                                "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr5]]\n"
+                                "ldr q6, [%[inptr], #0x120]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+                                "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr6]]\n"
+                                "ldr q7, [%[inptr], #0x10]\n"
+                                "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+                                "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr0], #0x10]\n"
+                                "ldr q4, [%[inptr], #0x40]\n"
+                                "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+                                "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr1], #0x10]\n"
+                                "ldr q5, [%[inptr], #0x70]\n"
+                                "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+                                "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr2], #0x10]\n"
+                                "ldr q6, [%[inptr], #0xa0]\n"
+                                "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+                                "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr3], #0x10]\n"
+                                "ldr q7, [%[inptr], #0xd0]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+                                "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr4], #0x10]\n"
+                                "ldr q4, [%[inptr], #0x100]\n"
+                                "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
+                                "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr5], #0x10]\n"
+                                "ldr q5, [%[inptr], #0x130]\n"
+                                "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr6], #0x10]\n"
+                                "ldr q6, [%[inptr], #0x20]\n"
+                                "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr0], #0x20]\n"
+                                "ldr q7, [%[inptr], #0x50]\n"
+                                "add %[outptr0], %[outptr0], #0x30\n"
+                                "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr1], #0x20]\n"
+                                "ldr q4, [%[inptr], #0x80]\n"
+                                "add %[outptr1], %[outptr1], #0x30\n"
+                                "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr2], #0x20]\n"
+                                "ldr q5, [%[inptr], #0xb0]\n"
+                                "add %[outptr2], %[outptr2], #0x30\n"
+                                "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr3], #0x20]\n"
+                                "ldr q6, [%[inptr], #0xe0]\n"
+                                "add %[outptr3], %[outptr3], #0x30\n"
+                                "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr4], #0x20]\n"
+                                "ldr q7, [%[inptr], #0x110]\n"
+                                "add %[outptr4], %[outptr4], #0x30\n"
+                                "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr5], #0x20]\n"
+                                "ldr q4, [%[inptr], #0x140]\n"
+                                "add %[outptr5], %[outptr5], #0x30\n"
+                                "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr6], #0x20]\n"
+                                "add %[outptr6], %[outptr6], #0x30\n"
+                                "add %[inptr], %[inptr], #0x180\n"
+                            : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                              [inptr] "+r" (inptr)
+                            : [alpha] "w" (alpha), [beta] "w" (beta)
+                            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+                            );
+                        }
+                    }
+                    break;
+
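+                /* Blocks with the full height of 8 rows (also the default case) scale and store all eight output rows */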
+                default:
+                case 8:
+                    {
+                        if ((i+11) >= xmax)
+                        {
+                            for (int xi=0; xi<12; xi++)
+                            {
+                                if ((i+xi) < xmax)
+                                {
+                                    *outptr0 = (alpha * inptr[xi]);
+                                    outptr0++;
+                                    *outptr1 = (alpha * inptr[xi + 12]);
+                                    outptr1++;
+                                    *outptr2 = (alpha * inptr[xi + 24]);
+                                    outptr2++;
+                                    *outptr3 = (alpha * inptr[xi + 36]);
+                                    outptr3++;
+                                    *outptr4 = (alpha * inptr[xi + 48]);
+                                    outptr4++;
+                                    *outptr5 = (alpha * inptr[xi + 60]);
+                                    outptr5++;
+                                    *outptr6 = (alpha * inptr[xi + 72]);
+                                    outptr6++;
+                                    *outptr7 = (alpha * inptr[xi + 84]);
+                                    outptr7++;
+                                }
+                            }
+                            inptr += 96;
+                        } else {
+                            /* Optimized routine to copy an entire block */
+                            __asm __volatile (
+                                "ldr q4, [%[inptr]]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                                "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr0]]\n"
+                                "ldr q5, [%[inptr], #0x30]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+                                "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr1]]\n"
+                                "ldr q6, [%[inptr], #0x60]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+                                "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr2]]\n"
+                                "ldr q7, [%[inptr], #0x90]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+                                "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr3]]\n"
+                                "ldr q4, [%[inptr], #0xc0]\n"
+                                "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+                                "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr4]]\n"
+                                "ldr q5, [%[inptr], #0xf0]\n"
+                                "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
+                                "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr5]]\n"
+                                "ldr q6, [%[inptr], #0x120]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+                                "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr6]]\n"
+                                "ldr q7, [%[inptr], #0x150]\n"
+                                "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+                                "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr7]]\n"
+                                "ldr q4, [%[inptr], #0x10]\n"
+                                "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+                                "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr0], #0x10]\n"
+                                "ldr q5, [%[inptr], #0x40]\n"
+                                "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+                                "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr1], #0x10]\n"
+                                "ldr q6, [%[inptr], #0x70]\n"
+                                "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+                                "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr2], #0x10]\n"
+                                "ldr q7, [%[inptr], #0xa0]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+                                "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr3], #0x10]\n"
+                                "ldr q4, [%[inptr], #0xd0]\n"
+                                "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
+                                "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr4], #0x10]\n"
+                                "ldr q5, [%[inptr], #0x100]\n"
+                                "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
+                                "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr5], #0x10]\n"
+                                "ldr q6, [%[inptr], #0x130]\n"
+                                "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr6], #0x10]\n"
+                                "ldr q7, [%[inptr], #0x160]\n"
+                                "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr7], #0x10]\n"
+                                "ldr q4, [%[inptr], #0x20]\n"
+                                "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr0], #0x20]\n"
+                                "ldr q5, [%[inptr], #0x50]\n"
+                                "add %[outptr0], %[outptr0], #0x30\n"
+                                "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr1], #0x20]\n"
+                                "ldr q6, [%[inptr], #0x80]\n"
+                                "add %[outptr1], %[outptr1], #0x30\n"
+                                "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr2], #0x20]\n"
+                                "ldr q7, [%[inptr], #0xb0]\n"
+                                "add %[outptr2], %[outptr2], #0x30\n"
+                                "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr3], #0x20]\n"
+                                "ldr q4, [%[inptr], #0xe0]\n"
+                                "add %[outptr3], %[outptr3], #0x30\n"
+                                "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr4], #0x20]\n"
+                                "ldr q5, [%[inptr], #0x110]\n"
+                                "add %[outptr4], %[outptr4], #0x30\n"
+                                "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr5], #0x20]\n"
+                                "ldr q6, [%[inptr], #0x140]\n"
+                                "add %[outptr5], %[outptr5], #0x30\n"
+                                "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr6], #0x20]\n"
+                                "ldr q7, [%[inptr], #0x170]\n"
+                                "add %[outptr6], %[outptr6], #0x30\n"
+                                "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr7], #0x20]\n"
+                                "add %[outptr7], %[outptr7], #0x30\n"
+                                "add %[inptr], %[inptr], #0x180\n"
+                            : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                              [inptr] "+r" (inptr)
+                            : [alpha] "w" (alpha), [beta] "w" (beta)
+                            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+                            );
+                        }
+                    }
+                    break;
+
+
+                }
+            }
+            else
+            {
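+                /* Merge variant that also reads back the existing output: each row is scaled by beta and accumulated with alpha times the new block before being stored */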
+                switch(height) {
+                case 1:
+                    {
+                        if ((i+11) >= xmax)
+                        {
+                            for (int xi=0; xi<12; xi++)
+                            {
+                                if ((i+xi) < xmax)
+                                {
+                                    *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+                                    outptr0++;
+                                }
+                            }
+                            inptr += 96;
+                        } else {
+                            /* Optimized routine to copy an entire block */
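+                            /* Per 4-float vector: load the existing output, scale it by beta, fmla in alpha times the accumulator, then store the result back */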
+                            __asm __volatile (
+                                "ldr q8, [%[outptr0]]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                                "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+                                "ldr q4, [%[inptr]]\n"
+                                "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr0]]\n"
+                                "ldr q9, [%[outptr0], #0x10]\n"
+                                "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+                                "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+                                "ldr q5, [%[inptr], #0x10]\n"
+                                "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr0], #0x10]\n"
+                                "ldr q10, [%[outptr0], #0x20]\n"
+                                "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+                                "ldr q6, [%[inptr], #0x20]\n"
+                                "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr0], #0x20]\n"
+                                "add %[outptr0], %[outptr0], #0x30\n"
+                                "add %[inptr], %[inptr], #0x180\n"
+                            : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                              [inptr] "+r" (inptr)
+                            : [alpha] "w" (alpha), [beta] "w" (beta)
+                            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+                            );
+                        }
+                    }
+                    break;
+
+                case 2:
+                    {
+                        if ((i+11) >= xmax)
+                        {
+                            for (int xi=0; xi<12; xi++)
+                            {
+                                if ((i+xi) < xmax)
+                                {
+                                    *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+                                    outptr0++;
+                                    *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+                                    outptr1++;
+                                }
+                            }
+                            inptr += 96;
+                        } else {
+                            /* Optimized routine to copy an entire block */
+                            __asm __volatile (
+                                "ldr q8, [%[outptr0]]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                                "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+                                "ldr q4, [%[inptr]]\n"
+                                "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr0]]\n"
+                                "ldr q9, [%[outptr1]]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+                                "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+                                "ldr q5, [%[inptr], #0x30]\n"
+                                "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr1]]\n"
+                                "ldr q10, [%[outptr0], #0x10]\n"
+                                "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+                                "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+                                "ldr q6, [%[inptr], #0x10]\n"
+                                "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr0], #0x10]\n"
+                                "ldr q11, [%[outptr1], #0x10]\n"
+                                "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
+                                "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+                                "ldr q7, [%[inptr], #0x40]\n"
+                                "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr1], #0x10]\n"
+                                "ldr q8, [%[outptr0], #0x20]\n"
+                                "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+                                "ldr q4, [%[inptr], #0x20]\n"
+                                "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr0], #0x20]\n"
+                                "ldr q9, [%[outptr1], #0x20]\n"
+                                "add %[outptr0], %[outptr0], #0x30\n"
+                                "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+                                "ldr q5, [%[inptr], #0x50]\n"
+                                "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr1], #0x20]\n"
+                                "add %[outptr1], %[outptr1], #0x30\n"
+                                "add %[inptr], %[inptr], #0x180\n"
+                            : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                              [inptr] "+r" (inptr)
+                            : [alpha] "w" (alpha), [beta] "w" (beta)
+                            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+                            );
+                        }
+                    }
+                    break;
+
+                case 3:
+                    {
+                        if ((i+11) >= xmax)
+                        {
+                            for (int xi=0; xi<12; xi++)
+                            {
+                                if ((i+xi) < xmax)
+                                {
+                                    *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+                                    outptr0++;
+                                    *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+                                    outptr1++;
+                                    *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
+                                    outptr2++;
+                                }
+                            }
+                            inptr += 96;
+                        } else {
+                            /* Optimized routine to copy an entire block */
+                            __asm __volatile (
+                                "ldr q8, [%[outptr0]]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                                "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+                                "ldr q4, [%[inptr]]\n"
+                                "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr0]]\n"
+                                "ldr q9, [%[outptr1]]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+                                "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+                                "ldr q5, [%[inptr], #0x30]\n"
+                                "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr1]]\n"
+                                "ldr q10, [%[outptr2]]\n"
+                                "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+                                "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+                                "ldr q6, [%[inptr], #0x60]\n"
+                                "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr2]]\n"
+                                "ldr q11, [%[outptr0], #0x10]\n"
+                                "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
+                                "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+                                "ldr q7, [%[inptr], #0x10]\n"
+                                "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr0], #0x10]\n"
+                                "ldr q8, [%[outptr1], #0x10]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+                                "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+                                "ldr q4, [%[inptr], #0x40]\n"
+                                "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr1], #0x10]\n"
+                                "ldr q9, [%[outptr2], #0x10]\n"
+                                "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
+                                "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+                                "ldr q5, [%[inptr], #0x70]\n"
+                                "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr2], #0x10]\n"
+                                "ldr q10, [%[outptr0], #0x20]\n"
+                                "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+                                "ldr q6, [%[inptr], #0x20]\n"
+                                "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr0], #0x20]\n"
+                                "ldr q11, [%[outptr1], #0x20]\n"
+                                "add %[outptr0], %[outptr0], #0x30\n"
+                                "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+                                "ldr q7, [%[inptr], #0x50]\n"
+                                "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr1], #0x20]\n"
+                                "ldr q8, [%[outptr2], #0x20]\n"
+                                "add %[outptr1], %[outptr1], #0x30\n"
+                                "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+                                "ldr q4, [%[inptr], #0x80]\n"
+                                "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr2], #0x20]\n"
+                                "add %[outptr2], %[outptr2], #0x30\n"
+                                "add %[inptr], %[inptr], #0x180\n"
+                            : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                              [inptr] "+r" (inptr)
+                            : [alpha] "w" (alpha), [beta] "w" (beta)
+                            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+                            );
+                        }
+                    }
+                    break;
+
+                case 4:
+                    {
+                        if ((i+11) >= xmax)
+                        {
+                            for (int xi=0; xi<12; xi++)
+                            {
+                                if ((i+xi) < xmax)
+                                {
+                                    *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+                                    outptr0++;
+                                    *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+                                    outptr1++;
+                                    *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
+                                    outptr2++;
+                                    *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
+                                    outptr3++;
+                                }
+                            }
+                            inptr += 96;
+                        } else {
+                            /* Optimized routine to copy an entire block */
+                            __asm __volatile (
+                                "ldr q8, [%[outptr0]]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                                "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+                                "ldr q4, [%[inptr]]\n"
+                                "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr0]]\n"
+                                "ldr q9, [%[outptr1]]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+                                "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+                                "ldr q5, [%[inptr], #0x30]\n"
+                                "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr1]]\n"
+                                "ldr q10, [%[outptr2]]\n"
+                                "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+                                "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+                                "ldr q6, [%[inptr], #0x60]\n"
+                                "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr2]]\n"
+                                "ldr q11, [%[outptr3]]\n"
+                                "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
+                                "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+                                "ldr q7, [%[inptr], #0x90]\n"
+                                "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr3]]\n"
+                                "ldr q8, [%[outptr0], #0x10]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+                                "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+                                "ldr q4, [%[inptr], #0x10]\n"
+                                "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr0], #0x10]\n"
+                                "ldr q9, [%[outptr1], #0x10]\n"
+                                "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
+                                "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+                                "ldr q5, [%[inptr], #0x40]\n"
+                                "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr1], #0x10]\n"
+                                "ldr q10, [%[outptr2], #0x10]\n"
+                                "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
+                                "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+                                "ldr q6, [%[inptr], #0x70]\n"
+                                "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr2], #0x10]\n"
+                                "ldr q11, [%[outptr3], #0x10]\n"
+                                "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+                                "ldr q7, [%[inptr], #0xa0]\n"
+                                "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr3], #0x10]\n"
+                                "ldr q8, [%[outptr0], #0x20]\n"
+                                "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+                                "ldr q4, [%[inptr], #0x20]\n"
+                                "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr0], #0x20]\n"
+                                "ldr q9, [%[outptr1], #0x20]\n"
+                                "add %[outptr0], %[outptr0], #0x30\n"
+                                "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+                                "ldr q5, [%[inptr], #0x50]\n"
+                                "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr1], #0x20]\n"
+                                "ldr q10, [%[outptr2], #0x20]\n"
+                                "add %[outptr1], %[outptr1], #0x30\n"
+                                "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+                                "ldr q6, [%[inptr], #0x80]\n"
+                                "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr2], #0x20]\n"
+                                "ldr q11, [%[outptr3], #0x20]\n"
+                                "add %[outptr2], %[outptr2], #0x30\n"
+                                "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+                                "ldr q7, [%[inptr], #0xb0]\n"
+                                "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr3], #0x20]\n"
+                                "add %[outptr3], %[outptr3], #0x30\n"
+                                "add %[inptr], %[inptr], #0x180\n"
+                            : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                              [inptr] "+r" (inptr)
+                            : [alpha] "w" (alpha), [beta] "w" (beta)
+                            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+                            );
+                        }
+                    }
+                    break;
+
+                case 5:
+                    {
+                        if ((i+11) >= xmax)
+                        {
+                            for (int xi=0; xi<12; xi++)
+                            {
+                                if ((i+xi) < xmax)
+                                {
+                                    *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+                                    outptr0++;
+                                    *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+                                    outptr1++;
+                                    *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
+                                    outptr2++;
+                                    *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
+                                    outptr3++;
+                                    *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta);
+                                    outptr4++;
+                                }
+                            }
+                            inptr += 96;
+                        } else {
+                            /* Optimized routine to copy an entire block */
+                            __asm __volatile (
+                                "ldr q8, [%[outptr0]]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                                "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+                                "ldr q4, [%[inptr]]\n"
+                                "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr0]]\n"
+                                "ldr q9, [%[outptr1]]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+                                "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+                                "ldr q5, [%[inptr], #0x30]\n"
+                                "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr1]]\n"
+                                "ldr q10, [%[outptr2]]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+                                "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+                                "ldr q6, [%[inptr], #0x60]\n"
+                                "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr2]]\n"
+                                "ldr q11, [%[outptr3]]\n"
+                                "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+                                "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+                                "ldr q7, [%[inptr], #0x90]\n"
+                                "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr3]]\n"
+                                "ldr q8, [%[outptr4]]\n"
+                                "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
+                                "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+                                "ldr q4, [%[inptr], #0xc0]\n"
+                                "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr4]]\n"
+                                "ldr q9, [%[outptr0], #0x10]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+                                "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+                                "ldr q5, [%[inptr], #0x10]\n"
+                                "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr0], #0x10]\n"
+                                "ldr q10, [%[outptr1], #0x10]\n"
+                                "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
+                                "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+                                "ldr q6, [%[inptr], #0x40]\n"
+                                "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr1], #0x10]\n"
+                                "ldr q11, [%[outptr2], #0x10]\n"
+                                "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
+                                "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+                                "ldr q7, [%[inptr], #0x70]\n"
+                                "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr2], #0x10]\n"
+                                "ldr q8, [%[outptr3], #0x10]\n"
+                                "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
+                                "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+                                "ldr q4, [%[inptr], #0xa0]\n"
+                                "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr3], #0x10]\n"
+                                "ldr q9, [%[outptr4], #0x10]\n"
+                                "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+                                "ldr q5, [%[inptr], #0xd0]\n"
+                                "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr4], #0x10]\n"
+                                "ldr q10, [%[outptr0], #0x20]\n"
+                                "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+                                "ldr q6, [%[inptr], #0x20]\n"
+                                "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr0], #0x20]\n"
+                                "ldr q11, [%[outptr1], #0x20]\n"
+                                "add %[outptr0], %[outptr0], #0x30\n"
+                                "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+                                "ldr q7, [%[inptr], #0x50]\n"
+                                "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr1], #0x20]\n"
+                                "ldr q8, [%[outptr2], #0x20]\n"
+                                "add %[outptr1], %[outptr1], #0x30\n"
+                                "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+                                "ldr q4, [%[inptr], #0x80]\n"
+                                "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr2], #0x20]\n"
+                                "ldr q9, [%[outptr3], #0x20]\n"
+                                "add %[outptr2], %[outptr2], #0x30\n"
+                                "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+                                "ldr q5, [%[inptr], #0xb0]\n"
+                                "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr3], #0x20]\n"
+                                "ldr q10, [%[outptr4], #0x20]\n"
+                                "add %[outptr3], %[outptr3], #0x30\n"
+                                "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+                                "ldr q6, [%[inptr], #0xe0]\n"
+                                "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr4], #0x20]\n"
+                                "add %[outptr4], %[outptr4], #0x30\n"
+                                "add %[inptr], %[inptr], #0x180\n"
+                            : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                              [inptr] "+r" (inptr)
+                            : [alpha] "w" (alpha), [beta] "w" (beta)
+                            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+                            );
+                        }
+                    }
+                    break;
+
+                case 6:
+                    {
+                        if ((i+11) >= xmax)
+                        {
+                            for (int xi=0; xi<12; xi++)
+                            {
+                                if ((i+xi) < xmax)
+                                {
+                                    *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+                                    outptr0++;
+                                    *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+                                    outptr1++;
+                                    *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
+                                    outptr2++;
+                                    *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
+                                    outptr3++;
+                                    *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta);
+                                    outptr4++;
+                                    *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta);
+                                    outptr5++;
+                                }
+                            }
+                            inptr += 96;
+                        } else {
+                            /* Optimized routine to copy an entire block */
+                            __asm __volatile (
+                                "ldr q8, [%[outptr0]]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                                "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+                                "ldr q4, [%[inptr]]\n"
+                                "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr0]]\n"
+                                "ldr q9, [%[outptr1]]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+                                "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+                                "ldr q5, [%[inptr], #0x30]\n"
+                                "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr1]]\n"
+                                "ldr q10, [%[outptr2]]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+                                "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+                                "ldr q6, [%[inptr], #0x60]\n"
+                                "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr2]]\n"
+                                "ldr q11, [%[outptr3]]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+                                "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+                                "ldr q7, [%[inptr], #0x90]\n"
+                                "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr3]]\n"
+                                "ldr q8, [%[outptr4]]\n"
+                                "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+                                "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+                                "ldr q4, [%[inptr], #0xc0]\n"
+                                "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr4]]\n"
+                                "ldr q9, [%[outptr5]]\n"
+                                "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
+                                "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+                                "ldr q5, [%[inptr], #0xf0]\n"
+                                "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr5]]\n"
+                                "ldr q10, [%[outptr0], #0x10]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+                                "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+                                "ldr q6, [%[inptr], #0x10]\n"
+                                "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr0], #0x10]\n"
+                                "ldr q11, [%[outptr1], #0x10]\n"
+                                "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
+                                "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+                                "ldr q7, [%[inptr], #0x40]\n"
+                                "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr1], #0x10]\n"
+                                "ldr q8, [%[outptr2], #0x10]\n"
+                                "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
+                                "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+                                "ldr q4, [%[inptr], #0x70]\n"
+                                "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr2], #0x10]\n"
+                                "ldr q9, [%[outptr3], #0x10]\n"
+                                "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
+                                "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+                                "ldr q5, [%[inptr], #0xa0]\n"
+                                "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr3], #0x10]\n"
+                                "ldr q10, [%[outptr4], #0x10]\n"
+                                "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
+                                "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+                                "ldr q6, [%[inptr], #0xd0]\n"
+                                "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr4], #0x10]\n"
+                                "ldr q11, [%[outptr5], #0x10]\n"
+                                "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+                                "ldr q7, [%[inptr], #0x100]\n"
+                                "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr5], #0x10]\n"
+                                "ldr q8, [%[outptr0], #0x20]\n"
+                                "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+                                "ldr q4, [%[inptr], #0x20]\n"
+                                "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr0], #0x20]\n"
+                                "ldr q9, [%[outptr1], #0x20]\n"
+                                "add %[outptr0], %[outptr0], #0x30\n"
+                                "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+                                "ldr q5, [%[inptr], #0x50]\n"
+                                "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr1], #0x20]\n"
+                                "ldr q10, [%[outptr2], #0x20]\n"
+                                "add %[outptr1], %[outptr1], #0x30\n"
+                                "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+                                "ldr q6, [%[inptr], #0x80]\n"
+                                "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr2], #0x20]\n"
+                                "ldr q11, [%[outptr3], #0x20]\n"
+                                "add %[outptr2], %[outptr2], #0x30\n"
+                                "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+                                "ldr q7, [%[inptr], #0xb0]\n"
+                                "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr3], #0x20]\n"
+                                "ldr q8, [%[outptr4], #0x20]\n"
+                                "add %[outptr3], %[outptr3], #0x30\n"
+                                "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+                                "ldr q4, [%[inptr], #0xe0]\n"
+                                "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr4], #0x20]\n"
+                                "ldr q9, [%[outptr5], #0x20]\n"
+                                "add %[outptr4], %[outptr4], #0x30\n"
+                                "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+                                "ldr q5, [%[inptr], #0x110]\n"
+                                "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr5], #0x20]\n"
+                                "add %[outptr5], %[outptr5], #0x30\n"
+                                "add %[inptr], %[inptr], #0x180\n"
+                            : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                              [inptr] "+r" (inptr)
+                            : [alpha] "w" (alpha), [beta] "w" (beta)
+                            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+                            );
+                        }
+                    }
+                    break;
+
+                case 7:
+                    {
+                        if ((i+11) >= xmax)
+                        {
+                            for (int xi=0; xi<12; xi++)
+                            {
+                                if ((i+xi) < xmax)
+                                {
+                                    *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+                                    outptr0++;
+                                    *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+                                    outptr1++;
+                                    *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
+                                    outptr2++;
+                                    *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
+                                    outptr3++;
+                                    *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta);
+                                    outptr4++;
+                                    *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta);
+                                    outptr5++;
+                                    *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta);
+                                    outptr6++;
+                                }
+                            }
+                            inptr += 96;
+                        } else {
+                            /* Optimized routine to copy an entire block */
+                            __asm __volatile (
+                                "ldr q8, [%[outptr0]]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                                "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+                                "ldr q4, [%[inptr]]\n"
+                                "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr0]]\n"
+                                "ldr q9, [%[outptr1]]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+                                "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+                                "ldr q5, [%[inptr], #0x30]\n"
+                                "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr1]]\n"
+                                "ldr q10, [%[outptr2]]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+                                "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+                                "ldr q6, [%[inptr], #0x60]\n"
+                                "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr2]]\n"
+                                "ldr q11, [%[outptr3]]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+                                "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+                                "ldr q7, [%[inptr], #0x90]\n"
+                                "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr3]]\n"
+                                "ldr q8, [%[outptr4]]\n"
+                                "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+                                "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+                                "ldr q4, [%[inptr], #0xc0]\n"
+                                "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr4]]\n"
+                                "ldr q9, [%[outptr5]]\n"
+                                "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
+                                "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+                                "ldr q5, [%[inptr], #0xf0]\n"
+                                "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr5]]\n"
+                                "ldr q10, [%[outptr6]]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+                                "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+                                "ldr q6, [%[inptr], #0x120]\n"
+                                "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr6]]\n"
+                                "ldr q11, [%[outptr0], #0x10]\n"
+                                "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
+                                "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+                                "ldr q7, [%[inptr], #0x10]\n"
+                                "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr0], #0x10]\n"
+                                "ldr q8, [%[outptr1], #0x10]\n"
+                                "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
+                                "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+                                "ldr q4, [%[inptr], #0x40]\n"
+                                "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr1], #0x10]\n"
+                                "ldr q9, [%[outptr2], #0x10]\n"
+                                "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
+                                "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+                                "ldr q5, [%[inptr], #0x70]\n"
+                                "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr2], #0x10]\n"
+                                "ldr q10, [%[outptr3], #0x10]\n"
+                                "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
+                                "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+                                "ldr q6, [%[inptr], #0xa0]\n"
+                                "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr3], #0x10]\n"
+                                "ldr q11, [%[outptr4], #0x10]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+                                "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+                                "ldr q7, [%[inptr], #0xd0]\n"
+                                "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr4], #0x10]\n"
+                                "ldr q8, [%[outptr5], #0x10]\n"
+                                "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
+                                "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+                                "ldr q4, [%[inptr], #0x100]\n"
+                                "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr5], #0x10]\n"
+                                "ldr q9, [%[outptr6], #0x10]\n"
+                                "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+                                "ldr q5, [%[inptr], #0x130]\n"
+                                "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr6], #0x10]\n"
+                                "ldr q10, [%[outptr0], #0x20]\n"
+                                "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+                                "ldr q6, [%[inptr], #0x20]\n"
+                                "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr0], #0x20]\n"
+                                "ldr q11, [%[outptr1], #0x20]\n"
+                                "add %[outptr0], %[outptr0], #0x30\n"
+                                "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+                                "ldr q7, [%[inptr], #0x50]\n"
+                                "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr1], #0x20]\n"
+                                "ldr q8, [%[outptr2], #0x20]\n"
+                                "add %[outptr1], %[outptr1], #0x30\n"
+                                "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+                                "ldr q4, [%[inptr], #0x80]\n"
+                                "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr2], #0x20]\n"
+                                "ldr q9, [%[outptr3], #0x20]\n"
+                                "add %[outptr2], %[outptr2], #0x30\n"
+                                "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+                                "ldr q5, [%[inptr], #0xb0]\n"
+                                "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr3], #0x20]\n"
+                                "ldr q10, [%[outptr4], #0x20]\n"
+                                "add %[outptr3], %[outptr3], #0x30\n"
+                                "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+                                "ldr q6, [%[inptr], #0xe0]\n"
+                                "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr4], #0x20]\n"
+                                "ldr q11, [%[outptr5], #0x20]\n"
+                                "add %[outptr4], %[outptr4], #0x30\n"
+                                "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+                                "ldr q7, [%[inptr], #0x110]\n"
+                                "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr5], #0x20]\n"
+                                "ldr q8, [%[outptr6], #0x20]\n"
+                                "add %[outptr5], %[outptr5], #0x30\n"
+                                "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+                                "ldr q4, [%[inptr], #0x140]\n"
+                                "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr6], #0x20]\n"
+                                "add %[outptr6], %[outptr6], #0x30\n"
+                                "add %[inptr], %[inptr], #0x180\n"
+                            : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                              [inptr] "+r" (inptr)
+                            : [alpha] "w" (alpha), [beta] "w" (beta)
+                            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+                            );
+                        }
+                    }
+                    break;
+
+                default:
+                case 8:
+                    {
+                        if ((i+11) >= xmax)
+                        {
+                            for (int xi=0; xi<12; xi++)
+                            {
+                                if ((i+xi) < xmax)
+                                {
+                                    *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+                                    outptr0++;
+                                    *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+                                    outptr1++;
+                                    *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
+                                    outptr2++;
+                                    *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
+                                    outptr3++;
+                                    *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta);
+                                    outptr4++;
+                                    *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta);
+                                    outptr5++;
+                                    *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta);
+                                    outptr6++;
+                                    *outptr7 = (alpha * inptr[xi + 84]) + (*outptr7 * beta);
+                                    outptr7++;
+                                }
+                            }
+                            inptr += 96;
+                        } else {
+                            /* Optimized routine to copy an entire block */
+                            __asm __volatile (
+                                "ldr q8, [%[outptr0]]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                                "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+                                "ldr q4, [%[inptr]]\n"
+                                "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr0]]\n"
+                                "ldr q9, [%[outptr1]]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+                                "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+                                "ldr q5, [%[inptr], #0x30]\n"
+                                "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr1]]\n"
+                                "ldr q10, [%[outptr2]]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+                                "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+                                "ldr q6, [%[inptr], #0x60]\n"
+                                "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr2]]\n"
+                                "ldr q11, [%[outptr3]]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+                                "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+                                "ldr q7, [%[inptr], #0x90]\n"
+                                "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr3]]\n"
+                                "ldr q8, [%[outptr4]]\n"
+                                "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+                                "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+                                "ldr q4, [%[inptr], #0xc0]\n"
+                                "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr4]]\n"
+                                "ldr q9, [%[outptr5]]\n"
+                                "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
+                                "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+                                "ldr q5, [%[inptr], #0xf0]\n"
+                                "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr5]]\n"
+                                "ldr q10, [%[outptr6]]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+                                "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+                                "ldr q6, [%[inptr], #0x120]\n"
+                                "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr6]]\n"
+                                "ldr q11, [%[outptr7]]\n"
+                                "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
+                                "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+                                "ldr q7, [%[inptr], #0x150]\n"
+                                "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr7]]\n"
+                                "ldr q8, [%[outptr0], #0x10]\n"
+                                "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
+                                "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+                                "ldr q4, [%[inptr], #0x10]\n"
+                                "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr0], #0x10]\n"
+                                "ldr q9, [%[outptr1], #0x10]\n"
+                                "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
+                                "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+                                "ldr q5, [%[inptr], #0x40]\n"
+                                "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr1], #0x10]\n"
+                                "ldr q10, [%[outptr2], #0x10]\n"
+                                "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
+                                "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+                                "ldr q6, [%[inptr], #0x70]\n"
+                                "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr2], #0x10]\n"
+                                "ldr q11, [%[outptr3], #0x10]\n"
+                                "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+                                "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+                                "ldr q7, [%[inptr], #0xa0]\n"
+                                "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr3], #0x10]\n"
+                                "ldr q8, [%[outptr4], #0x10]\n"
+                                "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
+                                "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+                                "ldr q4, [%[inptr], #0xd0]\n"
+                                "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr4], #0x10]\n"
+                                "ldr q9, [%[outptr5], #0x10]\n"
+                                "prfm PLDL1KEEP, [%[outptr7], #0x60]\n"
+                                "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+                                "ldr q5, [%[inptr], #0x100]\n"
+                                "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr5], #0x10]\n"
+                                "ldr q10, [%[outptr6], #0x10]\n"
+                                "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+                                "ldr q6, [%[inptr], #0x130]\n"
+                                "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr6], #0x10]\n"
+                                "ldr q11, [%[outptr7], #0x10]\n"
+                                "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+                                "ldr q7, [%[inptr], #0x160]\n"
+                                "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr7], #0x10]\n"
+                                "ldr q8, [%[outptr0], #0x20]\n"
+                                "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+                                "ldr q4, [%[inptr], #0x20]\n"
+                                "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr0], #0x20]\n"
+                                "ldr q9, [%[outptr1], #0x20]\n"
+                                "add %[outptr0], %[outptr0], #0x30\n"
+                                "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+                                "ldr q5, [%[inptr], #0x50]\n"
+                                "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr1], #0x20]\n"
+                                "ldr q10, [%[outptr2], #0x20]\n"
+                                "add %[outptr1], %[outptr1], #0x30\n"
+                                "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+                                "ldr q6, [%[inptr], #0x80]\n"
+                                "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr2], #0x20]\n"
+                                "ldr q11, [%[outptr3], #0x20]\n"
+                                "add %[outptr2], %[outptr2], #0x30\n"
+                                "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+                                "ldr q7, [%[inptr], #0xb0]\n"
+                                "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr3], #0x20]\n"
+                                "ldr q8, [%[outptr4], #0x20]\n"
+                                "add %[outptr3], %[outptr3], #0x30\n"
+                                "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+                                "ldr q4, [%[inptr], #0xe0]\n"
+                                "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+                                "str q8, [%[outptr4], #0x20]\n"
+                                "ldr q9, [%[outptr5], #0x20]\n"
+                                "add %[outptr4], %[outptr4], #0x30\n"
+                                "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+                                "ldr q5, [%[inptr], #0x110]\n"
+                                "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+                                "str q9, [%[outptr5], #0x20]\n"
+                                "ldr q10, [%[outptr6], #0x20]\n"
+                                "add %[outptr5], %[outptr5], #0x30\n"
+                                "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+                                "ldr q6, [%[inptr], #0x140]\n"
+                                "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+                                "str q10, [%[outptr6], #0x20]\n"
+                                "ldr q11, [%[outptr7], #0x20]\n"
+                                "add %[outptr6], %[outptr6], #0x30\n"
+                                "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+                                "ldr q7, [%[inptr], #0x170]\n"
+                                "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+                                "str q11, [%[outptr7], #0x20]\n"
+                                "add %[outptr7], %[outptr7], #0x30\n"
+                                "add %[inptr], %[inptr], #0x180\n"
+                            : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                              [inptr] "+r" (inptr)
+                            : [alpha] "w" (alpha), [beta] "w" (beta)
+                            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+                            );
+                        }
+                    }
+                    break;
+
+
+                }
+            }
+        }
+    }
+}
+
+#endif // __aarch64__
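For reference, the NEON blocks and the scalar tail paths above perform the same per-element update: each output element becomes alpha times the packed accumulator value plus beta times the existing output, over blocks of 12 columns by up to 8 rows (96 floats, matching the 0x180-byte stride advanced at the end of each block). A minimal sketch of that update in plain C++ (the helper name and signature are illustrative only, not part of the patch):

// Reference form of the 12x8 merge step above (hypothetical helper).
static inline void merge_block_ref(float *const *outptrs, // up to 8 output row pointers
                                   const float *inptr,    // packed block: 8 rows of 12 floats
                                   int active_rows,       // rows present in this block (<= 8)
                                   int active_cols,       // columns present in this block (<= 12)
                                   float alpha, float beta)
{
    for (int r = 0; r < active_rows; r++) {
        for (int c = 0; c < active_cols; c++) {
            // Same arithmetic as the scalar tail: out = alpha * acc + beta * out.
            outptrs[r][c] = alpha * inptr[r * 12 + c] + beta * outptrs[r][c];
        }
    }
}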
diff --git a/src/core/NEON/kernels/arm_gemm/ndrange.hpp b/src/core/NEON/kernels/arm_gemm/ndrange.hpp
new file mode 100644
index 0000000..20824df
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/ndrange.hpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <algorithm>
+#include <initializer_list>
+
+namespace arm_gemm {
+
+template<unsigned int D>
+class NDRange {
+private:
+    unsigned int m_sizes[D];       // extent of each dimension
+    unsigned int m_totalsizes[D];  // cumulative products of the extents
+
+    class NDRangeIterator {
+    private:
+        const NDRange &m_parent;
+        unsigned int m_pos = 0;
+        unsigned int m_end = 0;
+
+    public:
+        NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e) : m_parent(p), m_pos(s), m_end(e) { }
+
+        // True once the iterator has consumed its assigned [start, end) range.
+        bool done() const {
+            return (m_pos >= m_end);
+        }
+
+        // Coordinate of the current linear position along dimension d,
+        // recovered from the cumulative sizes held by the parent NDRange.
+        unsigned int dim(unsigned int d) const {
+            unsigned int r = m_pos;
+
+            if (d < (D - 1)) {
+                r %= m_parent.m_totalsizes[d];
+            }
+
+            if (d > 0) {
+                r /= m_parent.m_totalsizes[d-1];
+            }
+
+            return r;
+        }
+
+        // Advance one step along dimension 0.
+        bool next_dim0() {
+            m_pos++;
+
+            return !done();
+        }
+
+        // Skip the remainder of the current dimension-0 row, landing on the
+        // first position of the next row.
+        bool next_dim1() {
+            m_pos += m_parent.m_sizes[0] - dim(0);
+
+            return !done();
+        }
+
+        // Exclusive upper bound on the dimension-0 coordinate for the current
+        // run, limited by both the row length and the end of the range.
+        unsigned int dim0_max() const {
+            unsigned int offset = std::min(m_end - m_pos, m_parent.m_sizes[0] - dim(0));
+
+            return dim(0) + offset;
+        }
+    };
+
+public:
+    template <typename... T>
+    NDRange(T... ts) : m_sizes{ts...} {
+        unsigned int t=1;
+
+        for (unsigned int i=0; i<D; i++) {
+            t *= m_sizes[i];
+
+            m_totalsizes[i] = t;
+        }
+    }
+
+    NDRangeIterator iterator(unsigned int start, unsigned int end) const {
+        return NDRangeIterator(*this, start, end);
+    }
+
+    unsigned int total_size() const {
+        return m_totalsizes[D - 1];
+    }
+
+    unsigned int get_size(unsigned int v) const {
+        return m_sizes[v];
+    }
+};
+
+} // namespace arm_gemm
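As a usage illustration for the header above (the function, sizes, and worker split are made-up examples, not from the patch), an NDRange flattens a multi-dimensional space so a linear sub-range can be handed to a worker and walked row by row, recovering coordinates on demand:

// Illustrative only: walk one worker's share of a 3-D NDRange.
#include <cstdio>
#include "ndrange.hpp"

void walk_slice_example()
{
    arm_gemm::NDRange<3> range(12u, 4u, 2u);   // dimension-0, -1, -2 extents

    unsigned int total = range.total_size();   // 12 * 4 * 2 = 96 positions
    unsigned int start = 0;
    unsigned int end   = total / 2;            // this worker's share of the flattened space

    auto it = range.iterator(start, end);
    while (!it.done()) {
        // Coordinates of the current row in dimensions 1 and 2.
        unsigned int x1 = it.dim(1);
        unsigned int x2 = it.dim(2);

        // dim0_max() caps the run at whichever comes first: the end of the
        // row or the end of the assigned [start, end) range.
        printf("slice (%u, %u): dim0 %u..%u\n", x2, x1, it.dim(0), it.dim0_max() - 1);

        it.next_dim1();                        // jump to the start of the next row
    }
}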
diff --git a/src/core/NEON/kernels/arm_gemm/transform.hpp b/src/core/NEON/kernels/arm_gemm/transform.hpp
index e422b91..0330783 100644
--- a/src/core/NEON/kernels/arm_gemm/transform.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transform.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -81,11 +81,14 @@
                     }
                 }
                 // "row" tail - row is out of range so fill with zeros always.
-                for (int row = 0; row < blank_rows; row++) {
-                    for (int col=0; col < (fill_cols + blank_cols); col++) {
-                        *out++ = static_cast<TOut>(0);
-                    }
+                TOut zeroval = static_cast<TOut>(0);
+                int pads = blank_rows * (fill_cols + blank_cols);
+
+                for (int i=0; i<pads; i++) {
+                    out[i] = zeroval;
                 }
+
+                out += pads;
             }
         }
     }
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
index 347eafb..0648ff6 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,7 +23,7 @@
  */
 #pragma once
 
-#ifdef __aarch64__
+#if defined(__aarch64__) && !defined(__ARM_FEATURE_SVE)
 
 #include <arm_neon.h>
 
@@ -173,4 +173,4 @@
     }
 }
 
-#endif  // __aarch64__
+#endif  // __aarch64__ && !__ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
index fc1f2c2..e1ebba0 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,17 +23,14 @@
  */
 #include "a32_interleave_6way_32bit.hpp"
 #include "a32_transpose_interleave_8way_32bit.hpp"
-#ifdef __ARM_FEATURE_SVE
-#include "sve_interleave_8way_32bit.hpp"
-#include "sve_interleave_8way_block2_32bit.hpp"
-#include "sve_interleave_8way_block4_8bit.hpp"
-#else
-#include "a64_interleave_8way_32bit.hpp"
-#endif
 #include "a64_block16_interleave4_8bit.hpp"
 #include "a64_interleave_8way_16bit.hpp"
+#include "a64_interleave_8way_32bit.hpp"
 #include "a64_interleave_8way_half_to_float.hpp"
 #include "a64_transpose_interleave_12way_16bit.hpp"
 #include "a64_transpose_interleave_12way_half_to_float.hpp"
 #include "a64_transpose_interleave_24way_16bit.hpp"
-#include "transpose_interleave_common.hpp"
+#include "sve_interleave_8way_32bit.hpp"
+#include "sve_interleave_8way_block2_32bit.hpp"
+#include "sve_interleave_8way_block4_8bit.hpp"
+#include "transpose_interleave_common.hpp"
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp
index 752e837..07c8219 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -41,7 +41,7 @@
         long outpos = 0;
 
         uint32_t *outptr = master_outptr;
-        master_outptr += outwidth;
+        master_outptr += (outwidth * 1);
 
         const uint32_t *inptr0 = inptr + y * ldin + k0;
         const uint32_t *inptr1 = inptr0 + ldin;
@@ -60,52 +60,53 @@
                     "whilelt p0.s, %[inpos], %[inwidth]\n"
                     "b.none 2f\n"
                     "mov z4.s, #0\n"
-                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
+                    "ld1w z0.s, p0/z, [%[inptr0]]\n"
+                    "zip1 z8.s, z0.s, z4.s\n"
                     "incw %[inpos], all, mul #1\n"
-                    "whilelt p0.s, %[outpos], %[outwidth]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
                     "zip2 z9.s, z0.s, z4.s\n"
-                    "whilelt p1.s, %[outpos], %[outwidth]\n"
-                    "incw %[outpos], all, mul #1\n"
+                    "addvl %[inptr0], %[inptr0], #1\n"
                     "zip1 z0.s, z8.s, z4.s\n"
+                    "whilelt p0.s, %[outpos], %[outwidth]\n"
                     "zip2 z1.s, z8.s, z4.s\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip1 z2.s, z9.s, z4.s\n"
+                    "whilelt p1.s, %[outpos], %[outwidth]\n"
                     "zip2 z3.s, z9.s, z4.s\n"
-                    "whilelt p2.s, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip1 z8.s, z0.s, z4.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "zip1 z10.s, z1.s, z4.s\n"
-                    "zip2 z11.s, z1.s, z4.s\n"
                     "st1w z8.s, p0, [%[outptr]]\n"
-                    "zip1 z12.s, z2.s, z4.s\n"
-                    "whilelt p3.s, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z4.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z14.s, z3.s, z4.s\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
                     "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip2 z15.s, z3.s, z4.s\n"
-                    "whilelt p4.s, %[outpos], %[outwidth]\n"
+                    "zip1 z10.s, z1.s, z4.s\n"
+                    "whilelt p2.s, %[outpos], %[outwidth]\n"
+                    "zip2 z11.s, z1.s, z4.s\n"
                     "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
+                    "zip1 z12.s, z2.s, z4.s\n"
                     "incw %[outpos], all, mul #1\n"
+                    "zip2 z13.s, z2.s, z4.s\n"
+                    "whilelt p3.s, %[outpos], %[outwidth]\n"
+                    "zip1 z14.s, z3.s, z4.s\n"
                     "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
-                    "whilelt p5.s, %[outpos], %[outwidth]\n"
+                    "zip2 z15.s, z3.s, z4.s\n"
                     "incw %[outpos], all, mul #1\n"
-                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p6.s, %[outpos], %[outwidth]\n"
+                    "whilelt p0.s, %[outpos], %[outwidth]\n"
+                    "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n"
                     "incw %[outpos], all, mul #1\n"
-                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
-                    "whilelt p7.s, %[outpos], %[outwidth]\n"
+                    "whilelt p1.s, %[outpos], %[outwidth]\n"
+                    "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n"
                     "incw %[outpos], all, mul #1\n"
-                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
+                    "whilelt p2.s, %[outpos], %[outwidth]\n"
+                    "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "whilelt p3.s, %[outpos], %[outwidth]\n"
+                    "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
                     "addvl %[outptr], %[outptr], #8\n"
                     "b 1b\n"
                     "2:\n"
                 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0)
                 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
                 );
                 break;
 
@@ -115,60 +116,62 @@
                     "whilelt p0.s, %[inpos], %[inwidth]\n"
                     "b.none 2f\n"
                     "mov z4.s, #0\n"
-                    "mov z14.s, #0\n"
-                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
-                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
-                    "incw %[inpos], all, mul #1\n"
-                    "whilelt p0.s, %[outpos], %[outwidth]\n"
-                    "incw %[outpos], all, mul #1\n"
+                    "ld1w z0.s, p0/z, [%[inptr0]]\n"
                     "zip1 z8.s, z0.s, z4.s\n"
+                    "ld1w z1.s, p0/z, [%[inptr1]]\n"
                     "zip2 z9.s, z0.s, z4.s\n"
+                    "incw %[inpos], all, mul #1\n"
                     "zip1 z10.s, z1.s, z4.s\n"
+                    "addvl %[inptr0], %[inptr0], #1\n"
                     "zip2 z11.s, z1.s, z4.s\n"
-                    "whilelt p1.s, %[outpos], %[outwidth]\n"
+                    "addvl %[inptr1], %[inptr1], #1\n"
                     "zip1 z0.s, z8.s, z4.s\n"
-                    "incw %[outpos], all, mul #1\n"
+                    "whilelt p0.s, %[outpos], %[outwidth]\n"
                     "zip2 z1.s, z8.s, z4.s\n"
-                    "zip1 z2.s, z9.s, z4.s\n"
-                    "zip2 z3.s, z9.s, z4.s\n"
-                    "zip1 z4.s, z10.s, z14.s\n"
-                    "whilelt p2.s, %[outpos], %[outwidth]\n"
-                    "zip2 z5.s, z10.s, z14.s\n"
                     "incw %[outpos], all, mul #1\n"
+                    "zip1 z2.s, z9.s, z4.s\n"
+                    "whilelt p1.s, %[outpos], %[outwidth]\n"
+                    "zip2 z3.s, z9.s, z4.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "mov z14.s, #0\n"
+                    "whilelt p2.s, %[outpos], %[outwidth]\n"
+                    "zip1 z4.s, z10.s, z14.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip2 z5.s, z10.s, z14.s\n"
+                    "whilelt p3.s, %[outpos], %[outwidth]\n"
                     "zip1 z6.s, z11.s, z14.s\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip2 z7.s, z11.s, z14.s\n"
                     "zip1 z8.s, z0.s, z4.s\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "whilelt p3.s, %[outpos], %[outwidth]\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
                     "st1w z8.s, p0, [%[outptr]]\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "zip1 z14.s, z3.s, z7.s\n"
-                    "whilelt p4.s, %[outpos], %[outwidth]\n"
-                    "zip2 z15.s, z3.s, z7.s\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
                     "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
-                    "incw %[outpos], all, mul #1\n"
+                    "zip1 z10.s, z1.s, z5.s\n"
                     "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
-                    "whilelt p5.s, %[outpos], %[outwidth]\n"
-                    "incw %[outpos], all, mul #1\n"
+                    "zip2 z11.s, z1.s, z5.s\n"
                     "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
-                    "whilelt p6.s, %[outpos], %[outwidth]\n"
+                    "zip1 z12.s, z2.s, z6.s\n"
+                    "whilelt p0.s, %[outpos], %[outwidth]\n"
+                    "zip2 z13.s, z2.s, z6.s\n"
+                    "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n"
+                    "zip1 z14.s, z3.s, z7.s\n"
                     "incw %[outpos], all, mul #1\n"
-                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.s, %[outpos], %[outwidth]\n"
+                    "zip2 z15.s, z3.s, z7.s\n"
+                    "whilelt p1.s, %[outpos], %[outwidth]\n"
+                    "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n"
                     "incw %[outpos], all, mul #1\n"
-                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
+                    "whilelt p2.s, %[outpos], %[outwidth]\n"
+                    "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "whilelt p3.s, %[outpos], %[outwidth]\n"
+                    "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
                     "addvl %[outptr], %[outptr], #8\n"
                     "b 1b\n"
                     "2:\n"
                 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1)
                 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
                 );
                 break;
 
@@ -178,63 +181,66 @@
                     "whilelt p0.s, %[inpos], %[inwidth]\n"
                     "b.none 2f\n"
                     "mov z4.s, #0\n"
-                    "mov z14.s, #0\n"
-                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
-                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
-                    "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
-                    "incw %[inpos], all, mul #1\n"
-                    "whilelt p0.s, %[outpos], %[outwidth]\n"
+                    "ld1w z0.s, p0/z, [%[inptr0]]\n"
                     "zip1 z8.s, z0.s, z4.s\n"
-                    "incw %[outpos], all, mul #1\n"
+                    "ld1w z1.s, p0/z, [%[inptr1]]\n"
                     "zip2 z9.s, z0.s, z4.s\n"
+                    "ld1w z2.s, p0/z, [%[inptr2]]\n"
                     "zip1 z10.s, z1.s, z4.s\n"
+                    "incw %[inpos], all, mul #1\n"
                     "zip2 z11.s, z1.s, z4.s\n"
+                    "addvl %[inptr0], %[inptr0], #1\n"
                     "zip1 z12.s, z2.s, z4.s\n"
-                    "whilelt p1.s, %[outpos], %[outwidth]\n"
+                    "addvl %[inptr1], %[inptr1], #1\n"
                     "zip2 z13.s, z2.s, z4.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z4.s, z10.s, z14.s\n"
+                    "addvl %[inptr2], %[inptr2], #1\n"
                     "zip1 z0.s, z8.s, z12.s\n"
+                    "whilelt p0.s, %[outpos], %[outwidth]\n"
                     "zip2 z1.s, z8.s, z12.s\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip1 z2.s, z9.s, z13.s\n"
-                    "whilelt p2.s, %[outpos], %[outwidth]\n"
+                    "whilelt p1.s, %[outpos], %[outwidth]\n"
                     "zip2 z3.s, z9.s, z13.s\n"
                     "incw %[outpos], all, mul #1\n"
+                    "mov z14.s, #0\n"
+                    "whilelt p2.s, %[outpos], %[outwidth]\n"
+                    "zip1 z4.s, z10.s, z14.s\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip2 z5.s, z10.s, z14.s\n"
+                    "whilelt p3.s, %[outpos], %[outwidth]\n"
                     "zip1 z6.s, z11.s, z14.s\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip2 z7.s, z11.s, z14.s\n"
                     "zip1 z8.s, z0.s, z4.s\n"
-                    "whilelt p3.s, %[outpos], %[outwidth]\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
                     "st1w z8.s, p0, [%[outptr]]\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "whilelt p4.s, %[outpos], %[outwidth]\n"
-                    "zip1 z14.s, z3.s, z7.s\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
                     "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip2 z15.s, z3.s, z7.s\n"
-                    "incw %[outpos], all, mul #1\n"
+                    "zip1 z10.s, z1.s, z5.s\n"
                     "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
-                    "whilelt p5.s, %[outpos], %[outwidth]\n"
-                    "incw %[outpos], all, mul #1\n"
+                    "zip2 z11.s, z1.s, z5.s\n"
                     "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
-                    "whilelt p6.s, %[outpos], %[outwidth]\n"
+                    "zip1 z12.s, z2.s, z6.s\n"
+                    "whilelt p0.s, %[outpos], %[outwidth]\n"
+                    "zip2 z13.s, z2.s, z6.s\n"
+                    "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n"
+                    "zip1 z14.s, z3.s, z7.s\n"
                     "incw %[outpos], all, mul #1\n"
-                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.s, %[outpos], %[outwidth]\n"
-                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
+                    "zip2 z15.s, z3.s, z7.s\n"
+                    "whilelt p1.s, %[outpos], %[outwidth]\n"
+                    "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n"
                     "incw %[outpos], all, mul #1\n"
-                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
+                    "whilelt p2.s, %[outpos], %[outwidth]\n"
+                    "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "whilelt p3.s, %[outpos], %[outwidth]\n"
+                    "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
                     "addvl %[outptr], %[outptr], #8\n"
                     "b 1b\n"
                     "2:\n"
                 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2)
                 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
                 );
                 break;
 
@@ -244,65 +250,69 @@
                     "whilelt p0.s, %[inpos], %[inwidth]\n"
                     "b.none 2f\n"
                     "mov z4.s, #0\n"
-                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
-                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
-                    "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
-                    "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
-                    "incw %[inpos], all, mul #1\n"
+                    "ld1w z0.s, p0/z, [%[inptr0]]\n"
                     "zip1 z8.s, z0.s, z4.s\n"
-                    "whilelt p0.s, %[outpos], %[outwidth]\n"
+                    "ld1w z1.s, p0/z, [%[inptr1]]\n"
                     "zip2 z9.s, z0.s, z4.s\n"
-                    "incw %[outpos], all, mul #1\n"
+                    "ld1w z2.s, p0/z, [%[inptr2]]\n"
                     "zip1 z10.s, z1.s, z4.s\n"
+                    "ld1w z3.s, p0/z, [%[inptr3]]\n"
                     "zip2 z11.s, z1.s, z4.s\n"
+                    "incw %[inpos], all, mul #1\n"
                     "zip1 z12.s, z2.s, z4.s\n"
+                    "addvl %[inptr0], %[inptr0], #1\n"
                     "zip2 z13.s, z2.s, z4.s\n"
-                    "whilelt p1.s, %[outpos], %[outwidth]\n"
+                    "addvl %[inptr1], %[inptr1], #1\n"
                     "zip1 z14.s, z3.s, z4.s\n"
-                    "incw %[outpos], all, mul #1\n"
+                    "addvl %[inptr2], %[inptr2], #1\n"
                     "zip2 z15.s, z3.s, z4.s\n"
+                    "addvl %[inptr3], %[inptr3], #1\n"
                     "zip1 z0.s, z8.s, z12.s\n"
+                    "whilelt p0.s, %[outpos], %[outwidth]\n"
                     "zip2 z1.s, z8.s, z12.s\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip1 z2.s, z9.s, z13.s\n"
-                    "whilelt p2.s, %[outpos], %[outwidth]\n"
+                    "whilelt p1.s, %[outpos], %[outwidth]\n"
                     "zip2 z3.s, z9.s, z13.s\n"
                     "incw %[outpos], all, mul #1\n"
                     "zip1 z4.s, z10.s, z14.s\n"
+                    "whilelt p2.s, %[outpos], %[outwidth]\n"
                     "zip2 z5.s, z10.s, z14.s\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip1 z6.s, z11.s, z15.s\n"
-                    "zip2 z7.s, z11.s, z15.s\n"
                     "whilelt p3.s, %[outpos], %[outwidth]\n"
+                    "zip2 z7.s, z11.s, z15.s\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip1 z8.s, z0.s, z4.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
                     "st1w z8.s, p0, [%[outptr]]\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "whilelt p4.s, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z14.s, z3.s, z7.s\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
                     "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip2 z15.s, z3.s, z7.s\n"
-                    "whilelt p5.s, %[outpos], %[outwidth]\n"
+                    "zip1 z10.s, z1.s, z5.s\n"
                     "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
-                    "incw %[outpos], all, mul #1\n"
+                    "zip2 z11.s, z1.s, z5.s\n"
                     "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
-                    "whilelt p6.s, %[outpos], %[outwidth]\n"
+                    "zip1 z12.s, z2.s, z6.s\n"
+                    "whilelt p0.s, %[outpos], %[outwidth]\n"
+                    "zip2 z13.s, z2.s, z6.s\n"
+                    "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n"
+                    "zip1 z14.s, z3.s, z7.s\n"
                     "incw %[outpos], all, mul #1\n"
-                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.s, %[outpos], %[outwidth]\n"
+                    "zip2 z15.s, z3.s, z7.s\n"
+                    "whilelt p1.s, %[outpos], %[outwidth]\n"
+                    "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n"
                     "incw %[outpos], all, mul #1\n"
-                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
+                    "whilelt p2.s, %[outpos], %[outwidth]\n"
+                    "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "whilelt p3.s, %[outpos], %[outwidth]\n"
+                    "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
                     "addvl %[outptr], %[outptr], #8\n"
                     "b 1b\n"
                     "2:\n"
                 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3)
                 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
                 );
                 break;
 
@@ -312,66 +322,71 @@
                     "whilelt p0.s, %[inpos], %[inwidth]\n"
                     "b.none 2f\n"
                     "mov z5.s, #0\n"
-                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
-                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
-                    "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
-                    "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
-                    "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
+                    "ld1w z0.s, p0/z, [%[inptr0]]\n"
+                    "ld1w z1.s, p0/z, [%[inptr1]]\n"
                     "incw %[inpos], all, mul #1\n"
                     "zip1 z10.s, z1.s, z5.s\n"
-                    "whilelt p0.s, %[outpos], %[outwidth]\n"
+                    "ld1w z2.s, p0/z, [%[inptr2]]\n"
                     "zip2 z11.s, z1.s, z5.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
+                    "ld1w z3.s, p0/z, [%[inptr3]]\n"
                     "zip1 z12.s, z2.s, z5.s\n"
+                    "ld1w z4.s, p0/z, [%[inptr4]]\n"
+                    "zip1 z8.s, z0.s, z4.s\n"
+                    "addvl %[inptr0], %[inptr0], #1\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
+                    "addvl %[inptr1], %[inptr1], #1\n"
                     "zip2 z13.s, z2.s, z5.s\n"
-                    "whilelt p1.s, %[outpos], %[outwidth]\n"
+                    "addvl %[inptr2], %[inptr2], #1\n"
                     "zip1 z14.s, z3.s, z5.s\n"
-                    "incw %[outpos], all, mul #1\n"
+                    "addvl %[inptr3], %[inptr3], #1\n"
                     "zip2 z15.s, z3.s, z5.s\n"
+                    "addvl %[inptr4], %[inptr4], #1\n"
                     "zip1 z0.s, z8.s, z12.s\n"
+                    "whilelt p0.s, %[outpos], %[outwidth]\n"
                     "zip2 z1.s, z8.s, z12.s\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip1 z2.s, z9.s, z13.s\n"
-                    "whilelt p2.s, %[outpos], %[outwidth]\n"
+                    "whilelt p1.s, %[outpos], %[outwidth]\n"
                     "zip2 z3.s, z9.s, z13.s\n"
                     "incw %[outpos], all, mul #1\n"
                     "zip1 z4.s, z10.s, z14.s\n"
+                    "whilelt p2.s, %[outpos], %[outwidth]\n"
                     "zip2 z5.s, z10.s, z14.s\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip1 z6.s, z11.s, z15.s\n"
-                    "zip2 z7.s, z11.s, z15.s\n"
                     "whilelt p3.s, %[outpos], %[outwidth]\n"
+                    "zip2 z7.s, z11.s, z15.s\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip1 z8.s, z0.s, z4.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
                     "st1w z8.s, p0, [%[outptr]]\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "whilelt p4.s, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z14.s, z3.s, z7.s\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
                     "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip2 z15.s, z3.s, z7.s\n"
-                    "whilelt p5.s, %[outpos], %[outwidth]\n"
+                    "zip1 z10.s, z1.s, z5.s\n"
                     "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
-                    "incw %[outpos], all, mul #1\n"
+                    "zip2 z11.s, z1.s, z5.s\n"
                     "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
-                    "whilelt p6.s, %[outpos], %[outwidth]\n"
+                    "zip1 z12.s, z2.s, z6.s\n"
+                    "whilelt p0.s, %[outpos], %[outwidth]\n"
+                    "zip2 z13.s, z2.s, z6.s\n"
+                    "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n"
+                    "zip1 z14.s, z3.s, z7.s\n"
                     "incw %[outpos], all, mul #1\n"
-                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.s, %[outpos], %[outwidth]\n"
+                    "zip2 z15.s, z3.s, z7.s\n"
+                    "whilelt p1.s, %[outpos], %[outwidth]\n"
+                    "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n"
                     "incw %[outpos], all, mul #1\n"
-                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
+                    "whilelt p2.s, %[outpos], %[outwidth]\n"
+                    "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "whilelt p3.s, %[outpos], %[outwidth]\n"
+                    "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
                     "addvl %[outptr], %[outptr], #8\n"
                     "b 1b\n"
                     "2:\n"
                 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4)
                 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
                 );
                 break;
 
@@ -381,67 +396,73 @@
                     "whilelt p0.s, %[inpos], %[inwidth]\n"
                     "b.none 2f\n"
                     "mov z6.s, #0\n"
-                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
-                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
-                    "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
-                    "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
-                    "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
-                    "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
+                    "ld1w z0.s, p0/z, [%[inptr0]]\n"
+                    "ld1w z1.s, p0/z, [%[inptr1]]\n"
                     "incw %[inpos], all, mul #1\n"
+                    "ld1w z2.s, p0/z, [%[inptr2]]\n"
+                    "addvl %[inptr0], %[inptr0], #1\n"
                     "zip1 z12.s, z2.s, z6.s\n"
-                    "whilelt p0.s, %[outpos], %[outwidth]\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
+                    "ld1w z3.s, p0/z, [%[inptr3]]\n"
                     "zip2 z13.s, z2.s, z6.s\n"
-                    "whilelt p1.s, %[outpos], %[outwidth]\n"
+                    "ld1w z4.s, p0/z, [%[inptr4]]\n"
+                    "zip1 z8.s, z0.s, z4.s\n"
+                    "ld1w z5.s, p0/z, [%[inptr5]]\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
+                    "addvl %[inptr1], %[inptr1], #1\n"
+                    "zip1 z10.s, z1.s, z5.s\n"
+                    "addvl %[inptr2], %[inptr2], #1\n"
+                    "zip2 z11.s, z1.s, z5.s\n"
+                    "addvl %[inptr3], %[inptr3], #1\n"
                     "zip1 z14.s, z3.s, z6.s\n"
-                    "incw %[outpos], all, mul #1\n"
+                    "addvl %[inptr4], %[inptr4], #1\n"
                     "zip2 z15.s, z3.s, z6.s\n"
+                    "addvl %[inptr5], %[inptr5], #1\n"
                     "zip1 z0.s, z8.s, z12.s\n"
+                    "whilelt p0.s, %[outpos], %[outwidth]\n"
                     "zip2 z1.s, z8.s, z12.s\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip1 z2.s, z9.s, z13.s\n"
-                    "whilelt p2.s, %[outpos], %[outwidth]\n"
+                    "whilelt p1.s, %[outpos], %[outwidth]\n"
                     "zip2 z3.s, z9.s, z13.s\n"
                     "incw %[outpos], all, mul #1\n"
                     "zip1 z4.s, z10.s, z14.s\n"
+                    "whilelt p2.s, %[outpos], %[outwidth]\n"
                     "zip2 z5.s, z10.s, z14.s\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip1 z6.s, z11.s, z15.s\n"
-                    "zip2 z7.s, z11.s, z15.s\n"
                     "whilelt p3.s, %[outpos], %[outwidth]\n"
+                    "zip2 z7.s, z11.s, z15.s\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip1 z8.s, z0.s, z4.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
                     "st1w z8.s, p0, [%[outptr]]\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "whilelt p4.s, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z14.s, z3.s, z7.s\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
                     "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip2 z15.s, z3.s, z7.s\n"
-                    "whilelt p5.s, %[outpos], %[outwidth]\n"
+                    "zip1 z10.s, z1.s, z5.s\n"
                     "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
-                    "incw %[outpos], all, mul #1\n"
+                    "zip2 z11.s, z1.s, z5.s\n"
                     "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
-                    "whilelt p6.s, %[outpos], %[outwidth]\n"
+                    "zip1 z12.s, z2.s, z6.s\n"
+                    "whilelt p0.s, %[outpos], %[outwidth]\n"
+                    "zip2 z13.s, z2.s, z6.s\n"
+                    "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n"
+                    "zip1 z14.s, z3.s, z7.s\n"
                     "incw %[outpos], all, mul #1\n"
-                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.s, %[outpos], %[outwidth]\n"
+                    "zip2 z15.s, z3.s, z7.s\n"
+                    "whilelt p1.s, %[outpos], %[outwidth]\n"
+                    "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n"
                     "incw %[outpos], all, mul #1\n"
-                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
+                    "whilelt p2.s, %[outpos], %[outwidth]\n"
+                    "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "whilelt p3.s, %[outpos], %[outwidth]\n"
+                    "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
                     "addvl %[outptr], %[outptr], #8\n"
                     "b 1b\n"
                     "2:\n"
                 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5)
                 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
                 );
                 break;
 
@@ -451,68 +472,75 @@
                     "whilelt p0.s, %[inpos], %[inwidth]\n"
                     "b.none 2f\n"
                     "mov z7.s, #0\n"
-                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
-                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
-                    "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
-                    "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
-                    "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
-                    "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
-                    "ld1w z6.s, p0/z, [%[inptr6], %[inpos], LSL #2]\n"
+                    "ld1w z0.s, p0/z, [%[inptr0]]\n"
+                    "ld1w z1.s, p0/z, [%[inptr1]]\n"
                     "incw %[inpos], all, mul #1\n"
+                    "ld1w z2.s, p0/z, [%[inptr2]]\n"
+                    "addvl %[inptr0], %[inptr0], #1\n"
+                    "ld1w z3.s, p0/z, [%[inptr3]]\n"
+                    "addvl %[inptr1], %[inptr1], #1\n"
                     "zip1 z14.s, z3.s, z7.s\n"
-                    "whilelt p0.s, %[outpos], %[outwidth]\n"
+                    "ld1w z4.s, p0/z, [%[inptr4]]\n"
                     "zip1 z8.s, z0.s, z4.s\n"
-                    "incw %[outpos], all, mul #1\n"
+                    "ld1w z5.s, p0/z, [%[inptr5]]\n"
                     "zip2 z9.s, z0.s, z4.s\n"
+                    "ld1w z6.s, p0/z, [%[inptr6]]\n"
                     "zip1 z10.s, z1.s, z5.s\n"
+                    "addvl %[inptr2], %[inptr2], #1\n"
                     "zip2 z11.s, z1.s, z5.s\n"
+                    "addvl %[inptr3], %[inptr3], #1\n"
                     "zip1 z12.s, z2.s, z6.s\n"
-                    "whilelt p1.s, %[outpos], %[outwidth]\n"
+                    "addvl %[inptr4], %[inptr4], #1\n"
                     "zip2 z13.s, z2.s, z6.s\n"
-                    "incw %[outpos], all, mul #1\n"
+                    "addvl %[inptr5], %[inptr5], #1\n"
                     "zip2 z15.s, z3.s, z7.s\n"
+                    "addvl %[inptr6], %[inptr6], #1\n"
                     "zip1 z0.s, z8.s, z12.s\n"
+                    "whilelt p0.s, %[outpos], %[outwidth]\n"
                     "zip2 z1.s, z8.s, z12.s\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip1 z2.s, z9.s, z13.s\n"
-                    "whilelt p2.s, %[outpos], %[outwidth]\n"
+                    "whilelt p1.s, %[outpos], %[outwidth]\n"
                     "zip2 z3.s, z9.s, z13.s\n"
                     "incw %[outpos], all, mul #1\n"
                     "zip1 z4.s, z10.s, z14.s\n"
+                    "whilelt p2.s, %[outpos], %[outwidth]\n"
                     "zip2 z5.s, z10.s, z14.s\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip1 z6.s, z11.s, z15.s\n"
-                    "zip2 z7.s, z11.s, z15.s\n"
                     "whilelt p3.s, %[outpos], %[outwidth]\n"
+                    "zip2 z7.s, z11.s, z15.s\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip1 z8.s, z0.s, z4.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
                     "st1w z8.s, p0, [%[outptr]]\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "whilelt p4.s, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z14.s, z3.s, z7.s\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
                     "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip2 z15.s, z3.s, z7.s\n"
-                    "whilelt p5.s, %[outpos], %[outwidth]\n"
+                    "zip1 z10.s, z1.s, z5.s\n"
                     "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
-                    "incw %[outpos], all, mul #1\n"
+                    "zip2 z11.s, z1.s, z5.s\n"
                     "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
-                    "whilelt p6.s, %[outpos], %[outwidth]\n"
+                    "zip1 z12.s, z2.s, z6.s\n"
+                    "whilelt p0.s, %[outpos], %[outwidth]\n"
+                    "zip2 z13.s, z2.s, z6.s\n"
+                    "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n"
+                    "zip1 z14.s, z3.s, z7.s\n"
                     "incw %[outpos], all, mul #1\n"
-                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.s, %[outpos], %[outwidth]\n"
+                    "zip2 z15.s, z3.s, z7.s\n"
+                    "whilelt p1.s, %[outpos], %[outwidth]\n"
+                    "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n"
                     "incw %[outpos], all, mul #1\n"
-                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
+                    "whilelt p2.s, %[outpos], %[outwidth]\n"
+                    "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "whilelt p3.s, %[outpos], %[outwidth]\n"
+                    "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
                     "addvl %[outptr], %[outptr], #8\n"
                     "b 1b\n"
                     "2:\n"
                 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6)
                 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
                 );
                 break;
 
@@ -522,69 +550,77 @@
                     "1:\n"
                     "whilelt p0.s, %[inpos], %[inwidth]\n"
                     "b.none 2f\n"
-                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
-                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
-                    "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
-                    "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
-                    "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
-                    "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
-                    "ld1w z6.s, p0/z, [%[inptr6], %[inpos], LSL #2]\n"
-                    "ld1w z7.s, p0/z, [%[inptr7], %[inpos], LSL #2]\n"
+                    "ld1w z0.s, p0/z, [%[inptr0]]\n"
                     "incw %[inpos], all, mul #1\n"
+                    "ld1w z1.s, p0/z, [%[inptr1]]\n"
+                    "addvl %[inptr0], %[inptr0], #1\n"
+                    "ld1w z2.s, p0/z, [%[inptr2]]\n"
+                    "addvl %[inptr1], %[inptr1], #1\n"
+                    "ld1w z3.s, p0/z, [%[inptr3]]\n"
+                    "addvl %[inptr2], %[inptr2], #1\n"
+                    "ld1w z4.s, p0/z, [%[inptr4]]\n"
+                    "addvl %[inptr3], %[inptr3], #1\n"
                     "zip1 z8.s, z0.s, z4.s\n"
+                    "ld1w z5.s, p0/z, [%[inptr5]]\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
+                    "ld1w z6.s, p0/z, [%[inptr6]]\n"
+                    "zip1 z10.s, z1.s, z5.s\n"
+                    "ld1w z7.s, p0/z, [%[inptr7]]\n"
+                    "zip2 z11.s, z1.s, z5.s\n"
+                    "addvl %[inptr4], %[inptr4], #1\n"
+                    "zip1 z12.s, z2.s, z6.s\n"
+                    "addvl %[inptr5], %[inptr5], #1\n"
+                    "zip2 z13.s, z2.s, z6.s\n"
+                    "addvl %[inptr6], %[inptr6], #1\n"
+                    "zip1 z14.s, z3.s, z7.s\n"
+                    "addvl %[inptr7], %[inptr7], #1\n"
+                    "zip2 z15.s, z3.s, z7.s\n"
                     "whilelt p0.s, %[outpos], %[outwidth]\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "whilelt p1.s, %[outpos], %[outwidth]\n"
-                    "zip1 z14.s, z3.s, z7.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip2 z15.s, z3.s, z7.s\n"
                     "zip1 z0.s, z8.s, z12.s\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip2 z1.s, z8.s, z12.s\n"
+                    "whilelt p1.s, %[outpos], %[outwidth]\n"
                     "zip1 z2.s, z9.s, z13.s\n"
-                    "whilelt p2.s, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip2 z3.s, z9.s, z13.s\n"
-                    "incw %[outpos], all, mul #1\n"
+                    "whilelt p2.s, %[outpos], %[outwidth]\n"
                     "zip1 z4.s, z10.s, z14.s\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip2 z5.s, z10.s, z14.s\n"
-                    "zip1 z6.s, z11.s, z15.s\n"
-                    "zip2 z7.s, z11.s, z15.s\n"
                     "whilelt p3.s, %[outpos], %[outwidth]\n"
+                    "zip1 z6.s, z11.s, z15.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip2 z7.s, z11.s, z15.s\n"
                     "zip1 z8.s, z0.s, z4.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
                     "st1w z8.s, p0, [%[outptr]]\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "whilelt p4.s, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z14.s, z3.s, z7.s\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
                     "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip2 z15.s, z3.s, z7.s\n"
-                    "whilelt p5.s, %[outpos], %[outwidth]\n"
+                    "zip1 z10.s, z1.s, z5.s\n"
                     "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
-                    "incw %[outpos], all, mul #1\n"
+                    "zip2 z11.s, z1.s, z5.s\n"
                     "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
-                    "whilelt p6.s, %[outpos], %[outwidth]\n"
+                    "zip1 z12.s, z2.s, z6.s\n"
+                    "whilelt p0.s, %[outpos], %[outwidth]\n"
+                    "zip2 z13.s, z2.s, z6.s\n"
+                    "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n"
+                    "zip1 z14.s, z3.s, z7.s\n"
                     "incw %[outpos], all, mul #1\n"
-                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.s, %[outpos], %[outwidth]\n"
+                    "zip2 z15.s, z3.s, z7.s\n"
+                    "whilelt p1.s, %[outpos], %[outwidth]\n"
+                    "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n"
                     "incw %[outpos], all, mul #1\n"
-                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
+                    "whilelt p2.s, %[outpos], %[outwidth]\n"
+                    "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "whilelt p3.s, %[outpos], %[outwidth]\n"
+                    "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
                     "addvl %[outptr], %[outptr], #8\n"
                     "b 1b\n"
                     "2:\n"
                 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7)
                 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
                 );
                 break;
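The rewritten SVE hunks above keep only predicates p0-p3 live, recycling them with paired "whilelt"/"incw" steps, and advance each input pointer with "addvl" instead of the earlier indexed "ld1w ... [ptr, pos, LSL #2]" addressing, which is why p4-p7 drop out of the clobber lists. The zip1/zip2 cascade itself performs an 8-way row interleave, padding absent rows with a zeroed register. A minimal scalar model of that effect (illustrative names only, not library code):

    // Hypothetical scalar equivalent of the zip cascade: for each column x of
    // up to 8 source rows, emit the 8 row values back to back, substituting 0
    // for rows that are not present.
    #include <cstddef>
    #include <cstdint>

    static void interleave8_scalar(const uint32_t *rows[8], size_t present_rows,
                                   uint32_t *out, size_t width)
    {
        for(size_t x = 0; x < width; ++x)
        {
            for(size_t r = 0; r < 8; ++r)
            {
                *out++ = (r < present_rows) ? rows[r][x] : 0u;
            }
        }
    }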
 
diff --git a/src/core/NEON/kernels/arm_gemm/utils.hpp b/src/core/NEON/kernels/arm_gemm/utils.hpp
index a1fc00e..f070780 100644
--- a/src/core/NEON/kernels/arm_gemm/utils.hpp
+++ b/src/core/NEON/kernels/arm_gemm/utils.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,9 +24,7 @@
 
 #pragma once
 
-#ifdef __ARM_FEATURE_SVE
-#include <arm_sve.h>
-#endif
+#include <cstddef>
 
 // Macro for unreachable code (e.g. impossible default cases on switch)
 #define UNREACHABLE(why)  __builtin_unreachable()
@@ -34,7 +32,8 @@
 // Paranoid option for the above with assert
 // #define UNREACHABLE(why)   assert(0 && why)
 
-inline int iceildiv(const int a, const int b) {
+template<typename T>
+inline T iceildiv(const T a, const T b) {
     return (a + b - 1) / b;
 }
 
@@ -49,13 +48,43 @@
     }
 }
 
+namespace arm_gemm {
+namespace utils {
+namespace {
+
+#ifdef __ARM_FEATURE_SVE
+template<size_t sz>
+inline unsigned long get_vector_length_sz() {
+    unsigned long v;
+
+    __asm (
+        "cntb	%0"
+        : "=r" (v)
+    );
+
+    return v / sz;
+}
+
+#define VEC_LEN_SPEC(sz, opcode) template <> inline unsigned long get_vector_length_sz<sz>() { unsigned long v; __asm ( opcode " %0" : "=r" (v)); return v; }
+
+VEC_LEN_SPEC(8, "cntd")
+VEC_LEN_SPEC(4, "cntw")
+VEC_LEN_SPEC(2, "cnth")
+VEC_LEN_SPEC(1, "cntb")
+#endif
+
+} // anonymous namespace
+
 template <typename T>
 inline unsigned long get_vector_length() {
 #ifdef __ARM_FEATURE_SVE
-    const unsigned long length = svcntb();
+    return get_vector_length_sz<sizeof(T)>();
 #else
-    const unsigned long length = 16;
+    return 16 / sizeof(T);
 #endif
+}
 
-    return length / sizeof(T);
-}
\ No newline at end of file
+} // utils namespace
+} // arm_gemm namespace
+
+using namespace arm_gemm::utils;
\ No newline at end of file
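The utils.hpp change replaces the svcntb()-based helper with inline "cntb"/"cnth"/"cntw"/"cntd" specialisations and turns iceildiv into a template. A minimal usage sketch, assuming the header above is on the include path; print_vector_blocks is a hypothetical helper, not library code:

    #include <cstdio>
    // #include "src/core/NEON/kernels/arm_gemm/utils.hpp"

    void print_vector_blocks(unsigned int N)
    {
        // Lanes per vector for float: cntw on SVE builds, 16 / sizeof(float) otherwise.
        const unsigned long lanes  = get_vector_length<float>();
        // Round the dimension up to whole vectors.
        const unsigned long blocks = iceildiv<unsigned long>(N, lanes);
        std::printf("N=%u -> %lu vector(s) of %lu float lanes\n", N, blocks, lanes);
    }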
diff --git a/src/core/NEON/kernels/assembly/Helpers.cpp b/src/core/NEON/kernels/assembly/Helpers.cpp
index 09ac08c..3d8d66d 100644
--- a/src/core/NEON/kernels/assembly/Helpers.cpp
+++ b/src/core/NEON/kernels/assembly/Helpers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,91 +24,47 @@
 
 #include "arm_compute/core/NEON/kernels/assembly/Helpers.h"
 
-#include "NEGEMMInterleavedStrategies.h"
+#include "arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp"
 
 namespace arm_compute
 {
-namespace
-{
-template <typename InputType, bool use_dot = false>
-BlockSizes calculate_block_sizes_template(const CPUInfo &ci, unsigned int M, unsigned int N, unsigned int K)
-{
-    using strategy = typename Kernel<InputType, use_dot>::strategy;
-    return calculate_block_sizes<strategy>(ci, M, N, K);
-}
-} // namespace
-
-const char *get_strategy_name(DataType input_type, bool use_dot)
+arm_gemm::KernelDescription get_gemm_info(DataType                            input_type,
+                                          const CPUInfo                      &ci,
+                                          const unsigned int                  num_threads,
+                                          const INEGEMMWrapperKernel::Params &p,
+                                          float                               alpha,
+                                          float                               beta,
+                                          bool                                pretranspose_hint)
 {
     switch(input_type)
     {
-        case DataType::F32:
-            return Kernel<float>::name;
 #ifdef __aarch64__
-        case DataType::U8:
         case DataType::QASYMM8:
-            if(use_dot)
-            {
-                return Kernel<uint8_t, true>::name;
-            }
-            else
-            {
-                return Kernel<uint8_t, false>::name;
-            }
+        case DataType::U8:
+        {
+            arm_gemm::GemmArgs<uint32_t> args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, alpha, beta, num_threads, pretranspose_hint);
+            return arm_gemm::get_gemm_method<uint8_t, uint32_t>(args);
+        }
         case DataType::S8:
-            if(use_dot)
-            {
-                return Kernel<int8_t, true>::name;
-            }
-            else
-            {
-                return Kernel<int8_t, false>::name;
-            }
-#endif /* __aarch64__ */
+        {
+            arm_gemm::GemmArgs<int32_t> args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, alpha, beta, num_threads, pretranspose_hint);
+            return arm_gemm::get_gemm_method<int8_t, int32_t>(args);
+        }
+#endif // __aarch64__
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F16:
-            return Kernel<__fp16>::name;
+        {
+            arm_gemm::GemmArgs<__fp16> args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, alpha, beta, num_threads, pretranspose_hint);
+            return arm_gemm::get_gemm_method<__fp16, __fp16>(args);
+        }
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-        default:
-            ARM_COMPUTE_ERROR("DataType not supported");
-            break;
-    }
-}
-
-BlockSizes calculate_block_sizes_from_data_type(const CPUInfo &ci, unsigned int M, unsigned int N, unsigned int K, DataType input_type, bool use_dot)
-{
-    switch(input_type)
-    {
         case DataType::F32:
-            return calculate_block_sizes_template<float>(ci, M, N, K);
-#ifdef __aarch64__
-        case DataType::U8:
-        case DataType::QASYMM8:
-            if(use_dot)
-            {
-                return calculate_block_sizes_template<uint8_t, true>(ci, M, N, K);
-            }
-            else
-            {
-                return calculate_block_sizes_template<uint8_t, false>(ci, M, N, K);
-            }
-        case DataType::S8:
-            if(use_dot)
-            {
-                return calculate_block_sizes_template<int8_t, true>(ci, M, N, K);
-            }
-            else
-            {
-                return calculate_block_sizes_template<int8_t, false>(ci, M, N, K);
-            }
-#endif /* __aarch64__ */
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        case DataType::F16:
-            return calculate_block_sizes_template<__fp16>(ci, M, N, K);
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+        {
+            arm_gemm::GemmArgs<float> args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, alpha, beta, num_threads, pretranspose_hint);
+            return arm_gemm::get_gemm_method<float, float>(args);
+        }
         default:
-            ARM_COMPUTE_ERROR("DataType not supported");
-            break;
+            return arm_gemm::KernelDescription();
     }
 }
 } // namespace arm_compute
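get_gemm_info() now forwards directly to arm_gemm::get_gemm_method() per data type instead of going through the removed strategy-name helpers. A hedged caller sketch; only the function signature comes from the hunk above, the Params values and include path are illustrative:

    // #include "arm_compute/core/NEON/kernels/assembly/Helpers.h"

    arm_gemm::KernelDescription query_f32_kernel(const arm_compute::CPUInfo &ci,
                                                 unsigned int                num_threads)
    {
        arm_compute::INEGEMMWrapperKernel::Params p{};
        p.M       = 128;
        p.N       = 128;
        p.K       = 64;
        p.batches = 1;
        p.multis  = 1;

        // alpha = 1, beta = 0, no pretranspose hint.
        return arm_compute::get_gemm_info(arm_compute::DataType::F32, ci, num_threads,
                                          p, 1.0f, 0.0f, false);
    }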
diff --git a/src/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.cpp b/src/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.cpp
deleted file mode 100644
index 2c9cd32..0000000
--- a/src/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.cpp
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h"
-
-#include "NEGEMMInterleavedStrategies.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/WindowIterator.h"
-
-namespace arm_compute
-{
-template <typename To, typename Tr, bool use_dot>
-void NEGEMMInterleavedMatrixMultiplyWrapperTemplate<To, Tr, use_dot>::configure(const ITensor *prepared_a, const ITensor *transformed_b, ITensor *tmp_c, ITensor *c, const Window &block_walker,
-                                                                                const BlockSizes &block_sizes, const INEGEMMWrapperKernel::Params &params, bool b_is_pretransposed, float alpha, float beta, unsigned int max_num_threads)
-{
-    using strategy = typename Kernel<To, use_dot>::strategy;
-
-    _prepared_a         = prepared_a;
-    _transformed_b      = transformed_b;
-    _tmp_c              = tmp_c;
-    _c                  = c;
-    _block_walker       = block_walker;
-    _block_sizes        = block_sizes;
-    _params             = params;
-    _b_is_pretransposed = b_is_pretransposed;
-    _alpha              = alpha;
-    _beta               = beta;
-
-    auto_init_if_empty(*_tmp_c->info(), c->info()->clone()->set_tensor_shape(TensorShape{ _block_sizes.x_block * strategy::out_height(), max_num_threads }));
-}
-
-template <typename To, typename Tr, bool use_dot>
-void NEGEMMInterleavedMatrixMultiplyWrapperTemplate<To, Tr, use_dot>::transform(const MatrixMultiplyWorkload &wl, const ThreadInfo &info, const Window &batch_window, const Coordinates &start_offset,
-                                                                                const Coordinates &end_offset)
-{
-    using strategy = typename Kernel<To, use_dot>::strategy;
-
-    strategy           strat(info.cpu_info);
-    TensorAccessor<To> prepared_a(*_prepared_a);
-    TensorAccessor<To> transformed_b(*_transformed_b);
-    TensorAccessor<Tr> c(*_c);
-    TensorAccessor<Tr> tmp_c(*_tmp_c);
-
-    int  prev_batch      = -1;
-    To *a_ptr           = nullptr;
-    auto window_iterator = arm_compute::create_window_iterator(batch_window, start_offset, end_offset, [&](const Coordinates & id)
-    {
-        const unsigned int y     = id.x();
-        const unsigned int batch = id.y();
-        const unsigned int ymax  = std::min(_params.M, y + strategy::out_height());
-
-        // If it's the first block of a new batch then reset the pointer to A.
-        if(prev_batch != static_cast<int>(batch))
-        {
-            const unsigned int first_m = id.x();
-            a_ptr                      = prepared_a(0, first_m, batch);
-            prev_batch                 = batch;
-        }
-
-        // Call matrix multiply assembly routine to process the block:
-        strat.kernel(a_ptr, transformed_b(wl._offset_transformed_b), tmp_c(0, info.thread_id), 1, wl._bblocks, wl._kern_k);
-        a_ptr += strategy::out_height() * wl._kern_k;
-
-        // Merge the result with the other blocks' results:
-        strat.transforms.Merge(c(0, 0, batch, wl._multi), tmp_c(0, info.thread_id), c.stride(1), y, ymax, wl._x0, wl._xmax, _alpha, (wl._k0 == 0 ? _beta : static_cast<Tr>(1)));
-    });
-    auto on_new_row_size = [&](unsigned int start, unsigned int end)
-    {
-        //Nothing to do
-    };
-    window_iterator.iterate_2D(on_new_row_size);
-}
-
-template <typename To, typename Tr, bool use_dot>
-void NEGEMMInterleavedMatrixMultiplyWrapperTemplate<To, Tr, use_dot>::create_workloads(std::vector<MatrixMultiplyWorkload> &workloads)
-{
-    using strategy = typename Kernel<To, use_dot>::strategy;
-
-    unsigned int offset_transformed_b = 0;
-    execute_window_loop(_block_walker, [&](const Coordinates & id)
-    {
-        const unsigned int x0    = id.x();
-        const unsigned int k0    = id.y();
-        const unsigned int multi = id.z();
-
-        const unsigned int xmax = std::min(x0 + _block_walker.x().step(), _params.N);
-        const unsigned int kmax = std::min(k0 + _block_walker.y().step(), _params.K);
-
-        // Figure out how many "K" the kernel will actually process.
-        const int kern_k  = ceil_to_multiple(kmax - k0, strategy::k_unroll());
-        const int bblocks = DIV_CEIL(xmax - x0, strategy::out_width());
-
-        workloads.push_back(MatrixMultiplyWorkload(offset_transformed_b, x0, xmax, k0, kmax, multi, kern_k, bblocks));
-
-        if(_b_is_pretransposed)
-        {
-            offset_transformed_b += bblocks * strategy::out_width() * kern_k;
-        }
-        else
-        {
-            ARM_COMPUTE_ERROR("Not supported");
-        }
-    });
-}
-
-//TODO: regroup somewhere ?
-template class NEGEMMInterleavedMatrixMultiplyWrapperTemplate<float, float>;
-#ifdef __aarch64__
-template class NEGEMMInterleavedMatrixMultiplyWrapperTemplate<uint8_t, uint32_t>;
-template class NEGEMMInterleavedMatrixMultiplyWrapperTemplate<int8_t, int32_t>;
-template class NEGEMMInterleavedMatrixMultiplyWrapperTemplate<uint8_t, uint32_t, true>;
-template class NEGEMMInterleavedMatrixMultiplyWrapperTemplate<int8_t, int32_t, true>;
-#endif /* __aarch64__ */
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template class NEGEMMInterleavedMatrixMultiplyWrapperTemplate<float16_t, float16_t>;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.cpp b/src/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.cpp
deleted file mode 100644
index 6c201ce..0000000
--- a/src/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.cpp
+++ /dev/null
@@ -1,169 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.h"
-
-#include "NEGEMMInterleavedStrategies.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-
-namespace arm_compute
-{
-namespace
-{
-// Call the lambda function for each workload generated by the passed window.
-template <typename To, bool use_dot, typename Lambda>
-void for_each_element_in_window(const Window &window, const ITensor *b, ITensor *transformed_b, unsigned int N, unsigned int K, Lambda &&lambda)
-{
-    using strategy = typename Kernel<To, use_dot>::strategy;
-
-    unsigned int offset_transformed_b = transformed_b->info()->offset_first_element_in_bytes();
-    execute_window_loop(window, [&](const Coordinates & coordinates)
-    {
-        const unsigned int x0    = coordinates.x();
-        const unsigned int k0    = coordinates.y();
-        const unsigned int multi = coordinates.z();
-
-        const unsigned int offset_b = b->info()->offset_element_in_bytes(Coordinates(0, 0, multi));
-        const unsigned int xmax     = std::min(x0 + window.x().step(), N);
-        const unsigned int kmax     = std::min(k0 + window.y().step(), K);
-
-        /* Figure out the size of each block. */
-        unsigned int x_size = (xmax - x0);
-        unsigned int k_size = (kmax - k0);
-
-        /* Round sizes up as needed. */
-        x_size = ceil_to_multiple(x_size, strategy::out_width());
-        k_size = ceil_to_multiple(k_size, strategy::k_unroll());
-
-        lambda(PrepareBWorkload(offset_b, offset_transformed_b, x0, xmax, k0, kmax));
-
-        //Each workload represents one block:
-        offset_transformed_b += (x_size * k_size * sizeof(To));
-    });
-}
-
-// Calculate the size of transformed_b:
-template <typename To, bool use_dot>
-unsigned int get_B_pretransposed_array_size(unsigned int N, unsigned int K, const BlockSizes &bs)
-{
-    using strategy = typename Kernel<To, use_dot>::strategy;
-
-    // How many full blocks do N / K contain ?
-    size_t num_full_k = K / bs.k_block;
-    size_t num_full_x = N / bs.x_block;
-
-    ARM_COMPUTE_ERROR_ON(bs.x_block % strategy::out_width() != 0);
-    ARM_COMPUTE_ERROR_ON(bs.k_block % strategy::k_unroll() != 0);
-
-    size_t normal_x_size = bs.x_block;
-    size_t normal_k_size = bs.k_block;
-
-    // Round up the leftovers to be a multiple of the strategy processing size:
-    size_t left_over_x_size = ceil_to_multiple(N % bs.x_block, strategy::out_width());
-    size_t left_over_k_size = ceil_to_multiple(K % bs.k_block, strategy::k_unroll());
-
-    // Calculate the total size of the buffer:
-    size_t total = num_full_k * normal_k_size * (num_full_x * normal_x_size + left_over_x_size);
-    total += left_over_k_size * (left_over_x_size + num_full_x * normal_x_size);
-    return total;
-}
-
-} // namespace
-
-template <typename To, bool use_dot>
-BlockSizes NEGEMMInterleavedPrepareBWrapperKernelTemplate<To, use_dot>::block_sizes() const
-{
-    return _block_sizes;
-}
-
-template <typename To, bool use_dot>
-void NEGEMMInterleavedPrepareBWrapperKernelTemplate<To, use_dot>::configure(const ITensor *b, ITensor *transformed_b, bool transpose_b, const CPUInfo &ci, const INEGEMMWrapperKernel::Params &params)
-{
-    using strategy = typename Kernel<To, use_dot>::strategy;
-
-    const unsigned int multis = b->info()->tensor_shape().z();
-    _Nsize                    = b->info()->tensor_shape().x();
-    _Ksize                    = b->info()->tensor_shape().y();
-    _b                        = b;
-    _transformed_b            = transformed_b;
-    _transpose_b              = transpose_b;
-
-    _block_sizes = calculate_block_sizes<strategy>(ci, params.M, params.N, params.K);
-
-    auto_init_if_empty(*transformed_b->info(), b->info()->clone()->set_tensor_shape(TensorShape{ get_B_pretransposed_array_size<To, use_dot>(_Nsize, _Ksize, _block_sizes) }));
-
-    Window window;
-    window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_Nsize, _block_sizes.x_block), _block_sizes.x_block));
-    window.set(Window::DimY, Window::Dimension(0, ceil_to_multiple(_Ksize, _block_sizes.k_block), _block_sizes.k_block));
-    window.set(Window::DimZ, Window::Dimension(0, multis));
-
-    INEKernel::configure(window);
-}
-
-template <typename To, bool use_dot>
-void NEGEMMInterleavedPrepareBWrapperKernelTemplate<To, use_dot>::transform(const PrepareBWorkload &wl, const ThreadInfo &info)
-{
-    using strategy = typename Kernel<To, use_dot>::strategy;
-
-    strategy strat(info.cpu_info);
-    strat.transforms.PrepareB(reinterpret_cast<To *>(_transformed_b->buffer() + wl._offset_transformed_b),
-                              reinterpret_cast<To *>(_b->buffer() + wl._offset_b),
-                              _b->info()->strides_in_bytes().y() / sizeof(To),
-                              wl._x0, wl._xmax, wl._k0, wl._kmax, _transpose_b);
-}
-
-template <typename To, bool use_dot>
-void NEGEMMInterleavedPrepareBWrapperKernelTemplate<To, use_dot>::create_workloads(std::vector<PrepareBWorkload> &workloads)
-{
-    for_each_element_in_window<To, use_dot>(window(), _b, _transformed_b, _Nsize, _Ksize, [&workloads](PrepareBWorkload && wl)
-    {
-        workloads.push_back(std::move(wl));
-    });
-}
-
-template <typename To, bool use_dot>
-void NEGEMMInterleavedPrepareBWrapperKernelTemplate<To, use_dot>::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(window, INEKernel::window());
-    for_each_element_in_window<To, use_dot>(window, _b, _transformed_b, _Nsize, _Ksize, [&](PrepareBWorkload && wl)
-    {
-        this->transform(wl, info);
-    });
-}
-
-template class NEGEMMInterleavedPrepareBWrapperKernelTemplate<float>;
-#ifdef __aarch64__
-template class NEGEMMInterleavedPrepareBWrapperKernelTemplate<uint8_t>;
-template class NEGEMMInterleavedPrepareBWrapperKernelTemplate<int8_t>;
-template class NEGEMMInterleavedPrepareBWrapperKernelTemplate<uint8_t, true>;
-template class NEGEMMInterleavedPrepareBWrapperKernelTemplate<int8_t, true>;
-#endif /* __aarch64__ */
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template class NEGEMMInterleavedPrepareBWrapperKernelTemplate<float16_t>;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h b/src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h
index 69842fe..26d9e99 100644
--- a/src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h
+++ b/src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,90 +44,184 @@
 
 namespace arm_compute
 {
-namespace
+namespace detail
 {
-template <typename To, bool use_dot = false>
-struct Kernel
+/** GEMM Interleaved Strategy interface */
+class IInterleavedStrategy
 {
+public:
+    /** Virtual Destructor */
+    virtual ~IInterleavedStrategy() = default;
+    /** Return output height of the interleaved strategy
+     *
+     * @return Output height of strategy
+     */
+    virtual unsigned int out_height() const = 0;
+    /** Instantiate and configure a prepareB Kernel
+     *
+     * @param[in] b             Input tensor B.
+     * @param[in] transformed_b Reshaped tensor B.
+     * @param[in] params        M, N, K sizes.
+     * @param[in] ci            CPUInfo to be used for kernel configuration.
+     *
+     * @return A wrapped specialized prepareB kernel
+     */
+    virtual std::unique_ptr<NEGEMMInterleavedPrepareBWrapperKernel> instantiate_prepareB(const ITensor                      *b,
+                                                                                         ITensor                            *transformed_b,
+                                                                                         const INEGEMMWrapperKernel::Params &params,
+                                                                                         const CPUInfo                      &ci) = 0;
+    /** Instantiate and configure a transformA Kernel
+     *
+     * @param[in] a             Input tensor A.
+     * @param[in] transformed_a Reshaped tensor A.
+     * @param[in] block_walker  Window representing the layout of the matrix's blocks.
+     * @param[in] params        M, N, K sizes.
+     *
+     * @return A wrapped specialized transformA kernel
+     */
+    virtual std::unique_ptr<NEGEMMInterleavedTransformAWrapper> instantiate_transformA(const ITensor                      *a,
+                                                                                       ITensor                            *transformed_a,
+                                                                                       const Window                       &block_walker,
+                                                                                       const INEGEMMWrapperKernel::Params &params) = 0;
+    /** Instantiate and configure a MatrixMultiply kernel
+     *
+     * @param transformed_a  Already reshaped tensor A.
+     * @param transformed_b  Already reshaped tensor B.
+     * @param tmp_c          Temporary buffer to be used to store intermediate results.
+     * @param c              Result tensor C.
+     * @param block_walker   Window containing iteration information for the M and batch dimensions.
+     * @param block_sizes    Block sizes to use for the matrix multiplication (A & B must have been reshaped using these same block sizes).
+     * @param params         M, N, K sizes.
+     * @param alpha          Alpha value
+     * @param beta           Beta value
+     * @param pretranspose_b Is B also pretransposed?
+     * @param num_threads    Maximum number of threads that might be used for the calculations.
+     *
+     * @return A wrapped specialized MatrixMultiply kernel
+     */
+    virtual std::unique_ptr<NEGEMMInterleavedMatrixMultiplyWrapper> instantiate_matrix_multiply(const ITensor *transformed_a, const ITensor *transformed_b, ITensor *tmp_c, ITensor *c,
+                                                                                                const Window &block_walker, const BlockSizes &block_sizes,
+                                                                                                const INEGEMMWrapperKernel::Params &params, float alpha, float beta, bool pretranspose_b,
+                                                                                                unsigned int num_threads) = 0;
+    /** Calculates the block sizes of a given strategy
+     *
+     * @param[in] ci     CPUInfo to be used for kernel configuration.
+     * @param[in] params M, N, K sizes.
+     *
+     * @return BlockSizes for a given strategy
+     */
+    virtual BlockSizes calculate_block_sizes_for_strategy(const CPUInfo &ci, const INEGEMMWrapperKernel::Params &params) = 0;
 };
 
-#define DEFINE_STRATEGY_SUFFIX(strat, suffix)            \
-    using strategy                    = arm_gemm::strat; \
-    static constexpr const char *name = #strat suffix;
+/** Interleaved Strategy class */
+template <typename StrategyType>
+class InterleavedStrategy : public IInterleavedStrategy
+{
+public:
+    using strategy = StrategyType;
 
-#define DEFINE_STRATEGY(strat) \
-    DEFINE_STRATEGY_SUFFIX(strat, "")
+public:
+    // Inherited methods overridden
+    unsigned int out_height() const override
+    {
+        return strategy::out_height();
+    }
+    std::unique_ptr<NEGEMMInterleavedPrepareBWrapperKernel> instantiate_prepareB(const ITensor                      *b,
+                                                                                 ITensor                            *transformed_b,
+                                                                                 const INEGEMMWrapperKernel::Params &params,
+                                                                                 const CPUInfo                      &ci) override
+    {
+        auto prepare_b = support::cpp14::make_unique<NEGEMMInterleavedPrepareBWrapperKernelTemplate<strategy>>();
+        prepare_b->configure(b, transformed_b, false, ci, params);
+        return std::move(prepare_b);
+    }
+    std::unique_ptr<NEGEMMInterleavedTransformAWrapper> instantiate_transformA(const ITensor                      *a,
+                                                                               ITensor                            *transformed_a,
+                                                                               const Window                       &block_walker,
+                                                                               const INEGEMMWrapperKernel::Params &params) override
+    {
+        auto transform_a = support::cpp14::make_unique<NEGEMMInterleavedTransformAWrapperTemplate<strategy>>();
+        transform_a->configure(a, transformed_a, false, block_walker, params);
+        return std::move(transform_a);
+    }
+    std::unique_ptr<NEGEMMInterleavedMatrixMultiplyWrapper> instantiate_matrix_multiply(const ITensor *transformed_a, const ITensor *transformed_b, ITensor *tmp_c, ITensor *c,
+                                                                                        const Window &block_walker, const BlockSizes &block_sizes,
+                                                                                        const INEGEMMWrapperKernel::Params &params, float alpha, float beta, bool pretranspose_b,
+                                                                                        unsigned int num_threads) override
+    {
+        auto matrix_multiply = support::cpp14::make_unique<NEGEMMInterleavedMatrixMultiplyWrapperTemplate<strategy>>();
+        matrix_multiply->configure(transformed_a, transformed_b, tmp_c, c, block_walker, block_sizes, params, pretranspose_b, alpha, beta, num_threads);
+        return std::move(matrix_multiply);
+    }
 
-#ifdef __ARM_FEATURE_SVE
-template <>
-struct Kernel<float, false>
-{
-    DEFINE_STRATEGY(interleaved_fp32_mla_3VLx8)
-};
-template <>
-struct Kernel<float16_t, false>
-{
-    DEFINE_STRATEGY(interleaved_fp16_mla_3VLx8)
-};
-template <bool use_dot>
-struct Kernel<int8_t, use_dot>
-{
-    DEFINE_STRATEGY(interleaved_s8s32_dot_3VLx8)
-};
-template <bool use_dot>
-struct Kernel<uint8_t, use_dot>
-{
-    DEFINE_STRATEGY(interleaved_u8u32_dot_3VLx8)
-};
-#else /* __ARM_FEATURE_SVE */
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <>
-struct Kernel<float16_t, false>
-{
-    DEFINE_STRATEGY(hgemm_24x8)
-};
-#endif /*__ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-#ifdef __aarch64__
-template <>
-struct Kernel<float, false>
-{
-    DEFINE_STRATEGY(sgemm_12x8)
-};
-template <>
-struct Kernel<int8_t, false>
-{
-    DEFINE_STRATEGY(gemm_s8_4x4)
-};
-template <>
-struct Kernel<uint8_t, false>
-{
-    DEFINE_STRATEGY(gemm_u8_4x4)
+    BlockSizes calculate_block_sizes_for_strategy(const CPUInfo &ci, const INEGEMMWrapperKernel::Params &params) override
+    {
+        return calculate_block_sizes<strategy>(ci, params.M, params.N, params.K);
+    }
 };
 
-//Use different strategies for 8bit dot product:
-template <>
-struct Kernel<int8_t, true>
+/** Create the backend GEMM strategy to use given the provided kernel info
+ *
+ * @param[in] kernel_name Kernel name of the backend strategy to instantiate
+ *
+ * @return The requested kernel strategy if exists else nullptr
+ */
+std::unique_ptr<IInterleavedStrategy> create_strategy(const std::string &kernel_name)
 {
-    DEFINE_STRATEGY_SUFFIX(gemm_s8_12x8, "_dot")
-};
-template <>
-struct Kernel<uint8_t, true>
-{
-    DEFINE_STRATEGY_SUFFIX(gemm_u8_12x8, "_dot")
-};
-#else
-template <>
-struct Kernel<float, false>
-{
-    DEFINE_STRATEGY(sgemm_8x6)
-};
-#endif /* __aarch64__ */
-#endif /* __ARM_FEATURE_SVE */
-
-#undef DEFINE_STRATEGY
-#undef DEFINE_STRATEGY_SUFFIX
-
-} // namespace
+#if defined(__arm__)
+    if(kernel_name.find("sgemm_8x6") != std::string::npos)
+    {
+        return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::sgemm_8x6>>();
+    }
+#endif // defined(__arm__)
+#if defined(__aarch64__)
+    if(kernel_name.find("gemm_s8_4x4") != std::string::npos)
+    {
+        return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::gemm_s8_4x4>>();
+    }
+    if(kernel_name.find("gemm_s8_12x8") != std::string::npos)
+    {
+        return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::gemm_s8_12x8>>();
+    }
+    if(kernel_name.find("gemm_u8_4x4") != std::string::npos)
+    {
+        return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::gemm_u8_4x4>>();
+    }
+    if(kernel_name.find("gemm_u8_12x8") != std::string::npos)
+    {
+        return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::gemm_u8_12x8>>();
+    }
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+    if(kernel_name.find("hgemm_24x8") != std::string::npos)
+    {
+        return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::hgemm_24x8>>();
+    }
+#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+    if(kernel_name.find("sgemm_12x8") != std::string::npos)
+    {
+        return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::sgemm_12x8>>();
+    }
+#if defined(__ARM_FEATURE_SVE)
+    if(kernel_name.find("interleaved_fp16_mla_3VLx8") != std::string::npos)
+    {
+        return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::interleaved_fp16_mla_3VLx8>>();
+    }
+    if(kernel_name.find("interleaved_fp32_mla_3VLx8") != std::string::npos)
+    {
+        return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::interleaved_fp32_mla_3VLx8>>();
+    }
+    if(kernel_name.find("interleaved_s8s32_dot_3VLx8") != std::string::npos)
+    {
+        return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::interleaved_s8s32_dot_3VLx8>>();
+    }
+    if(kernel_name.find("interleaved_u8u32_dot_3VLx8") != std::string::npos)
+    {
+        return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::interleaved_u8u32_dot_3VLx8>>();
+    }
+#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(__aarch64__)
+    return nullptr;
+}
+} // namespace detail
 } // namespace arm_compute
 #endif /* __ARM_COMPUTE_NEGEMMINTERLEAVEDSTRATEGIES_H__ */
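For illustration, a sketch (not part of this patch) of how the name-based dispatch above could be driven; the kernel name and problem sizes are placeholders, and the CPUInfo is assumed to come from the runtime scheduler:

    // Sketch: resolve an interleaved strategy from a kernel name reported by arm_gemm
    // and query the blocking it would use for a given problem size.
    using namespace arm_compute;

    INEGEMMWrapperKernel::Params params{};
    params.M = 128;
    params.N = 96;
    params.K = 64;

    auto strat = detail::create_strategy("sgemm_12x8");
    if(strat != nullptr)
    {
        const auto bs = strat->calculate_block_sizes_for_strategy(Scheduler::get().cpu_info(), params);
        // bs now holds the K/N blocking used when creating the interleaved GEMM workloads.
    }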
diff --git a/src/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.cpp b/src/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.cpp
deleted file mode 100644
index 3b80a1f..0000000
--- a/src/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.cpp
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h"
-
-#include "NEGEMMInterleavedStrategies.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/WindowIterator.h"
-
-#include "utils/TypePrinter.h"
-
-namespace arm_compute
-{
-template <typename To, bool use_dot>
-void NEGEMMInterleavedTransformAWrapperTemplate<To, use_dot>::configure(const ITensor *a, ITensor *transformed_a, bool transpose_a, const Window &block_walker,
-                                                                        const INEGEMMWrapperKernel::Params &params)
-{
-    _a              = a;
-    _transformed_a  = transformed_a;
-    _transpose_a    = transpose_a;
-    _Ksize          = params.K;
-    _Msize          = params.M;
-    _k_multi_window = block_walker.shift_dimensions(1); // block_walker contains (M,K,Multi) --> shift by 1 to get rid of the "M" dimension
-}
-
-template <typename To, bool use_dot>
-void NEGEMMInterleavedTransformAWrapperTemplate<To, use_dot>::transform(const TransformAWorkload &wl, const ThreadInfo &info, const Window &batch_window, const Coordinates &start_offset,
-                                                                        const Coordinates &end_offset)
-{
-    using strategy = typename Kernel<To, use_dot>::strategy;
-
-    strategy           strat(info.cpu_info);
-    TensorAccessor<To> a(*_a);
-    TensorAccessor<To> transformed_a(*_transformed_a);
-
-    if(_a->info()->data_layout() == DataLayout::NHWC)
-    {
-        // In the case of NHWC we want to interpret the output shape as 3D. Thus, the batch stride for A is
-        // the relevant multiple of the row stride.
-        const size_t nhwc_batch_stride = _a->info()->strides_in_bytes().y() * _Msize;
-        a.set_stride(2, nhwc_batch_stride);
-    }
-
-    unsigned int last_m = 0;
-    //TODO: Create a new iterate_1D( DimY);
-    int  last_y          = -1;
-    auto window_iterator = arm_compute::create_window_iterator(batch_window, start_offset, end_offset, [&](const Coordinates & id)
-    {
-        if(id.y() != last_y)
-        {
-            last_y               = id.y();
-            unsigned int batch   = id.y();
-            unsigned int first_m = id.x();
-
-            if(first_m >= last_m)
-                return;
-
-            strat.transforms.PrepareA(transformed_a(0, first_m, batch),
-                                      a(0, 0, batch, wl._multi),
-                                      a.stride(1), first_m, last_m, wl._k0, wl._kmax, _transpose_a);
-        }
-    });
-    auto on_new_row_size = [&](unsigned int start, unsigned int end)
-    {
-        last_m = std::min(end, _Msize);
-    };
-    window_iterator.iterate_2D(on_new_row_size);
-}
-
-template <typename To, bool use_dot>
-void NEGEMMInterleavedTransformAWrapperTemplate<To, use_dot>::create_workloads(std::vector<TransformAWorkload> &workloads)
-{
-    execute_window_loop(_k_multi_window, [&](const Coordinates & id)
-    {
-        const unsigned int k0    = id.x();
-        const unsigned int multi = id.y();
-        const unsigned int kmax  = std::min(k0 + _k_multi_window.x().step(), _Ksize);
-
-        workloads.push_back(TransformAWorkload(k0, kmax, multi));
-    });
-}
-
-template class NEGEMMInterleavedTransformAWrapperTemplate<float>;
-#ifdef __aarch64__
-template class NEGEMMInterleavedTransformAWrapperTemplate<uint8_t>;
-template class NEGEMMInterleavedTransformAWrapperTemplate<int8_t>;
-template class NEGEMMInterleavedTransformAWrapperTemplate<uint8_t, true>;
-template class NEGEMMInterleavedTransformAWrapperTemplate<int8_t, true>;
-#endif /* __aarch64__ */
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template class NEGEMMInterleavedTransformAWrapperTemplate<float16_t>;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.cpp b/src/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.cpp
index e452dfb..7b1f3e7 100644
--- a/src/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.cpp
+++ b/src/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,11 +34,7 @@
 #include "../arm_gemm/mergeresults.hpp"
 #include "../arm_gemm/transform.hpp"
 
-#include "../arm_gemm/kernels/a32_sgemm_8x6.hpp"
-#include "../arm_gemm/kernels/a64_sgemm_12x8.hpp"
 #include "../arm_gemm/kernels/a64_sgemm_native_16x4.hpp"
-#include "../arm_gemm/kernels/a64_sgemv_pretransposed.hpp"
-#include "../arm_gemm/kernels/a64_sgemv_trans.hpp"
 
 namespace arm_compute
 {
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index 39dad8f..73eaf64 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017, 2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,8 +22,9 @@
  * SOFTWARE.
  */
 
-#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Helpers.h"
 
+#include "arm_compute/core/Utils.h"
 #include "support/ToolchainSupport.h"
 
 #include <algorithm>
@@ -48,8 +49,10 @@
     std::string   out;
     std::ifstream fs;
 
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
     try
     {
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
         fs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
         std::ios_base::openmode mode = std::ios::in;
 
@@ -68,11 +71,13 @@
         fs.seekg(0, std::ios::beg);
         // Copy the content of the file
         out.assign(std::istreambuf_iterator<char>(fs), std::istreambuf_iterator<char>());
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
     }
     catch(const std::ifstream::failure &e)
     {
         ARM_COMPUTE_ERROR("Accessing %s: %s", filename.c_str(), e.what());
     }
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
 
     return out;
 }
@@ -321,17 +326,19 @@
     return res;
 }
 
-PadStrideInfo arm_compute::calculate_same_pad(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info)
+PadStrideInfo arm_compute::calculate_same_pad(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info, DataLayout data_layout)
 {
-    const auto &strides         = conv_info.stride();
-    const int   out_width       = std::ceil(float(input_shape.x()) / float(strides.first));
-    const int   out_height      = std::ceil(float(input_shape.y()) / float(strides.second));
-    const int   pad_width       = ((out_width - 1) * strides.first + weights_shape.x() - input_shape.x());
-    const int   pad_height      = ((out_height - 1) * strides.second + weights_shape.y() - input_shape.y());
-    const int   same_pad_left   = pad_width / 2;
-    const int   same_pad_top    = pad_height / 2;
-    const int   same_pad_right  = pad_width - same_pad_left;
-    const int   same_pad_bottom = pad_height - same_pad_top;
+    const unsigned int width_idx       = arm_compute::get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const unsigned int height_idx      = arm_compute::get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const auto        &strides         = conv_info.stride();
+    const int          out_width       = std::ceil(float(input_shape[width_idx]) / float(strides.first));
+    const int          out_height      = std::ceil(float(input_shape[height_idx]) / float(strides.second));
+    const int          pad_width       = ((out_width - 1) * strides.first + weights_shape[width_idx] - input_shape[width_idx]);
+    const int          pad_height      = ((out_height - 1) * strides.second + weights_shape[height_idx] - input_shape[height_idx]);
+    const int          same_pad_left   = pad_width / 2;
+    const int          same_pad_top    = pad_height / 2;
+    const int          same_pad_right  = pad_width - same_pad_left;
+    const int          same_pad_bottom = pad_height - same_pad_top;
 
     return PadStrideInfo(strides.first, strides.second, same_pad_left, same_pad_right, same_pad_top, same_pad_bottom, DimensionRoundingType::CEIL);
 }
@@ -391,6 +398,7 @@
     return std::make_pair(w, h);
 }
 
+#ifdef ARM_COMPUTE_ASSERTS_ENABLED
 void arm_compute::print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n, int stream_width, const std::string &element_delim)
 {
     switch(dt)
@@ -451,3 +459,4 @@
     }
     return 0;
 }
+#endif /* ARM_COMPUTE_ASSERTS_ENABLED */
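A sketch of the layout-aware SAME-pad arithmetic this hunk introduces, with illustrative shapes and strides:

    // Sketch: SAME padding for a 224x224 NHWC input, 3x3 kernel, stride 2.
    // In NHWC the TensorShape dimensions are (C, W, H), so width/height are indices 1 and 2.
    using namespace arm_compute;

    const TensorShape   input_shape(3U, 224U, 224U);  // C, W, H
    const TensorShape   weights_shape(3U, 3U, 3U);    // C, Kw, Kh
    const PadStrideInfo conv_info(2, 2, 0, 0);        // stride x/y; padding is computed by the helper
    const PadStrideInfo same = calculate_same_pad(input_shape, weights_shape, conv_info, DataLayout::NHWC);
    // out = ceil(224 / 2) = 112, pad = (112 - 1) * 2 + 3 - 224 = 1
    // -> pad_left = pad_top = 0, pad_right = pad_bottom = 1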
diff --git a/src/core/utils/helpers/tensor_transform.cpp b/src/core/utils/helpers/tensor_transform.cpp
index a4bce5d..08803c7 100644
--- a/src/core/utils/helpers/tensor_transform.cpp
+++ b/src/core/utils/helpers/tensor_transform.cpp
@@ -23,13 +23,143 @@
  */
 #include "arm_compute/core/utils/helpers/tensor_transform.h"
 
+#include "arm_compute/core/utils/helpers/bit_ops.h"
+
 namespace arm_compute
 {
 namespace helpers
 {
 namespace tensor_transform
 {
-Coordinates slice_absolute_end_coords(TensorShape input_shape, Coordinates ends)
+int calculate_stride_on_index(int index, Coordinates strides)
+{
+    return index >= static_cast<int>(strides.num_dimensions()) ? 1 : strides[index];
+}
+
+int calculate_start_on_index(TensorShape input_shape, int index, Coordinates starts, Coordinates strides, int32_t begin_mask)
+{
+    // Early exit
+    if(index >= static_cast<int>(starts.num_dimensions()))
+    {
+        return 0;
+    }
+
+    // Get stride
+    const int stride = calculate_stride_on_index(index, strides);
+
+    // Calculate start
+    int start = starts[index];
+
+    // Reset in case of begin mask present
+    if(arm_compute::helpers::bit_ops::is_bit_set(begin_mask, index))
+    {
+        start = stride > 0 ? std::numeric_limits<int>::lowest() : std::numeric_limits<int>::max();
+    }
+
+    // Account for negative start points
+    const int dim_size = input_shape[index];
+    if(start < 0)
+    {
+        start += dim_size;
+    }
+
+    // Final clamp
+    start = utility::clamp(start, 0, dim_size - 1);
+
+    return start;
+}
+
+int calculate_end_on_index(TensorShape input_shape, int index, int start_on_index,
+                           Coordinates ends, Coordinates strides,
+                           int32_t end_mask, int32_t shrink_axis_mask)
+{
+    // Early exit
+    if(index >= static_cast<int>(ends.num_dimensions()))
+    {
+        return input_shape[index];
+    }
+
+    const int  stride      = calculate_stride_on_index(index, strides);
+    const bool shrink_axis = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, index);
+
+    // Calculate the end/stop index
+    int stop = ends[index];
+
+    // Shrink dimension
+    if(shrink_axis)
+    {
+        stop = start_on_index + 1;
+    }
+
+    // Reset in case of end mask present
+    if(arm_compute::helpers::bit_ops::is_bit_set(end_mask, index) && !shrink_axis)
+    {
+        stop = (stride > 0) ? std::numeric_limits<int>::max() : std::numeric_limits<int>::lowest();
+    }
+
+    // Account for negative end points
+    const int dim_size = input_shape[index];
+    if(stop < 0)
+    {
+        stop += dim_size;
+    }
+
+    // Final clamp
+    stop = (stride > 0) ? utility::clamp(stop, 0, dim_size) : utility::clamp(stop, -1, dim_size - 1);
+
+    return stop;
+}
+
+std::tuple<Coordinates, Coordinates, Coordinates> calculate_strided_slice_coords(TensorShape input_shape,
+                                                                                 Coordinates starts, Coordinates ends, Coordinates strides,
+                                                                                 int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+    Coordinates starts_abs, ends_abs, final_strides;
+    for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
+    {
+        const int start_i = calculate_start_on_index(input_shape, i, starts, strides, begin_mask);
+        starts_abs.set(i, start_i);
+        ends_abs.set(i, calculate_end_on_index(input_shape, i, start_i, ends, strides, end_mask, shrink_axis_mask));
+        final_strides.set(i, calculate_stride_on_index(i, strides));
+    }
+
+    return std::make_tuple(starts_abs, ends_abs, final_strides);
+}
+
+TensorShape compute_strided_slice_output_shape(TensorShape input_shape, Coordinates starts, Coordinates ends, Coordinates strides,
+                                               int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask, bool return_unshrinked)
+{
+    unsigned int index = 0;
+
+    TensorShape output_shape;
+    for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
+    {
+        const int stride = calculate_stride_on_index(index, strides);
+        const int start  = calculate_start_on_index(input_shape, i, starts, strides, begin_mask);
+        const int end    = calculate_end_on_index(input_shape, i, start, ends, strides, end_mask, shrink_axis_mask);
+        const int range  = end - start;
+
+        const bool is_shrink = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, i);
+        if(return_unshrinked || !is_shrink)
+        {
+            if((range == 0) ||               // Zero range
+               (range < 0 && stride >= 0) || // Negative range with positive stride
+               (range > 0 && stride <= 0))   // Positive range with negative stride
+            {
+                output_shape.set(index, 0);
+                return output_shape;
+            }
+            else
+            {
+                int dim = range / stride + (range % stride != 0 ? 1 : 0);
+                output_shape.set(index++, dim);
+            }
+        }
+    }
+    return output_shape;
+}
+
+int32_t construct_slice_end_mask(Coordinates ends)
 {
     // Create end mask
     int32_t end_mask = 0;
@@ -40,126 +170,8 @@
             end_mask |= 1 << i;
         }
     }
-    // Get unit strides
-    const BiStrides unit_strides = strided_slice_strides(input_shape, BiStrides());
 
-    return strided_slice_absolute_end_coords(input_shape, Coordinates(), ends, unit_strides, end_mask);
-}
-
-TensorShape compute_slice_output_shape(TensorShape input_shape, Coordinates starts, Coordinates ends_abs)
-{
-    // Get unit strides
-    const BiStrides unit_strides = strided_slice_strides(input_shape, BiStrides());
-    return compute_strided_slice_output_shape(input_shape, starts, ends_abs, unit_strides);
-}
-
-Coordinates strided_slice_absolute_start_coords(TensorShape input_shape, Coordinates starts, Coordinates strides, int32_t begin_mask)
-{
-    Coordinates starts_abs;
-    for(unsigned int i = 0; i < starts.num_dimensions(); ++i)
-    {
-        // Get start index
-        int start_i = starts[i];
-
-        // Reset in case of begin mask present
-        if((begin_mask & 1 << i) != 0)
-        {
-            start_i = strides[i] > 0 ? std::numeric_limits<int>::lowest() : std::numeric_limits<int>::max();
-        }
-
-        // Account negative start points
-        const int dim_size = input_shape[i];
-        if(start_i < 0)
-        {
-            start_i += dim_size;
-        }
-
-        // Final clamp
-        start_i = utility::clamp(start_i, 0, dim_size - 1);
-        starts_abs.set(i, start_i);
-    }
-
-    // Fill remaining
-    for(unsigned int i = starts_abs.num_dimensions(); i < input_shape.num_dimensions(); ++i)
-    {
-        starts_abs.set(i, 0);
-    }
-
-    return starts_abs;
-}
-
-Coordinates strided_slice_absolute_end_coords(TensorShape input_shape, Coordinates starts_abs, Coordinates ends, Coordinates strides,
-                                              int32_t end_mask, int32_t shrink_axis_mask)
-{
-    Coordinates ends_abs;
-    for(unsigned int i = 0; i < ends.num_dimensions(); ++i)
-    {
-        // Get end index
-        int stop_i = ends[i];
-
-        // Shrink dimension
-        if((shrink_axis_mask & (1 << i)) != 0)
-        {
-            stop_i = starts_abs[i] + 1;
-        }
-
-        // Reset in case of begin mask present
-        if((end_mask & 1 << i) != 0)
-        {
-            stop_i = (strides[i] > 0) ? std::numeric_limits<int>::max() : std::numeric_limits<int>::lowest();
-        }
-
-        // Account negative end points
-        const int dim_size = input_shape[i];
-        if(stop_i < 0)
-        {
-            stop_i += dim_size;
-        }
-
-        // Final clamp
-        stop_i = (strides[i] > 0) ? utility::clamp(stop_i, 0, dim_size) : utility::clamp(stop_i, -1, dim_size - 1);
-        ends_abs.set(i, stop_i);
-    }
-
-    // Fill remaining ends
-    for(unsigned int i = ends_abs.num_dimensions(); i < input_shape.num_dimensions(); ++i)
-    {
-        ends_abs.set(i, input_shape[i]);
-    }
-
-    return ends_abs;
-}
-
-Coordinates strided_slice_strides(TensorShape input_shape, Coordinates strides)
-{
-    for(unsigned int i = strides.num_dimensions(); i < input_shape.num_dimensions(); ++i)
-    {
-        strides.set(i, 1);
-    }
-    return strides;
-}
-
-TensorShape compute_strided_slice_output_shape(TensorShape input_shape, Coordinates starts_abs, Coordinates ends_abs, Coordinates final_strides)
-{
-    TensorShape output_shape = input_shape;
-    for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
-    {
-        const int stride_i = final_strides[i];
-        const int range    = ends_abs[i] - starts_abs[i];
-        if((range == 0) ||                 // Zero range
-           (range < 0 && stride_i >= 0) || // Negative range with positive stride
-           (range > 0 && stride_i <= 0))   // Positive range with negative stride
-        {
-            output_shape.set(i, 0);
-            return output_shape;
-        }
-        else
-        {
-            int dim = range / stride_i + (range % stride_i != 0 ? 1 : 0);
-            output_shape.set(i, dim);
-        }
-    }
-    return output_shape;
+    return end_mask;
 }
 } // namespace tensor_transform
 } // namespace helpers
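The helpers above follow TensorFlow-style strided-slice semantics: negative indices wrap, begin/end masks reset the bound to its extreme, and shrink_axis pins a dimension to one element. A sketch with illustrative values:

    // Sketch: slice a 10-element axis as [-4:10:2] with no masks set.
    using namespace arm_compute;
    using namespace arm_compute::helpers::tensor_transform;

    const TensorShape shape(10U);
    const Coordinates starts(-4), ends(10), strides(2);

    const int start = calculate_start_on_index(shape, 0, starts, strides, 0);       // -4 wraps to 6
    const int end   = calculate_end_on_index(shape, 0, start, ends, strides, 0, 0); // clamped to 10
    const TensorShape out = compute_strided_slice_output_shape(shape, starts, ends, strides, 0, 0, 0);
    // out[0] == 2: the slice selects the elements at indices 6 and 8.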
diff --git a/src/graph/GraphBuilder.cpp b/src/graph/GraphBuilder.cpp
index b2ca28d..a944d2c 100644
--- a/src/graph/GraphBuilder.cpp
+++ b/src/graph/GraphBuilder.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -310,8 +310,8 @@
     return nid;
 }
 
-NodeID GraphBuilder::add_depthwise_convolution_node(Graph &g, NodeParams params, NodeIdxPair input, Size2D kernel_spatial_extend, PadStrideInfo conv_info,
-                                                    DepthwiseConvolutionMethod method,
+NodeID GraphBuilder::add_depthwise_convolution_node(Graph &g, NodeParams params, NodeIdxPair input, Size2D kernel_spatial_extend,
+                                                    PadStrideInfo conv_info, int depth_multiplier, DepthwiseConvolutionMethod method,
                                                     ITensorAccessorUPtr weights_accessor, ITensorAccessorUPtr bias_accessor, const QuantizationInfo quant_info)
 {
     CHECK_NODEIDX_PAIR(input, g);
@@ -327,7 +327,7 @@
     w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::WIDTH), kernel_spatial_extend.width);
     w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::HEIGHT), kernel_spatial_extend.height);
     w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::CHANNEL),
-                     get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL));
+                     get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL) * depth_multiplier);
     if(!quant_info.empty())
     {
         w_desc.quant_info = quant_info;
@@ -340,7 +340,7 @@
     if(has_bias)
     {
         TensorDescriptor b_desc = input_tensor_desc;
-        b_desc.shape            = TensorShape(get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL));
+        b_desc.shape            = TensorShape(get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL) * depth_multiplier);
 
         if(is_data_type_quantized_asymmetric(b_desc.data_type))
         {
@@ -351,7 +351,7 @@
     }
 
     // Create convolution node and connect
-    NodeID conv_nid = g.add_node<DepthwiseConvolutionLayerNode>(conv_info, method);
+    NodeID conv_nid = g.add_node<DepthwiseConvolutionLayerNode>(conv_info, depth_multiplier, method);
     g.add_connection(input.node_id, input.index, conv_nid, 0);
     g.add_connection(w_nid, 0, conv_nid, 1);
     if(has_bias)
@@ -362,6 +362,22 @@
 
     return conv_nid;
 }
+NodeID GraphBuilder::add_detection_output_node(Graph &g, NodeParams params, NodeIdxPair input_loc, NodeIdxPair input_conf, NodeIdxPair input_priorbox, DetectionOutputLayerInfo detect_info)
+{
+    CHECK_NODEIDX_PAIR(input_loc, g);
+    CHECK_NODEIDX_PAIR(input_conf, g);
+    CHECK_NODEIDX_PAIR(input_priorbox, g);
+
+    // Create detection_output node and connect
+    NodeID detect_nid = g.add_node<DetectionOutputLayerNode>(detect_info);
+    g.add_connection(input_loc.node_id, input_loc.index, detect_nid, 0);
+    g.add_connection(input_conf.node_id, input_conf.index, detect_nid, 1);
+    g.add_connection(input_priorbox.node_id, input_priorbox.index, detect_nid, 2);
+
+    set_node_params(g, detect_nid, params);
+
+    return detect_nid;
+}
 
 NodeID GraphBuilder::add_dummy_node(Graph &g, NodeParams params, NodeIdxPair input, TensorShape shape)
 {
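A sketch of how a graph description might use the two builder changes above; the graph `g`, node ids and layer info are placeholders, and the call assumes the header's default arguments for the convolution method, accessors and quantisation info:

    // Sketch: depthwise convolution with depth_multiplier = 2, then a detection output
    // joining the location, confidence and prior-box branches.
    using namespace arm_compute;
    using namespace arm_compute::graph;

    NodeParams params  = { "example", Target::NEON };
    NodeID     dwc_nid = GraphBuilder::add_depthwise_convolution_node(g, params, { input_nid, 0 },
                                                                      Size2D(3, 3), PadStrideInfo(1, 1, 1, 1),
                                                                      2 /* depth_multiplier */);

    DetectionOutputLayerInfo detect_info{}; // stands in for a fully configured info object
    NodeID detect_nid = GraphBuilder::add_detection_output_node(g, params, { loc_nid, 0 }, { conf_nid, 0 },
                                                                { priorbox_nid, 0 }, detect_info);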
diff --git a/src/graph/TypeLoader.cpp b/src/graph/TypeLoader.cpp
index 30a3546..e0ba7e2 100644
--- a/src/graph/TypeLoader.cpp
+++ b/src/graph/TypeLoader.cpp
@@ -38,14 +38,19 @@
         { "qasymm8", DataType::QASYMM8 },
     };
 
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
     try
     {
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
         return data_types.at(arm_compute::utility::tolower(name));
+
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
     }
     catch(const std::out_of_range &)
     {
         throw std::invalid_argument(name);
     }
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
 }
 
 arm_compute::DataLayout data_layout_from_name(const std::string &name)
@@ -56,14 +61,19 @@
         { "nchw", DataLayout::NCHW },
     };
 
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
     try
     {
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
         return data_layouts.at(arm_compute::utility::tolower(name));
+
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
     }
     catch(const std::out_of_range &)
     {
         throw std::invalid_argument(name);
     }
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
 }
 namespace graph
 {
@@ -73,17 +83,22 @@
     {
         { "neon", Target::NEON },
         { "cl", Target::CL },
-        { "gles", Target::GC },
+        { "gc", Target::GC },
     };
 
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
     try
     {
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
         return targets.at(arm_compute::utility::tolower(name));
+
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
     }
     catch(const std::out_of_range &)
     {
         throw std::invalid_argument(name);
     }
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
 }
 } // namespace graph
 } // namespace arm_compute
diff --git a/src/graph/backends/CL/CLFunctionsFactory.cpp b/src/graph/backends/CL/CLFunctionsFactory.cpp
index c37a137..b9e3ddc 100644
--- a/src/graph/backends/CL/CLFunctionsFactory.cpp
+++ b/src/graph/backends/CL/CLFunctionsFactory.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,6 +27,7 @@
 #include "arm_compute/graph/Graph.h"
 #include "arm_compute/graph/backends/FunctionHelpers.h"
 #include "arm_compute/runtime/CL/CLFunctions.h"
+#include "arm_compute/runtime/CPP/CPPFunctions.h"
 
 using namespace arm_compute::utils::cast;
 
@@ -68,6 +69,94 @@
     using Subtraction    = CLArithmeticSubtraction;
     using Multiplication = CLPixelWiseMultiplication;
 };
+// TODO (isagot01): Remove once we support heterogeneous scheduling at function level
+/** Wrapper for CPP functions in the OpenCL backend */
+class CPPWrapperFunction : public IFunction
+{
+public:
+    /* Default constructor */
+    CPPWrapperFunction()
+        : _tensors(), _func(nullptr)
+    {
+    }
+
+    void run() override
+    {
+        for(auto &tensor : _tensors)
+        {
+            tensor->map(CLScheduler::get().queue());
+        }
+        _func->run();
+
+        for(auto &tensor : _tensors)
+        {
+            tensor->unmap(CLScheduler::get().queue());
+        }
+    }
+
+    void register_tensor(ICLTensor *tensor)
+    {
+        _tensors.push_back(tensor);
+    }
+
+    void register_function(std::unique_ptr<IFunction> function)
+    {
+        _func = std::move(function);
+    }
+
+private:
+    std::vector<arm_compute::ICLTensor *> _tensors;
+    std::unique_ptr<IFunction>            _func;
+};
+
+namespace detail
+{
+// Specialized functions
+template <>
+std::unique_ptr<IFunction> create_detection_output_layer<CPPDetectionOutputLayer, CLTargetInfo>(DetectionOutputLayerNode &node)
+{
+    validate_node<CLTargetInfo>(node, 3 /* expected inputs */, 1 /* expected outputs */);
+
+    // Extract IO and info
+    CLTargetInfo::TensorType      *input0      = get_backing_tensor<CLTargetInfo>(node.input(0));
+    CLTargetInfo::TensorType      *input1      = get_backing_tensor<CLTargetInfo>(node.input(1));
+    CLTargetInfo::TensorType      *input2      = get_backing_tensor<CLTargetInfo>(node.input(2));
+    CLTargetInfo::TensorType      *output      = get_backing_tensor<CLTargetInfo>(node.output(0));
+    const DetectionOutputLayerInfo detect_info = node.detection_output_info();
+
+    ARM_COMPUTE_ERROR_ON(input0 == nullptr);
+    ARM_COMPUTE_ERROR_ON(input1 == nullptr);
+    ARM_COMPUTE_ERROR_ON(input2 == nullptr);
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    // Create and configure function
+    auto func = support::cpp14::make_unique<CPPDetectionOutputLayer>();
+    func->configure(input0, input1, input2, output, detect_info);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
+                               << node.name()
+                               << " Type: " << node.type()
+                               << " Target: " << CLTargetInfo::TargetType
+                               << " Data Type: " << input0->info()->data_type()
+                               << " Input0 shape: " << input0->info()->tensor_shape()
+                               << " Input1 shape: " << input1->info()->tensor_shape()
+                               << " Input2 shape: " << input2->info()->tensor_shape()
+                               << " Output shape: " << output->info()->tensor_shape()
+                               << " DetectionOutputLayer info: " << detect_info
+                               << std::endl);
+
+    auto wrap_function = support::cpp14::make_unique<CPPWrapperFunction>();
+    wrap_function->register_function(std::move(func));
+    wrap_function->register_tensor(input0);
+    wrap_function->register_tensor(input1);
+    wrap_function->register_tensor(input2);
+    wrap_function->register_tensor(output);
+
+    return std::move(wrap_function);
+}
+} // namespace detail
 
 std::unique_ptr<IFunction> CLFunctionFactory::create(INode *node, GraphContext &ctx)
 {
@@ -95,6 +184,8 @@
             return detail::create_concatenate_layer<CLConcatenateLayer, CLTargetInfo>(*polymorphic_downcast<ConcatenateLayerNode *>(node));
         case NodeType::DepthwiseConvolutionLayer:
             return detail::create_depthwise_convolution_layer<CLDepthwiseConvolutionLayerFunctions, CLTargetInfo>(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+        case NodeType::DetectionOutputLayer:
+            return detail::create_detection_output_layer<CPPDetectionOutputLayer, CLTargetInfo>(*polymorphic_downcast<DetectionOutputLayerNode *>(node));
         case NodeType::EltwiseLayer:
             return detail::create_eltwise_layer<CLEltwiseFunctions, CLTargetInfo>(*polymorphic_downcast<EltwiseLayerNode *>(node));
         case NodeType::FlattenLayer:
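CPPDetectionOutputLayer runs on the host, which is why the wrapper maps each registered CL tensor before running and unmaps it afterwards. The underlying idiom, sketched in isolation (do_cpu_work is a hypothetical host-side routine, not a library call):

    // Sketch: make a device buffer visible to host code, use it, then hand it back.
    tensor->map(CLScheduler::get().queue());    // blocking map of the CL buffer
    do_cpu_work(tensor->buffer());              // hypothetical CPU processing on the mapped pointer
    tensor->unmap(CLScheduler::get().queue());  // release the mapping before further GPU work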
diff --git a/src/graph/backends/CL/CLNodeValidator.cpp b/src/graph/backends/CL/CLNodeValidator.cpp
index a070973..4b71837 100644
--- a/src/graph/backends/CL/CLNodeValidator.cpp
+++ b/src/graph/backends/CL/CLNodeValidator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,6 +28,7 @@
 
 #include "arm_compute/core/utils/misc/Cast.h"
 #include "arm_compute/runtime/CL/CLFunctions.h"
+#include "arm_compute/runtime/CPP/CPPFunctions.h"
 
 using namespace arm_compute::utils::cast;
 
@@ -59,6 +60,8 @@
         case NodeType::DepthwiseConvolutionLayer:
             return detail::validate_depthwise_convolution_layer<CLDepthwiseConvolutionLayer,
                    CLDepthwiseConvolutionLayer3x3>(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+        case NodeType::DetectionOutputLayer:
+            return detail::validate_detection_output_layer<CPPDetectionOutputLayer>(*polymorphic_downcast<DetectionOutputLayerNode *>(node));
         case NodeType::GenerateProposalsLayer:
             return detail::validate_generate_proposals_layer<CLGenerateProposalsLayer>(*polymorphic_downcast<GenerateProposalsLayerNode *>(node));
         case NodeType::NormalizePlanarYUVLayer:
diff --git a/src/graph/backends/GLES/GCFunctionsFactory.cpp b/src/graph/backends/GLES/GCFunctionsFactory.cpp
index 2ca453e..0de58f5 100644
--- a/src/graph/backends/GLES/GCFunctionsFactory.cpp
+++ b/src/graph/backends/GLES/GCFunctionsFactory.cpp
@@ -176,8 +176,8 @@
 
     const PadStrideInfo              conv_info        = node.convolution_info();
     const DepthwiseConvolutionMethod dwc_algorithm    = node.depthwise_convolution_method();
-    const unsigned int               depth_multiplier = 1;
     const ActivationLayerInfo        fused_act        = node.fused_activation();
+    const int                        depth_multiplier = node.depth_multiplier();
 
     // Create and configure function (we assume that functions have been validated before creation)
     std::unique_ptr<IFunction> func;
@@ -204,6 +204,7 @@
                                << " Input shape: " << input->info()->tensor_shape()
                                << " Weights shape: " << weights->info()->tensor_shape()
                                << " Output shape: " << output->info()->tensor_shape()
+                               << " Depth multiplier: " << depth_multiplier
                                << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "")
                                << std::endl);
     return func;
diff --git a/src/graph/backends/GLES/GCNodeValidator.cpp b/src/graph/backends/GLES/GCNodeValidator.cpp
index fe69c7a..f15ede6 100644
--- a/src/graph/backends/GLES/GCNodeValidator.cpp
+++ b/src/graph/backends/GLES/GCNodeValidator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -111,6 +111,8 @@
             return validate_convolution_layer(*polymorphic_downcast<ConvolutionLayerNode *>(node));
         case NodeType::DepthwiseConvolutionLayer:
             return validate_depthwise_convolution_layer(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+        case NodeType::DetectionOutputLayer:
+            return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : DetectionOutputLayer");
         case NodeType::FlattenLayer:
             return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : FlattenLayer");
         case NodeType::GenerateProposalsLayer:
diff --git a/src/graph/backends/NEON/NEDeviceBackend.cpp b/src/graph/backends/NEON/NEDeviceBackend.cpp
index 23ced2f..f94cd97 100644
--- a/src/graph/backends/NEON/NEDeviceBackend.cpp
+++ b/src/graph/backends/NEON/NEDeviceBackend.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/backends/NEON/NEFunctionFactory.cpp b/src/graph/backends/NEON/NEFunctionFactory.cpp
index ca8d485..dc987dd 100644
--- a/src/graph/backends/NEON/NEFunctionFactory.cpp
+++ b/src/graph/backends/NEON/NEFunctionFactory.cpp
@@ -31,6 +31,7 @@
 #include "arm_compute/graph/backends/FunctionHelpers.h"
 #include "arm_compute/graph/backends/Utils.h"
 #include "arm_compute/graph/nodes/Nodes.h"
+#include "arm_compute/runtime/CPP/CPPFunctions.h"
 #include "arm_compute/runtime/NEON/NEFunctions.h"
 #include "support/ToolchainSupport.h"
 
@@ -77,7 +78,7 @@
 
 namespace detail
 {
-// Specialize functions
+// Specialized functions
 template <>
 std::unique_ptr<IFunction> create_convolution_layer<NEConvolutionLayerFunctions, NETargetInfo>(ConvolutionLayerNode &node,
                                                                                                GraphContext &ctx)
@@ -201,6 +202,8 @@
             return detail::create_concatenate_layer<NEConcatenateLayer, NETargetInfo>(*polymorphic_downcast<ConcatenateLayerNode *>(node));
         case NodeType::DepthwiseConvolutionLayer:
             return detail::create_depthwise_convolution_layer<NEDepthwiseConvolutionLayerFunctions, NETargetInfo>(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+        case NodeType::DetectionOutputLayer:
+            return detail::create_detection_output_layer<CPPDetectionOutputLayer, NETargetInfo>(*polymorphic_downcast<DetectionOutputLayerNode *>(node));
         case NodeType::EltwiseLayer:
             return detail::create_eltwise_layer<NEEltwiseFunctions, NETargetInfo>(*polymorphic_downcast<EltwiseLayerNode *>(node));
         case NodeType::FlattenLayer:
diff --git a/src/graph/backends/NEON/NENodeValidator.cpp b/src/graph/backends/NEON/NENodeValidator.cpp
index a2abc83..b0feec5 100644
--- a/src/graph/backends/NEON/NENodeValidator.cpp
+++ b/src/graph/backends/NEON/NENodeValidator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,6 +27,7 @@
 #include "arm_compute/graph/nodes/Nodes.h"
 
 #include "arm_compute/core/utils/misc/Cast.h"
+#include "arm_compute/runtime/CPP/CPPFunctions.h"
 #include "arm_compute/runtime/NEON/NEFunctions.h"
 
 using namespace arm_compute::utils::cast;
@@ -59,6 +60,8 @@
         case NodeType::DepthwiseConvolutionLayer:
             return detail::validate_depthwise_convolution_layer<NEDepthwiseConvolutionLayer,
                    NEDepthwiseConvolutionLayer3x3>(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+        case NodeType::DetectionOutputLayer:
+            return detail::validate_detection_output_layer<CPPDetectionOutputLayer>(*polymorphic_downcast<DetectionOutputLayerNode *>(node));
         case NodeType::GenerateProposalsLayer:
             return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : GenerateProposalsLayer");
         case NodeType::NormalizePlanarYUVLayer:
diff --git a/src/graph/detail/ExecutionHelpers.cpp b/src/graph/detail/ExecutionHelpers.cpp
index f2c381b..767154b 100644
--- a/src/graph/detail/ExecutionHelpers.cpp
+++ b/src/graph/detail/ExecutionHelpers.cpp
@@ -135,6 +135,9 @@
     workload.graph = &g;
     workload.ctx   = &ctx;
 
+    // Reserve memory for tasks
+    workload.tasks.reserve(node_order.size());
+
     // Create tasks
     for(auto &node_id : node_order)
     {
@@ -146,10 +149,7 @@
             std::unique_ptr<IFunction> func            = backend.configure_node(*node, ctx);
             if(func != nullptr)
             {
-                ExecutionTask task;
-                task.task = std::move(func);
-                task.node = node;
-                workload.tasks.push_back(std::move(task));
+                workload.tasks.emplace_back(ExecutionTask(std::move(func), node));
             }
         }
     }
diff --git a/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp b/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp
index 02d1632..75ca5f4 100644
--- a/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp
+++ b/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp
@@ -32,13 +32,18 @@
 {
 namespace graph
 {
-DepthwiseConvolutionLayerNode::DepthwiseConvolutionLayerNode(PadStrideInfo info, DepthwiseConvolutionMethod method)
-    : _info(std::move(info)), _method(method), _fused_activation()
+DepthwiseConvolutionLayerNode::DepthwiseConvolutionLayerNode(PadStrideInfo info, int depth_multiplier, DepthwiseConvolutionMethod method)
+    : _info(std::move(info)), _depth_multiplier(depth_multiplier), _method(method), _fused_activation()
 {
     _input_edges.resize(3, EmptyEdgeID);
     _outputs.resize(1, NullTensorID);
 }
 
+int DepthwiseConvolutionLayerNode::depth_multiplier() const
+{
+    return _depth_multiplier;
+}
+
 void DepthwiseConvolutionLayerNode::set_depthwise_convolution_method(DepthwiseConvolutionMethod method)
 {
     _method = method;
@@ -66,21 +71,24 @@
 
 TensorDescriptor DepthwiseConvolutionLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
                                                                           const TensorDescriptor &weights_descriptor,
-                                                                          const PadStrideInfo    &info)
+                                                                          const PadStrideInfo    &info,
+                                                                          int                     depth_multiplier)
 {
     unsigned int output_width  = 0;
     unsigned int output_height = 0;
 
-    const unsigned int input_width   = get_dimension_size(input_descriptor, DataLayoutDimension::WIDTH);
-    const unsigned int input_height  = get_dimension_size(input_descriptor, DataLayoutDimension::HEIGHT);
-    const unsigned int kernel_width  = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH);
-    const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT);
+    const unsigned int input_width    = get_dimension_size(input_descriptor, DataLayoutDimension::WIDTH);
+    const unsigned int input_height   = get_dimension_size(input_descriptor, DataLayoutDimension::HEIGHT);
+    const unsigned int input_channels = get_dimension_size(input_descriptor, DataLayoutDimension::CHANNEL);
+    const unsigned int kernel_width   = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH);
+    const unsigned int kernel_height  = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT);
 
     std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info);
 
     TensorDescriptor output_descriptor = input_descriptor;
     output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::WIDTH), output_width);
     output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::HEIGHT), output_height);
+    output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::CHANNEL), input_channels * depth_multiplier);
 
     return output_descriptor;
 }
@@ -105,7 +113,7 @@
 
     ARM_COMPUTE_ERROR_ON(src == nullptr || weights == nullptr);
 
-    return compute_output_descriptor(src->desc(), weights->desc(), _info);
+    return compute_output_descriptor(src->desc(), weights->desc(), _info, _depth_multiplier);
 }
 
 NodeType DepthwiseConvolutionLayerNode::type() const
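A worked sketch of the descriptor computation above, assuming compute_output_descriptor is publicly callable as in the other node classes; the shapes and layout are illustrative:

    // Sketch: 16-channel 112x112 NCHW input, 3x3 depthwise kernel, stride 1, pad 1,
    // depth_multiplier 2. Spatial size is preserved and channels become 16 * 2 = 32.
    using namespace arm_compute;
    using namespace arm_compute::graph;

    TensorDescriptor input_desc;
    input_desc.shape  = TensorShape(112U, 112U, 16U); // W, H, C for NCHW
    input_desc.layout = DataLayout::NCHW;

    TensorDescriptor weights_desc = input_desc;
    weights_desc.shape = TensorShape(3U, 3U, 32U);

    const TensorDescriptor out_desc =
        DepthwiseConvolutionLayerNode::compute_output_descriptor(input_desc, weights_desc,
                                                                 PadStrideInfo(1, 1, 1, 1), 2);
    // out_desc.shape == TensorShape(112U, 112U, 32U)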
diff --git a/src/graph/nodes/DetectionOutputLayerNode.cpp b/src/graph/nodes/DetectionOutputLayerNode.cpp
new file mode 100644
index 0000000..c2d9f24
--- /dev/null
+++ b/src/graph/nodes/DetectionOutputLayerNode.cpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/DetectionOutputLayerNode.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+#include "arm_compute/graph/Utils.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+DetectionOutputLayerNode::DetectionOutputLayerNode(DetectionOutputLayerInfo detection_info)
+    : _info(detection_info)
+{
+    _input_edges.resize(3, EmptyEdgeID);
+    _outputs.resize(1, NullTensorID);
+}
+
+DetectionOutputLayerInfo DetectionOutputLayerNode::detection_output_info() const
+{
+    return _info;
+}
+
+TensorDescriptor DetectionOutputLayerNode::compute_output_descriptor(const TensorDescriptor         &input_descriptor,
+                                                                     const DetectionOutputLayerInfo &info)
+{
+    const unsigned int max_size = info.keep_top_k() * ((input_descriptor.shape.num_dimensions() > 1) ? input_descriptor.shape[1] : 1);
+
+    TensorDescriptor output_descriptor = input_descriptor;
+    output_descriptor.shape.set(0, detection_size);
+    output_descriptor.shape.set(1, max_size);
+
+    return output_descriptor;
+}
+
+bool DetectionOutputLayerNode::forward_descriptors()
+{
+    if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (input_id(2) != NullTensorID) && (output_id(0) != NullTensorID))
+    {
+        Tensor *dst = output(0);
+        ARM_COMPUTE_ERROR_ON(dst == nullptr);
+        dst->desc() = configure_output(0);
+        return true;
+    }
+    return false;
+}
+
+TensorDescriptor DetectionOutputLayerNode::configure_output(size_t idx) const
+{
+    ARM_COMPUTE_UNUSED(idx);
+    ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+    const Tensor *input0 = input(0);
+    ARM_COMPUTE_ERROR_ON(input0 == nullptr);
+
+    return compute_output_descriptor(input0->desc(), _info);
+}
+
+NodeType DetectionOutputLayerNode::type() const
+{
+    return NodeType::DetectionOutputLayer;
+}
+
+void DetectionOutputLayerNode::accept(INodeVisitor &v)
+{
+    v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/nodes/GenerateProposalsLayerNode.cpp b/src/graph/nodes/GenerateProposalsLayerNode.cpp
index 7367e80..dabfc5a 100644
--- a/src/graph/nodes/GenerateProposalsLayerNode.cpp
+++ b/src/graph/nodes/GenerateProposalsLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/SliceLayerNode.cpp b/src/graph/nodes/SliceLayerNode.cpp
index 3a29e4c..bfc009d 100644
--- a/src/graph/nodes/SliceLayerNode.cpp
+++ b/src/graph/nodes/SliceLayerNode.cpp
@@ -24,7 +24,7 @@
 #include "arm_compute/graph/nodes/SliceLayerNode.h"
 
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/helpers/tensor_transform.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/graph/Graph.h"
 #include "arm_compute/graph/INodeVisitor.h"
 
@@ -52,16 +52,12 @@
 TensorDescriptor SliceLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
                                                            const Coordinates &starts, const Coordinates &ends)
 {
-    // Get absolute end coordinates
-    const Coordinates ends_abs = arm_compute::helpers::tensor_transform::slice_absolute_end_coords(input_descriptor.shape, ends);
+    using namespace arm_compute::helpers::tensor_transform;
 
-    TensorDescriptor output_descriptor = input_descriptor;
-    for(unsigned int i = 0; i < starts.num_dimensions(); ++i)
-    {
-        output_descriptor.shape.set(i, ends_abs[i] - starts[i]);
-    }
+    TensorDescriptor output_desc = input_descriptor;
+    output_desc.shape            = arm_compute::misc::shape_calculator::compute_slice_shape(input_descriptor.shape, starts, ends);
 
-    return output_descriptor;
+    return output_desc;
 }
 
 bool SliceLayerNode::forward_descriptors()
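A sketch of the shape produced by the shape_calculator call above; the coordinates are illustrative, end values are exclusive and unit strides are implied:

    // Sketch: slicing a (32, 32, 3) shape from (8, 8, 0) to (24, 24, 3).
    using namespace arm_compute;

    const TensorShape in(32U, 32U, 3U);
    const TensorShape out = misc::shape_calculator::compute_slice_shape(in, Coordinates(8, 8, 0), Coordinates(24, 24, 3));
    // out == TensorShape(16U, 16U, 3U)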
diff --git a/src/runtime/Allocator.cpp b/src/runtime/Allocator.cpp
index 7f0e374..d9de11e 100644
--- a/src/runtime/Allocator.cpp
+++ b/src/runtime/Allocator.cpp
@@ -44,6 +44,5 @@
 
 std::unique_ptr<IMemoryRegion> Allocator::make_region(size_t size, size_t alignment)
 {
-    ARM_COMPUTE_UNUSED(alignment);
-    return arm_compute::support::cpp14::make_unique<MemoryRegion>(size);
+    return arm_compute::support::cpp14::make_unique<MemoryRegion>(size, alignment);
 }
\ No newline at end of file
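make_region now forwards the requested alignment instead of discarding it. A sketch, with an arbitrary size and 64-byte alignment:

    // Sketch: request a 1 KiB memory region aligned to a 64-byte boundary.
    arm_compute::Allocator alloc;
    std::unique_ptr<arm_compute::IMemoryRegion> region = alloc.make_region(1024, 64);
    // region->buffer() now satisfies the requested alignment.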
diff --git a/src/runtime/BlobLifetimeManager.cpp b/src/runtime/BlobLifetimeManager.cpp
index 2a4ab6e..c5d42b1 100644
--- a/src/runtime/BlobLifetimeManager.cpp
+++ b/src/runtime/BlobLifetimeManager.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,6 @@
 #include <algorithm>
 #include <cmath>
 #include <map>
-#include <vector>
 
 using namespace arm_compute;
 
@@ -62,19 +61,21 @@
     {
         return ba.max_size > bb.max_size;
     });
-    std::vector<size_t> group_sizes;
+
+    // Create group sizes vector
+    std::vector<BlobInfo> group_sizes;
     std::transform(std::begin(_free_blobs), std::end(_free_blobs), std::back_inserter(group_sizes), [](const Blob & b)
     {
-        return b.max_size;
+        return BlobInfo(b.max_size, b.max_alignment);
     });
 
     // Update blob sizes
     size_t max_size = std::max(_blobs.size(), group_sizes.size());
-    _blobs.resize(max_size, 0);
-    group_sizes.resize(max_size, 0);
-    std::transform(std::begin(_blobs), std::end(_blobs), std::begin(group_sizes), std::begin(_blobs), [](size_t lhs, size_t rhs)
+    _blobs.resize(max_size);
+    group_sizes.resize(max_size);
+    std::transform(std::begin(_blobs), std::end(_blobs), std::begin(group_sizes), std::begin(_blobs), [](BlobInfo lhs, BlobInfo rhs)
     {
-        return std::max(lhs, rhs);
+        return BlobInfo(std::max(lhs.size, rhs.size), std::max(lhs.alignment, rhs.alignment));
     });
 
     // Calculate group mappings
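With alignment tracked per blob, requirements are merged by taking the element-wise maximum of size and alignment, as in the transform above. A sketch using the BlobInfo fields from this patch:

    // Sketch: merging the blob requirements of two memory groups.
    arm_compute::BlobInfo current(2048, 16);  // size, alignment
    arm_compute::BlobInfo incoming(1024, 64);
    arm_compute::BlobInfo merged(std::max(current.size, incoming.size),
                                 std::max(current.alignment, incoming.alignment));
    // merged == { 2048, 64 }: large enough and strictly aligned enough for both groups.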
diff --git a/src/runtime/BlobMemoryPool.cpp b/src/runtime/BlobMemoryPool.cpp
index e09451c..812cbdd 100644
--- a/src/runtime/BlobMemoryPool.cpp
+++ b/src/runtime/BlobMemoryPool.cpp
@@ -33,11 +33,11 @@
 
 using namespace arm_compute;
 
-BlobMemoryPool::BlobMemoryPool(IAllocator *allocator, std::vector<size_t> blob_sizes)
-    : _allocator(allocator), _blobs(), _blob_sizes(std::move(blob_sizes))
+BlobMemoryPool::BlobMemoryPool(IAllocator *allocator, std::vector<BlobInfo> blob_info)
+    : _allocator(allocator), _blobs(), _blob_info(std::move(blob_info))
 {
     ARM_COMPUTE_ERROR_ON(!allocator);
-    allocate_blobs(_blob_sizes);
+    allocate_blobs(_blob_info);
 }
 
 BlobMemoryPool::~BlobMemoryPool()
@@ -73,16 +73,16 @@
 std::unique_ptr<IMemoryPool> BlobMemoryPool::duplicate()
 {
     ARM_COMPUTE_ERROR_ON(!_allocator);
-    return support::cpp14::make_unique<BlobMemoryPool>(_allocator, _blob_sizes);
+    return support::cpp14::make_unique<BlobMemoryPool>(_allocator, _blob_info);
 }
 
-void BlobMemoryPool::allocate_blobs(const std::vector<size_t> &sizes)
+void BlobMemoryPool::allocate_blobs(const std::vector<BlobInfo> &blob_info)
 {
     ARM_COMPUTE_ERROR_ON(!_allocator);
 
-    for(const auto &size : sizes)
+    for(const auto &bi : blob_info)
     {
-        _blobs.push_back(_allocator->make_region(size, 0));
+        _blobs.push_back(_allocator->make_region(bi.size, bi.alignment));
     }
 }
 
diff --git a/src/runtime/CL/CLHelpers.cpp b/src/runtime/CL/CLHelpers.cpp
new file mode 100644
index 0000000..533e6fa
--- /dev/null
+++ b/src/runtime/CL/CLHelpers.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/CLHelpers.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/Error.h"
+
+namespace
+{
+#if defined(ARM_COMPUTE_ASSERTS_ENABLED)
+void printf_callback(const char *buffer, unsigned int len, size_t complete, void *user_data)
+{
+    printf("%.*s", len, buffer);
+}
+#endif /* defined(ARM_COMPUTE_ASSERTS_ENABLED) */
+
+/** Initialise the properties vector with the configuration to be used when creating the OpenCL context
+ *
+ * @param[in]  platform The OpenCL platform used to create the context
+ * @param[in]  device   The OpenCL device to be used to create the context
+ * @param[out] prop     Array of seven context properties populated by this function
+ *
+ * @note In debug builds, this function will enable cl_arm_printf if it's supported.
+ */
+
+void initialise_context_properties(const cl::Platform &platform, const cl::Device &device, cl_context_properties prop[7])
+{
+    ARM_COMPUTE_UNUSED(device);
+#if defined(ARM_COMPUTE_ASSERTS_ENABLED)
+    // Query devices in the context for cl_arm_printf support
+    if(arm_compute::device_supports_extension(device, "cl_arm_printf"))
+    {
+        // Create a cl_context with a printf_callback and user specified buffer size.
+        cl_context_properties properties_printf[] =
+        {
+            CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platform()),
+            // Enable a printf callback function for this context.
+            CL_PRINTF_CALLBACK_ARM, reinterpret_cast<cl_context_properties>(printf_callback),
+            // Request a minimum printf buffer size of 4 kB (0x1000 bytes) for devices
+            // in the context that support this extension.
+            CL_PRINTF_BUFFERSIZE_ARM, 0x1000,
+            0
+        };
+        std::copy_n(properties_printf, 7, prop);
+    }
+    else
+#endif // defined(ARM_COMPUTE_ASSERTS_ENABLED)
+    {
+        cl_context_properties properties[] =
+        {
+            CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platform()),
+            0
+        };
+        std::copy_n(properties, 3, prop);
+    }
+}
+} //namespace
+
+namespace arm_compute
+{
+std::tuple<cl::Context, cl::Device, cl_int>
+create_opencl_context_and_device()
+{
+    ARM_COMPUTE_ERROR_ON(!opencl_is_available());
+    std::vector<cl::Platform> platforms;
+    cl::Platform::get(&platforms);
+    ARM_COMPUTE_ERROR_ON_MSG(platforms.size() == 0, "Couldn't find any OpenCL platform");
+    cl::Platform            p = platforms[0];
+    cl::Device              device;
+    std::vector<cl::Device> platform_devices;
+    p.getDevices(CL_DEVICE_TYPE_DEFAULT, &platform_devices);
+    ARM_COMPUTE_ERROR_ON_MSG(platform_devices.size() == 0, "Couldn't find any OpenCL device");
+    device                              = platform_devices[0];
+    cl_int                err           = CL_SUCCESS;
+    cl_context_properties properties[7] = { 0, 0, 0, 0, 0, 0, 0 };
+    initialise_context_properties(p, device, properties);
+    cl::Context cl_context = cl::Context(device, properties, nullptr, nullptr, &err);
+    ARM_COMPUTE_ERROR_ON_MSG(err != CL_SUCCESS, "Failed to create OpenCL context");
+    return std::make_tuple(cl_context, device, err);
+}
+} // namespace arm_compute
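A sketch of driving the new helper from application code and handing the result to the scheduler; the explicit nullptr tuner selects the default static tuner:

    // Sketch: create the OpenCL context and device once, then initialise the scheduler with them.
    using namespace arm_compute;

    cl::Context ctx;
    cl::Device  dev;
    cl_int      err = CL_SUCCESS;
    std::tie(ctx, dev, err) = create_opencl_context_and_device();
    ARM_COMPUTE_ERROR_ON_MSG(err != CL_SUCCESS, "Failed to create OpenCL context");

    CLScheduler::get().default_init_with_context(dev, ctx, nullptr); // nullptr -> default static tuner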
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
index a311c6f..701ffe0 100644
--- a/src/runtime/CL/CLScheduler.cpp
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,22 +23,14 @@
  */
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
+#include "arm_compute/runtime/CL/CLHelpers.h"
+
 #include "arm_compute/core/CL/ICLKernel.h"
 #include "arm_compute/runtime/CL/CLTuner.h"
 #include "arm_compute/runtime/CL/tuners/Tuners.h"
 
 using namespace arm_compute;
 
-namespace
-{
-#if defined(ARM_COMPUTE_DEBUG_ENABLED)
-void printf_callback(const char *buffer, unsigned int len, size_t complete, void *user_data)
-{
-    printf("%.*s", len, buffer);
-}
-#endif /* defined(ARM_COMPUTE_DEBUG_ENABLED) */
-} // namespace
-
 std::once_flag CLScheduler::_initialize_symbols;
 
 CLScheduler::CLScheduler()
@@ -53,53 +45,30 @@
     return scheduler;
 }
 
+void CLScheduler::default_init_with_context(cl::Device &device, cl::Context &ctx, ICLTuner *cl_tuner)
+{
+    if(!_is_initialised)
+    {
+        cl::CommandQueue queue = cl::CommandQueue(ctx, device);
+        CLKernelLibrary::get().init("./cl_kernels/", ctx, device);
+        init(ctx, queue, device, cl_tuner);
+        _cl_default_static_tuner = tuners::TunerFactory::create_tuner(_target);
+        _cl_tuner                = (cl_tuner == nullptr) ? _cl_default_static_tuner.get() : cl_tuner;
+    }
+}
+
 void CLScheduler::default_init(ICLTuner *cl_tuner)
 {
     if(!_is_initialised)
     {
-        std::vector<cl::Platform> platforms;
-        cl::Platform::get(&platforms);
-        ARM_COMPUTE_ERROR_ON_MSG(platforms.size() == 0, "Couldn't find any OpenCL platform");
-        cl::Platform            p = platforms[0];
-        cl::Context             ctx;
-        cl::Device              device;
-        std::vector<cl::Device> platform_devices;
-        p.getDevices(CL_DEVICE_TYPE_DEFAULT, &platform_devices);
-        ARM_COMPUTE_ERROR_ON_MSG(platform_devices.size() == 0, "Couldn't find any OpenCL device");
-        device = platform_devices[0];
-#if defined(ARM_COMPUTE_DEBUG_ENABLED)
-
-        // Query devices in the context for cl_arm_printf support
-        if(device_supports_extension(device, "cl_arm_printf"))
-        {
-            // Create a cl_context with a printf_callback and user specified buffer size.
-            cl_context_properties properties[] =
-            {
-                CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(p()),
-                // Enable a printf callback function for this context.
-                CL_PRINTF_CALLBACK_ARM, reinterpret_cast<cl_context_properties>(printf_callback),
-                // Request a minimum printf buffer size of 4MB for devices in the
-                // context that support this extension.
-                CL_PRINTF_BUFFERSIZE_ARM, 0x1000,
-                0
-            };
-            ctx = cl::Context(device, properties);
-        }
-        else
-#endif // defined(ARM_COMPUTE_DEBUG_ENABLED)
-        {
-            cl_context_properties properties[] =
-            {
-                CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(p()),
-                0
-            };
-            ctx = cl::Context(device, properties);
-        };
-
-        cl::CommandQueue queue = cl::CommandQueue(ctx, device);
-        CLKernelLibrary::get().init("./cl_kernels/", ctx, device);
-        init(ctx, queue, device, cl_tuner);
-
+        cl::Context ctx;
+        cl::Device  dev;
+        cl_int      err;
+        std::tie(ctx, dev, err) = create_opencl_context_and_device();
+        ARM_COMPUTE_ERROR_ON_MSG(err != CL_SUCCESS, "Failed to create OpenCL context");
+        cl::CommandQueue queue = cl::CommandQueue(ctx, dev);
+        CLKernelLibrary::get().init("./cl_kernels/", ctx, dev);
+        init(ctx, queue, dev, cl_tuner);
         // Create a default static tuner and set if none was provided
         _cl_default_static_tuner = tuners::TunerFactory::create_tuner(_target);
     }
@@ -108,6 +77,21 @@
     _cl_tuner = (cl_tuner == nullptr) ? _cl_default_static_tuner.get() : cl_tuner;
 }
 
+void CLScheduler::set_context(cl::Context context)
+{
+    _context = std::move(context);
+    CLKernelLibrary::get().set_context(_context);
+}
+
+void CLScheduler::init(cl::Context context, cl::CommandQueue queue, const cl::Device &device, ICLTuner *cl_tuner)
+{
+    set_context(std::move(context));
+    _queue          = std::move(queue);
+    _target         = get_target_from_device(device);
+    _is_initialised = true;
+    _cl_tuner       = cl_tuner;
+}
+
 void CLScheduler::enqueue(ICLKernel &kernel, bool flush)
 {
     ARM_COMPUTE_ERROR_ON_MSG(!_is_initialised,
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
index 5f82cd3..a262d6b 100644
--- a/src/runtime/CL/CLTuner.cpp
+++ b/src/runtime/CL/CLTuner.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,10 +33,40 @@
 #include <limits>
 #include <string>
 
-using namespace arm_compute;
+namespace arm_compute
+{
+namespace
+{
+/** Utility function used to initialize the LWS values to test.
+ *  Only LWS values which are a power of 2 or satisfy the modulo condition with the GWS are taken into account by the CLTuner
+ *
+ * @param[in, out] lws         Vector of LWS to test for a specific dimension
+ * @param[in]      gws         Size of the GWS
+ * @param[in]      lws_max     Max LWS value allowed to be tested
+ * @param[in]      mod_let_one True if the remainder of the modulo operation between gws and the lws is allowed to be at most one (otherwise it must be zero).
+ */
+void initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one)
+{
+    lws.push_back(1);
+
+    for(unsigned int i = 2; i <= lws_max; ++i)
+    {
+        // Power of two condition
+        const bool is_power_of_two = (i & (i - 1)) == 0;
+
+        // Modulo condition, according to the mod_let_one flag
+        const bool mod_cond = mod_let_one ? (gws % i) <= 1 : (gws % i) == 0;
+
+        if(mod_cond || is_power_of_two)
+        {
+            lws.push_back(i);
+        }
+    }
+}
+} // namespace
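Editor's note, as a concrete illustration (not part of the patch): for gws = 20, lws_max = 16 and mod_let_one = false the candidates collected above are {1, 2, 4, 5, 8, 10, 16}, i.e. the powers of two up to lws_max plus the divisors of 20 that do not exceed 16; with mod_let_one = true, values leaving a remainder of 1 would also be accepted.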
 
 CLTuner::CLTuner(bool tune_new_kernels)
-    : real_clEnqueueNDRangeKernel(nullptr), _lws_table(), _queue(), _queue_profiler(), _kernel_event(), _tune_new_kernels(tune_new_kernels)
+    : real_clEnqueueNDRangeKernel(nullptr), _lws_table(), _kernel_event(), _tune_new_kernels(tune_new_kernels)
 {
 }
 
@@ -102,32 +132,35 @@
 
 cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel)
 {
+    // Profiling queue
+    cl::CommandQueue queue_profiler;
+
+    // Extract real OpenCL function to intercept
     if(real_clEnqueueNDRangeKernel == nullptr)
     {
         real_clEnqueueNDRangeKernel = CLSymbols::get().clEnqueueNDRangeKernel_ptr;
-
-        // Get the default queue
-        _queue = CLScheduler::get().queue();
-
-        // Check if we can use the OpenCL timer with the default queue
-        cl_command_queue_properties props = _queue.getInfo<CL_QUEUE_PROPERTIES>();
-
-        if((props & CL_QUEUE_PROFILING_ENABLE) == 0)
-        {
-            // Set the queue for profiling
-            _queue_profiler = cl::CommandQueue(CLScheduler::get().context(), props | CL_QUEUE_PROFILING_ENABLE);
-        }
-        else
-        {
-            _queue_profiler = _queue;
-        }
     }
+
+    // Get the default queue
+    cl::CommandQueue default_queue = CLScheduler::get().queue();
+
+    // Check if we can use the OpenCL timer with the default queue
+    cl_command_queue_properties props = default_queue.getInfo<CL_QUEUE_PROPERTIES>();
+
+    if((props & CL_QUEUE_PROFILING_ENABLE) == 0)
+    {
+        // Set the queue for profiling
+        queue_profiler = cl::CommandQueue(CLScheduler::get().context(), props | CL_QUEUE_PROFILING_ENABLE);
+    }
+    else
+    {
+        queue_profiler = default_queue;
+    }
+
     // Start intercepting enqueues:
     auto interceptor = [this](cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list,
                               const cl_event * event_wait_list, cl_event * event)
     {
-        ARM_COMPUTE_ERROR_ON_MSG(event != nullptr, "Not supported");
-        ARM_COMPUTE_UNUSED(event);
         if(this->kernel_event_is_set())
         {
             // If the event is already set it means the kernel enqueue is sliced: given that we only time the first slice we can save time by skipping the other enqueues.
@@ -139,49 +172,45 @@
         // Set OpenCL event
         this->set_cl_kernel_event(tmp);
 
+        if(event != nullptr)
+        {
+            // Return the cl_event from the intercepted call
+            clRetainEvent(tmp);
+            *event = tmp;
+        }
         return retval;
     };
     CLSymbols::get().clEnqueueNDRangeKernel_ptr = interceptor;
 
     cl_ulong min_exec_time = std::numeric_limits<cl_ulong>::max();
 
+    cl::NDRange gws     = ICLKernel::gws_from_window(kernel.window());
     cl::NDRange opt_lws = cl::NullRange;
 
-    const int x_step = std::max(1, kernel.window().x().step());
-    const int y_step = std::max(1, kernel.window().y().step());
-    const int z_step = std::max(1, kernel.window().z().step());
-    const int x_end  = kernel.window().x().end() - kernel.window().x().start() / x_step > 1 ? 16 : 1;
-    const int y_end  = kernel.window().y().end() - kernel.window().y().start() / y_step > 1 ? 16 : 1;
-    const int z_end  = kernel.window().z().end() - kernel.window().z().start() / z_step > 1 ? 8 : 1;
+    const unsigned int lws_x_max = std::min(static_cast<unsigned int>(gws[0]), 64u);
+    const unsigned int lws_y_max = std::min(static_cast<unsigned int>(gws[1]), 32u);
+    const unsigned int lws_z_max = std::min(static_cast<unsigned int>(gws[2]), 32u);
 
-    // First run using the default LWS
+    std::vector<unsigned int> lws_x;
+    std::vector<unsigned int> lws_y;
+    std::vector<unsigned int> lws_z;
+
+    // Initialize the LWS values to test
+    initialize_lws_values(lws_x, gws[0], lws_x_max, gws[2] > 16);
+    initialize_lws_values(lws_y, gws[1], lws_y_max, gws[2] > 16);
+    initialize_lws_values(lws_z, gws[2], lws_z_max, false);
+
+    for(const auto &z : lws_z)
     {
-        cl::NDRange lws_test = cl::NullRange;
-
-        kernel.set_lws_hint(lws_test);
-
-        // Run the kernel
-        kernel.run(kernel.window(), _queue_profiler);
-
-        _queue_profiler.finish();
-
-        const cl_ulong start = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
-        const cl_ulong end   = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
-        const cl_ulong diff  = end - start;
-        _kernel_event        = nullptr;
-
-        min_exec_time = diff;
-    }
-
-    for(int z = 1; z <= z_end; ++z)
-    {
-        for(int y = 1; y <= y_end; ++y)
+        for(const auto &y : lws_y)
         {
-            for(int x = 1; x <= x_end; ++x)
+            for(const auto &x : lws_x)
             {
                 cl::NDRange lws_test = cl::NDRange(x, y, z);
 
-                const bool invalid_lws = (x * y * z > static_cast<int>(kernel.get_max_workgroup_size())) || (x == 1 && y == 1 && z == 1);
+                bool invalid_lws = (x * y * z > kernel.get_max_workgroup_size()) || (x == 1 && y == 1 && z == 1);
+
+                invalid_lws = invalid_lws || (x > gws[0]) || (y > gws[1]) || (z > gws[2]);
 
                 if(invalid_lws)
                 {
@@ -192,9 +221,9 @@
                 kernel.set_lws_hint(lws_test);
 
                 // Run the kernel
-                kernel.run(kernel.window(), _queue_profiler);
+                kernel.run(kernel.window(), queue_profiler);
 
-                _queue_profiler.finish();
+                queue_profiler.finish();
 
                 const cl_ulong start = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
                 const cl_ulong end   = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
@@ -278,3 +307,4 @@
     }
     fs.close();
 }
+} // namespace arm_compute
\ No newline at end of file
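Editor's note: a sketch of how an application could opt into the LWS tuning path exercised by find_optimal_lws(). The file-persistence call is an assumption based on the load/save file I/O visible at the end of this file.

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTuner.h"

    void enable_tuning()
    {
        // Static so the tuner outlives the scheduler initialisation; true = tune kernels with no cached LWS yet
        static arm_compute::CLTuner tuner(true);
        // tuner.load_from_file("acl_tuner.csv"); // assumed API for reloading a previously saved LWS table
        arm_compute::CLScheduler::get().default_init(&tuner);
    }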
diff --git a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
new file mode 100644
index 0000000..a6393c5
--- /dev/null
+++ b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+void CLArgMinMaxLayer::configure(const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLReductionOperationKernel>();
+    k->configure(input, output, axis, op);
+    _kernel = std::move(k);
+}
+
+Status CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid operation");
+    return CLReductionOperationKernel::validate(input, output, axis, op);
+}
+} // namespace arm_compute
\ No newline at end of file
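Editor's note: a usage sketch for the new function. Tensor shapes/types and the initialisation order are illustrative, and CLArgMinMaxLayer is assumed to expose the usual run() of a simple function.

    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h"

    void argmax_example()
    {
        arm_compute::CLTensor         src;     // e.g. an F32 input, TensorInfo initialised beforehand (omitted)
        arm_compute::CLTensor         indices; // ARG_IDX_MAX result along axis 0
        arm_compute::CLArgMinMaxLayer argminmax;

        argminmax.configure(&src, 0, &indices, arm_compute::ReductionOperation::ARG_IDX_MAX);
        // ... allocate and fill the tensors, then:
        argminmax.run();
    }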
diff --git a/src/runtime/CL/functions/CLArithmeticAddition.cpp b/src/runtime/CL/functions/CLArithmeticAddition.cpp
deleted file mode 100644
index 0b05058..0000000
--- a/src/runtime/CL/functions/CLArithmeticAddition.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLArithmeticAddition.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h"
-#include "support/ToolchainSupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLArithmeticAddition::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
-{
-    auto k = arm_compute::support::cpp14::make_unique<CLArithmeticAdditionKernel>();
-    k->configure(input1, input2, output, policy);
-    _kernel = std::move(k);
-
-    if(output->info()->dimension(0) > 1)
-    {
-        ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
-
-        if(broadcasted_info->info()->dimension(0) == 1)
-        {
-            _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
-        }
-    }
-}
-
-Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
-{
-    return CLArithmeticAdditionKernel::validate(input1, input2, output, policy);
-}
diff --git a/src/runtime/CL/functions/CLArithmeticDivision.cpp b/src/runtime/CL/functions/CLArithmeticDivision.cpp
deleted file mode 100644
index 1c2849c..0000000
--- a/src/runtime/CL/functions/CLArithmeticDivision.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLArithmeticDivision.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLArithmeticDivisionKernel.h"
-#include "support/ToolchainSupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLArithmeticDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<CLArithmeticDivisionKernel>();
-    k->configure(input1, input2, output);
-    _kernel = std::move(k);
-
-    if(output->info()->dimension(0) > 1)
-    {
-        ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
-
-        if(broadcasted_info->info()->dimension(0) == 1)
-        {
-            _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
-        }
-    }
-}
-
-Status CLArithmeticDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
-{
-    return CLArithmeticDivisionKernel::validate(input1, input2, output);
-}
diff --git a/src/runtime/CL/functions/CLArithmeticSubtraction.cpp b/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
deleted file mode 100644
index e661f6a..0000000
--- a/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h"
-#include "support/ToolchainSupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLArithmeticSubtraction::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
-{
-    auto k = arm_compute::support::cpp14::make_unique<CLArithmeticSubtractionKernel>();
-    k->configure(input1, input2, output, policy);
-    _kernel = std::move(k);
-
-    if(output->info()->dimension(0) > 1)
-    {
-        ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
-
-        if(broadcasted_info->info()->dimension(0) == 1)
-        {
-            _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
-        }
-    }
-}
-
-Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
-{
-    return CLArithmeticSubtractionKernel::validate(input1, input2, output, policy);
-}
diff --git a/src/runtime/CL/functions/CLCast.cpp b/src/runtime/CL/functions/CLCast.cpp
new file mode 100644
index 0000000..e0ffcdb
--- /dev/null
+++ b/src/runtime/CL/functions/CLCast.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLCast.h"
+
+#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+void CLCast::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLDepthConvertLayerKernel>();
+    k->configure(input, output, policy, 0);
+    _kernel = std::move(k);
+}
+
+Status CLCast::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy)
+{
+    return CLDepthConvertLayerKernel::validate(input, output, policy, 0);
+}
+} // namespace arm_compute
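Editor's note: a minimal sketch of the new CLCast function. The U8 to F32 conversion is only an example; the supported combinations are whatever CLDepthConvertLayerKernel accepts.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLCast.h"

    void cast_example()
    {
        arm_compute::CLTensor u8_src, f32_dst;
        u8_src.allocator()->init(arm_compute::TensorInfo(arm_compute::TensorShape(16U, 16U), 1, arm_compute::DataType::U8));
        f32_dst.allocator()->init(arm_compute::TensorInfo(arm_compute::TensorShape(16U, 16U), 1, arm_compute::DataType::F32));

        arm_compute::CLCast cast;
        cast.configure(&u8_src, &f32_dst, arm_compute::ConvertPolicy::SATURATE);

        u8_src.allocator()->allocate();
        f32_dst.allocator()->allocate();
        // ... fill u8_src, then:
        cast.run();
    }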
diff --git a/src/runtime/CL/functions/CLComparison.cpp b/src/runtime/CL/functions/CLComparison.cpp
new file mode 100644
index 0000000..86c9c31
--- /dev/null
+++ b/src/runtime/CL/functions/CLComparison.cpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLComparison.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLComparisonKernel.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CLComparison::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ComparisonOperation operation)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLComparisonKernel>();
+    k->configure(input1, input2, output, operation);
+    _kernel = std::move(k);
+
+    if(output->info()->dimension(0) > 1)
+    {
+        ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+        if(broadcasted_info->info()->dimension(0) == 1)
+        {
+            _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+        }
+    }
+}
+
+Status CLComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation)
+{
+    return CLComparisonKernel::validate(input1, input2, output, operation);
+}
+
+template <ComparisonOperation COP>
+void CLComparisonStatic<COP>::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLComparisonKernel>();
+    k->configure(input1, input2, output, COP);
+    _kernel = std::move(k);
+
+    if(output->info()->dimension(0) > 1)
+    {
+        ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+        if(broadcasted_info->info()->dimension(0) == 1)
+        {
+            _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+        }
+    }
+}
+
+template <ComparisonOperation COP>
+Status CLComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    return CLComparisonKernel::validate(input1, input2, output, COP);
+}
+
+// Supported Specializations
+template class CLComparisonStatic<ComparisonOperation::Equal>;
+template class CLComparisonStatic<ComparisonOperation::NotEqual>;
+template class CLComparisonStatic<ComparisonOperation::Greater>;
+template class CLComparisonStatic<ComparisonOperation::GreaterEqual>;
+template class CLComparisonStatic<ComparisonOperation::Less>;
+template class CLComparisonStatic<ComparisonOperation::LessEqual>;
+} // namespace arm_compute
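Editor's note: a sketch for the new comparison function. That the output is a U8 mask holding 0/non-zero truth values is an assumption about the comparison kernels, not something stated in this diff.

    #include "arm_compute/core/CL/ICLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLComparison.h"

    // a, b and mask are assumed initialised and allocated by the caller (mask with DataType::U8)
    void compare_example(arm_compute::ICLTensor *a, arm_compute::ICLTensor *b, arm_compute::ICLTensor *mask)
    {
        arm_compute::CLComparison greater;
        greater.configure(a, b, mask, arm_compute::ComparisonOperation::Greater);
        greater.run();
    }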
diff --git a/src/runtime/CL/functions/CLComputeAllAnchors.cpp b/src/runtime/CL/functions/CLComputeAllAnchors.cpp
index 409d3c9..24c152f 100644
--- a/src/runtime/CL/functions/CLComputeAllAnchors.cpp
+++ b/src/runtime/CL/functions/CLComputeAllAnchors.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index e07feb2..9da02c1 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -158,6 +158,18 @@
     _scaled_output.allocator()->allocate();
 }
 
+void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
+                                     const WeightsInfo &weights_info)
+{
+    configure(input, weights, bias, output, info, 0, 0, weights_info);
+}
+
+Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
+                                      const WeightsInfo &weights_info)
+{
+    return CLDeconvolutionLayer::validate(input, weights, bias, output, info, 0, 0, weights_info);
+}
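Editor's note: the two overloads added here simply forward to the existing configure()/validate() with the two inner border arguments fixed to zero, so callers that do not need an inner border can drop those parameters.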
+
 void CLDeconvolutionLayer::run()
 {
     prepare();
diff --git a/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp b/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
index b5e8fd9..e46647a 100644
--- a/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -66,7 +66,7 @@
     for(unsigned int i = 0; i < _num_inputs; i++)
     {
         _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
-        _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
+        _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue());
 
         depth_offset += inputs_vector.at(i)->info()->dimension(2);
     }
diff --git a/src/runtime/CL/functions/CLDepthConvertLayer.cpp b/src/runtime/CL/functions/CLDepthConvertLayer.cpp
index 2e52e8a..dbf71ac 100644
--- a/src/runtime/CL/functions/CLDepthConvertLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthConvertLayer.cpp
@@ -28,8 +28,8 @@
 
 #include <utility>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 void CLDepthConvertLayer::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLDepthConvertLayerKernel>();
@@ -41,3 +41,4 @@
 {
     return CLDepthConvertLayerKernel::validate(input, output, policy, shift);
 }
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index 497cdae..15cbfce 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,18 +26,21 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h"
 #include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h"
+#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "support/ToolchainSupport.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
 using namespace arm_compute::misc;
 using namespace arm_compute::misc::shape_calculator;
 
-CLDepthwiseConvolutionLayer3x3::CLDepthwiseConvolutionLayer3x3()
-    : _kernel(nullptr), _border_handler()
+CLDepthwiseConvolutionLayer3x3::CLDepthwiseConvolutionLayer3x3(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _kernel(nullptr), _border_handler(), _permute_input_to_nchw(), _permute_weights_to_nchw(), _permute_output_to_nhwc(), _reshape_weights(), _permuted_input(),
+      _permuted_weights(), _permuted_output(), _original_weights(nullptr), _needs_permute(false), _needs_weights_reshape(false), _is_prepared(false)
 {
 }
 
@@ -47,25 +50,79 @@
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
 
-    if(input->info()->data_layout() == DataLayout::NCHW)
+    const bool is_nhwc = input->info()->data_layout() == DataLayout::NHWC;
+
+    _needs_permute         = is_nhwc && (depth_multiplier > 1);
+    _needs_weights_reshape = is_nhwc && (depth_multiplier == 1)
+                             && is_data_type_quantized_asymmetric(input->info()->data_type());
+    _is_prepared      = false;
+    _original_weights = weights;
+
+    ICLTensor       *input_to_use   = input;
+    const ICLTensor *weights_to_use = weights;
+    ICLTensor       *output_to_use  = output;
+
+    const bool                      is_stride_1       = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+    const bool                      is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
+    DepthwiseConvolutionReshapeInfo info;
+    info.c0        = 4;
+    info.transpose = is_stride_1 && is_dot8_supported;
+
+    if(_needs_permute)
     {
+        _memory_group.manage(&_permuted_input);
+        _memory_group.manage(&_permuted_output);
+
+        // Configure the function to transform the input tensor from NHWC -> NCHW
+        _permute_input_to_nchw.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+        _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+
+        // Configure the function to transform the weights tensor from HWI -> IHW
+        _permute_weights_to_nchw.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
+        _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
+
+        input_to_use   = &_permuted_input;
+        weights_to_use = &_permuted_weights;
+        output_to_use  = &_permuted_output;
+
         _kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NCHWKernel>();
     }
+    else if(is_nhwc)
+    {
+        if(_needs_weights_reshape)
+        {
+            _reshape_weights.configure(weights, &_permuted_weights, info);
+            weights_to_use = &_permuted_weights;
+        }
+        _kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NHWCKernel>();
+    }
     else
     {
-        _kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NHWCKernel>();
+        _kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NCHWKernel>();
     }
 
+    // Configure kernel
     _kernel->set_target(CLScheduler::get().target());
-    _kernel->configure(input, weights, biases, output, conv_info, depth_multiplier, act_info);
+    _kernel->configure(input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, act_info);
 
+    // Permute output if needed
+    if(_needs_permute)
+    {
+        // Configure the function to transform the convolved output from NCHW back to the user-provided NHWC layout
+        _permuted_output.info()->set_data_layout(DataLayout::NCHW);
+        _permute_output_to_nhwc.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
+
+        // Allocate tensors
+        _permuted_input.allocator()->allocate();
+        _permuted_output.allocator()->allocate();
+    }
     // Configure border handler
     PixelValue &&zero_value(0.f);
     if(is_data_type_quantized_asymmetric(input->info()->data_type()))
     {
         zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().offset));
     }
-    _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, zero_value);
+    _border_handler.configure(input_to_use, _kernel->border_size(), BorderMode::CONSTANT, zero_value);
 }
 
 Status CLDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
@@ -75,23 +132,99 @@
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
 
-    if(input->data_layout() == DataLayout::NCHW)
+    const bool                      is_nhwc               = input->data_layout() == DataLayout::NHWC;
+    const bool                      needs_permute         = is_nhwc && (depth_multiplier > 1);
+    const bool                      needs_weights_reshape = is_nhwc && (depth_multiplier == 1);
+    const bool                      is_stride_1           = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+    const bool                      is_dot8_supported     = dot8_supported(CLKernelLibrary::get().get_device());
+    DepthwiseConvolutionReshapeInfo info;
+    info.c0        = 4;
+    info.transpose = is_stride_1 && is_dot8_supported;
+
+    if(needs_permute)
     {
-        return CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target);
+        TensorShape permuted_input_shape   = input->tensor_shape();
+        TensorShape permuted_weights_shape = weights->tensor_shape();
+        TensorShape permuted_output_shape  = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+
+        permute(permuted_input_shape, PermutationVector(1U, 2U, 0U));
+        permute(permuted_weights_shape, PermutationVector(1U, 2U, 0U));
+        permute(permuted_output_shape, PermutationVector(1U, 2U, 0U));
+
+        const TensorInfo permuted_input   = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NCHW);
+        const TensorInfo permuted_weights = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NCHW);
+        const TensorInfo permuted_output  = output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW);
+
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, conv_info, depth_multiplier, act_info, gpu_target));
+    }
+    else if(is_nhwc)
+    {
+        if(needs_weights_reshape)
+        {
+            auto reshaped_weights_shape = arm_compute::misc::shape_calculator::compute_reshaped_depthwise_weights_shape(*weights, info);
+            ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, &weights->clone()->set_tensor_shape(reshaped_weights_shape), biases, output, conv_info, depth_multiplier,
+                                                                                           act_info));
+        }
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info));
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target));
     }
 
-    return CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info);
+    return Status{};
 }
 
 void CLDepthwiseConvolutionLayer3x3::run()
 {
+    prepare();
+
+    _memory_group.acquire();
+
+    if(_needs_permute)
+    {
+        _permute_input_to_nchw.run();
+    }
     CLScheduler::get().enqueue(_border_handler);
     CLScheduler::get().enqueue(*_kernel);
+
+    if(_needs_permute)
+    {
+        _permute_output_to_nhwc.run();
+    }
+
+    _memory_group.release();
+}
+
+void CLDepthwiseConvolutionLayer3x3::prepare()
+{
+    if(!_is_prepared)
+    {
+        if(_needs_permute)
+        {
+            ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+            _permuted_weights.allocator()->allocate();
+            _permute_weights_to_nchw.run();
+            _original_weights->mark_as_unused();
+        }
+
+        if(_needs_weights_reshape)
+        {
+            ARM_COMPUTE_ERROR_ON(_needs_permute);
+            ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+            _permuted_weights.allocator()->allocate();
+            CLScheduler::get().enqueue(_reshape_weights);
+            _original_weights->mark_as_unused();
+        }
+        _is_prepared = true;
+    }
 }
 
 CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer()
     : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _activationlayer_function(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(),
-      _input_reshaped(), _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_prepared(false), _is_quantized(false), _is_activationlayer_enabled(false), _original_weights(nullptr)
+      _input_reshaped(), _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_prepared(false), _is_quantized(false), _is_activationlayer_enabled(false), _original_weights(nullptr),
+      _optimised_function(nullptr)
 {
 }
 
@@ -104,98 +237,110 @@
 
     const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
     const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
-    const size_t idx_c = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
 
-    const size_t weights_w = weights->info()->dimension(idx_w);
-    const size_t weights_h = weights->info()->dimension(idx_h);
-    const size_t weights_z = weights->info()->dimension(idx_c);
+    const bool can_run_optimised_3x3_kernel = (weights->info()->dimension(idx_w) == 3) && (weights->info()->dimension(idx_h) == 3);
 
-    _is_prepared      = false;
-    _original_weights = weights;
-    _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
-
-    bool            append_bias = (biases != nullptr) && !_is_quantized;
-    const GPUTarget gpu_target  = CLScheduler::get().target();
-
-    // Calculate output shape
-    TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
-
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-
-    // Output width and height
-    const unsigned int conv_w = output_shape[idx_w];
-    const unsigned int conv_h = output_shape[idx_h];
-
-    // Set up intermediate tensors
-    const size_t patch_size = weights_w * weights_h + ((append_bias) ? 1 : 0);
-    const size_t conv_size  = conv_w * conv_h;
-
-    // Im2Col configuration
-    TensorShape shape_im2col = input->info()->tensor_shape();
-    shape_im2col.set(0, patch_size);
-    shape_im2col.set(1, conv_size);
-    shape_im2col.set(2, weights_z);
-    _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
-    _im2col_kernel.set_target(gpu_target);
-    _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
-    CLScheduler::get().tune_kernel_static(_im2col_kernel);
-
-    // Weights reshape configuration
-    const TensorShape shape_weights_reshape(patch_size, weights_z);
-    _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
-    _weights_reshape_kernel.configure(weights, &_weights_reshaped, append_bias ? biases : nullptr);
-
-    // GEMV configuration
-    DataType    v2mm_dt        = (input->info()->data_type() == DataType::QASYMM8) ? DataType::S32 : input->info()->data_type();
-    TensorShape shape_v2mm_out = input->info()->tensor_shape();
-    shape_v2mm_out.set(0, conv_size * weights_z);
-    shape_v2mm_out.set(1, 1);
-    shape_v2mm_out.set(2, 1);
-    _v2mm_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
-    _v2mm_kernel.set_target(gpu_target);
-    _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
-    CLScheduler::get().tune_kernel_static(_v2mm_kernel);
-    _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
-    _vector_to_tensor_kernel.configure(&_v2mm_output, (_is_quantized) ? &_output_reshaped : output, conv_w, conv_h);
-
-    // Output staged configuration
-    if(_is_quantized)
+    if(can_run_optimised_3x3_kernel)
     {
-        const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
-
-        float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
-        int   output_multiplier, output_shift;
-        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
-        _output_stage_kernel.configure(&_output_reshaped, biases, output, output_multiplier, output_shift, output_quant_info.offset);
-        _output_reshaped.allocator()->allocate();
+        auto f = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3>();
+        f->configure(input, weights, biases, output, conv_info, depth_multiplier, act_info);
+        _optimised_function = std::move(f);
     }
-
-    // Fill borders on inputs
-    PixelValue zero_in(static_cast<int32_t>(0));
-    PixelValue zero_w(static_cast<int32_t>(0));
-    if(_is_quantized)
+    else
     {
-        zero_in = PixelValue(static_cast<int32_t>(input->info()->quantization_info().offset));
-        zero_w  = PixelValue(static_cast<int32_t>(weights->info()->quantization_info().offset));
-    }
-    BorderSize border_size = _v2mm_kernel.border_size();
-    _v2mm_input_fill_border.configure(&_input_reshaped, border_size, BorderMode::CONSTANT, zero_in);
+        const size_t idx_c = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
 
-    border_size.bottom = 0;
-    _v2mm_weights_fill_border.configure(&_weights_reshaped, border_size, BorderMode::CONSTANT, zero_w);
+        const size_t weights_w = weights->info()->dimension(idx_w);
+        const size_t weights_h = weights->info()->dimension(idx_h);
+        const size_t weights_z = weights->info()->dimension(idx_c);
 
-    // Allocate intermediate tensors
-    _input_reshaped.allocator()->allocate();
-    _v2mm_output.allocator()->allocate();
+        _is_prepared      = false;
+        _original_weights = weights;
+        _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
 
-    //Configure Activation Layer
-    _is_activationlayer_enabled = act_info.enabled();
+        bool            append_bias = (biases != nullptr) && !_is_quantized;
+        const GPUTarget gpu_target  = CLScheduler::get().target();
 
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.configure(output, nullptr, act_info);
+        // Calculate output shape
+        TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
+
+        // Output auto initialization if not yet initialized
+        auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+
+        // Output width and height
+        const unsigned int conv_w = output_shape[idx_w];
+        const unsigned int conv_h = output_shape[idx_h];
+
+        // Set up intermediate tensors
+        const size_t patch_size = weights_w * weights_h + ((append_bias) ? 1 : 0);
+        const size_t conv_size  = conv_w * conv_h;
+
+        // Im2Col configuration
+        TensorShape shape_im2col = input->info()->tensor_shape();
+        shape_im2col.set(0, patch_size);
+        shape_im2col.set(1, conv_size);
+        shape_im2col.set(2, weights_z);
+        _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
+        _im2col_kernel.set_target(gpu_target);
+        _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
+        CLScheduler::get().tune_kernel_static(_im2col_kernel);
+
+        // Weights reshape configuration
+        const TensorShape shape_weights_reshape(patch_size, weights_z);
+        _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
+        _weights_reshape_kernel.configure(weights, &_weights_reshaped, append_bias ? biases : nullptr);
+
+        // GEMV configuration
+        DataType    v2mm_dt        = (input->info()->data_type() == DataType::QASYMM8) ? DataType::S32 : input->info()->data_type();
+        TensorShape shape_v2mm_out = input->info()->tensor_shape();
+        shape_v2mm_out.set(0, conv_size * weights_z);
+        shape_v2mm_out.set(1, 1);
+        shape_v2mm_out.set(2, 1);
+        _v2mm_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
+        _v2mm_kernel.set_target(gpu_target);
+        _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
+        CLScheduler::get().tune_kernel_static(_v2mm_kernel);
+        _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
+        _vector_to_tensor_kernel.configure(&_v2mm_output, (_is_quantized) ? &_output_reshaped : output, conv_w, conv_h);
+
+        // Output staged configuration
+        if(_is_quantized)
+        {
+            const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+
+            float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
+            int   output_multiplier, output_shift;
+            quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+            _output_stage_kernel.configure(&_output_reshaped, biases, output, output_multiplier, output_shift, output_quant_info.offset);
+            _output_reshaped.allocator()->allocate();
+        }
+
+        // Fill borders on inputs
+        PixelValue zero_in(static_cast<int32_t>(0));
+        PixelValue zero_w(static_cast<int32_t>(0));
+        if(_is_quantized)
+        {
+            zero_in = PixelValue(static_cast<int32_t>(input->info()->quantization_info().offset));
+            zero_w  = PixelValue(static_cast<int32_t>(weights->info()->quantization_info().offset));
+        }
+        BorderSize border_size = _v2mm_kernel.border_size();
+        _v2mm_input_fill_border.configure(&_input_reshaped, border_size, BorderMode::CONSTANT, zero_in);
+
+        border_size.bottom = 0;
+        _v2mm_weights_fill_border.configure(&_weights_reshaped, border_size, BorderMode::CONSTANT, zero_w);
+
+        // Allocate intermediate tensors
+        _input_reshaped.allocator()->allocate();
+        _v2mm_output.allocator()->allocate();
+
+        // Configure Activation Layer
+        _is_activationlayer_enabled = act_info.enabled();
+
+        if(_is_activationlayer_enabled)
+        {
+            _activationlayer_function.configure(output, nullptr, act_info);
+        }
     }
 }
 
@@ -204,55 +349,64 @@
 {
     const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
     const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
-    const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
 
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(idx_c) * depth_multiplier) != weights->dimension(idx_c));
+    const bool can_run_optimised_3x3_kernel = (weights->dimension(idx_w) == 3) && (weights->dimension(idx_h) == 3);
 
-    const bool         is_quantized = is_data_type_quantized_asymmetric(input->data_type());
-    const bool         append_bias  = (biases != nullptr) && !is_quantized;
-    const TensorShape  output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
-    const size_t       weights_w    = weights->dimension(idx_w);
-    const size_t       weights_h    = weights->dimension(idx_h);
-    const size_t       weights_z    = weights->dimension(idx_c);
-    const unsigned int conv_w       = output_shape[idx_w];
-    const unsigned int conv_h       = output_shape[idx_h];
-    const size_t       patch_size   = weights_w * weights_h + ((append_bias) ? 1 : 0);
-    const size_t       conv_size    = conv_w * conv_h;
-
-    TensorShape shape_im2col = input->tensor_shape();
-    shape_im2col.set(0, patch_size);
-    shape_im2col.set(1, conv_size);
-    shape_im2col.set(2, weights_z);
-    TensorInfo input_reshaped(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseIm2ColKernel::validate(input, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier));
-
-    const TensorShape shape_weights_reshape(patch_size, weights_z);
-    TensorInfo        weights_reshaped(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseWeightsReshapeKernel::validate(weights, &weights_reshaped, append_bias ? biases : nullptr));
-
-    DataType    v2mm_dt        = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
-    TensorShape shape_v2mm_out = input->tensor_shape();
-    shape_v2mm_out.set(0, conv_size * weights_z);
-    shape_v2mm_out.set(1, 1);
-    shape_v2mm_out.set(2, 1);
-    TensorInfo v2mm_output(input->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixVectorMultiplyKernel::validate(&input_reshaped, &weights_reshaped, &v2mm_output));
-
-    TensorInfo output_reshaped(v2mm_output.clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseVectorToTensorKernel::validate(&v2mm_output, (is_quantized) ? &output_reshaped : output, conv_w, conv_h));
-
-    if(is_quantized)
+    if(can_run_optimised_3x3_kernel)
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayerOutputStageKernel::validate(&output_reshaped, biases, output));
-    }
+        const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
 
-    // Validate Activation Layer
-    if(act_info.enabled())
+        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+        ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(idx_c) * depth_multiplier) != weights->dimension(idx_c));
+
+        const bool         is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+        const bool         append_bias  = (biases != nullptr) && !is_quantized;
+        const TensorShape  output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+        const size_t       weights_w    = weights->dimension(idx_w);
+        const size_t       weights_h    = weights->dimension(idx_h);
+        const size_t       weights_z    = weights->dimension(idx_c);
+        const unsigned int conv_w       = output_shape[idx_w];
+        const unsigned int conv_h       = output_shape[idx_h];
+        const size_t       patch_size   = weights_w * weights_h + ((append_bias) ? 1 : 0);
+        const size_t       conv_size    = conv_w * conv_h;
+
+        TensorShape shape_im2col = input->tensor_shape();
+        shape_im2col.set(0, patch_size);
+        shape_im2col.set(1, conv_size);
+        shape_im2col.set(2, weights_z);
+        TensorInfo input_reshaped(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseIm2ColKernel::validate(input, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier));
+
+        const TensorShape shape_weights_reshape(patch_size, weights_z);
+        TensorInfo        weights_reshaped(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel::validate(weights, &weights_reshaped, append_bias ? biases : nullptr));
+
+        DataType    v2mm_dt        = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
+        TensorShape shape_v2mm_out = input->tensor_shape();
+        shape_v2mm_out.set(0, conv_size * weights_z);
+        shape_v2mm_out.set(1, 1);
+        shape_v2mm_out.set(2, 1);
+        TensorInfo v2mm_output(input->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixVectorMultiplyKernel::validate(&input_reshaped, &weights_reshaped, &v2mm_output));
+
+        TensorInfo output_reshaped(v2mm_output.clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseVectorToTensorKernel::validate(&v2mm_output, (is_quantized) ? &output_reshaped : output, conv_w, conv_h));
+
+        if(is_quantized)
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayerOutputStageKernel::validate(&output_reshaped, biases, output));
+        }
+
+        // Validate Activation Layer
+        if(act_info.enabled())
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
+        }
+    }
+    else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info));
     }
-
     return Status{};
 }
 
@@ -260,33 +414,48 @@
 {
     prepare();
 
-    CLScheduler::get().enqueue(_im2col_kernel);
-    CLScheduler::get().enqueue(_v2mm_input_fill_border);
-    CLScheduler::get().enqueue(_v2mm_kernel);
-    CLScheduler::get().enqueue(_vector_to_tensor_kernel);
-    if(_is_quantized)
+    if(_optimised_function != nullptr)
     {
-        CLScheduler::get().enqueue(_output_stage_kernel);
+        _optimised_function->run();
     }
-    if(_is_activationlayer_enabled)
+    else
     {
-        _activationlayer_function.run();
+        CLScheduler::get().enqueue(_im2col_kernel);
+        CLScheduler::get().enqueue(_v2mm_input_fill_border);
+        CLScheduler::get().enqueue(_v2mm_kernel);
+        CLScheduler::get().enqueue(_vector_to_tensor_kernel);
+        if(_is_quantized)
+        {
+            CLScheduler::get().enqueue(_output_stage_kernel);
+        }
+        if(_is_activationlayer_enabled)
+        {
+            _activationlayer_function.run();
+        }
     }
 }
 
 void CLDepthwiseConvolutionLayer::prepare()
 {
-    if(!_is_prepared)
+    if(_optimised_function != nullptr)
     {
-        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+        _optimised_function->prepare();
+    }
+    else
+    {
+        if(!_is_prepared)
+        {
+            ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
 
-        // Run weights reshaping and mark original weights tensor as unused
-        _weights_reshaped.allocator()->allocate();
-        CLScheduler::get().enqueue(_weights_reshape_kernel);
-        CLScheduler::get().enqueue(_v2mm_weights_fill_border);
-        _original_weights->mark_as_unused();
+            // Run weights reshaping and mark original weights tensor as unused
+            _weights_reshaped.allocator()->allocate();
+            CLScheduler::get().enqueue(_weights_reshape_kernel);
+            CLScheduler::get().enqueue(_v2mm_weights_fill_border);
+            _original_weights->mark_as_unused();
 
-        CLScheduler::get().queue().finish();
-        _is_prepared = true;
+            CLScheduler::get().queue().finish();
+            _is_prepared = true;
+        }
     }
 }
+} // namespace arm_compute
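
The generic (non-3x3) validate() path above mirrors the configure() pipeline: the input is rearranged into an im2col buffer of shape { patch_size, conv_size, channels }, the weights into { patch_size, channels }, the two are multiplied by CLGEMMMatrixVectorMultiplyKernel, and the flat result is folded back into the output tensor (with an extra output stage for quantized data), while run()/prepare() simply delegate to the optimised 3x3 function when it was selected. The following standalone sketch works through that shape bookkeeping for an assumed 3x3 kernel on a 16x16 output; it is an illustration, not library code.

// Sketch of the im2col/v2mm shape bookkeeping used in the generic depthwise
// validate() path above. Standalone illustration only; not library code.
#include <cstddef>
#include <iostream>

int main()
{
    const std::size_t weights_w = 3, weights_h = 3, channels = 8; // assumed 3x3 kernel, 8 channels
    const std::size_t conv_w = 16, conv_h = 16;                   // assumed output spatial size
    const bool        append_bias = true;                         // bias present, non-quantized case

    const std::size_t patch_size = weights_w * weights_h + (append_bias ? 1 : 0);
    const std::size_t conv_size  = conv_w * conv_h;

    // im2col output: one column per output position, one plane per channel
    std::cout << "im2col shape:  " << patch_size << " x " << conv_size << " x " << channels << "\n";
    // reshaped weights: one column of weights (plus the optional bias) per channel
    std::cout << "weights shape: " << patch_size << " x " << channels << "\n";
    // GEMV result: a flat vector that CLDepthwiseVectorToTensorKernel folds back to conv_w x conv_h
    std::cout << "v2mm shape:    " << conv_size * channels << " x 1 x 1\n";
    return 0;
}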
diff --git a/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp b/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp
new file mode 100644
index 0000000..b7e9a68
--- /dev/null
+++ b/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+void CLRsqrtLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
+    k->configure(input, output, ElementWiseUnary::RSQRT);
+    _kernel = std::move(k);
+}
+Status CLRsqrtLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::RSQRT);
+}
+
+void CLExpLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
+    k->configure(input, output, ElementWiseUnary::EXP);
+    _kernel = std::move(k);
+}
+Status CLExpLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::EXP);
+}
+} // namespace arm_compute
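
CLRsqrtLayer and CLExpLayer are thin ICLSimpleFunction wrappers that just configure a CLElementWiseUnaryLayerKernel with the corresponding ElementWiseUnary operation. A minimal usage sketch follows; the tensor shape, data type and initialisation boilerplate are assumptions made for illustration.

// Hedged usage sketch for the new unary functions; shapes and setup are assumptions.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init(); // create context/queue for the default device

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::F32));

    CLRsqrtLayer rsqrt;
    rsqrt.configure(&src, &dst); // validate() can be called first with the ITensorInfo objects

    src.allocator()->allocate();
    dst.allocator()->allocate();

    // ... fill src ...
    rsqrt.run();
    CLScheduler::get().sync();
    return 0;
}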
diff --git a/src/runtime/CL/functions/CLElementwiseOperations.cpp b/src/runtime/CL/functions/CLElementwiseOperations.cpp
new file mode 100644
index 0000000..28f4b13
--- /dev/null
+++ b/src/runtime/CL/functions/CLElementwiseOperations.cpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
+#include "support/ToolchainSupport.h"
+#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+namespace
+{
+void configure_border_handler(CLFillBorderKernel &border_handler, BorderSize border_size, ICLTensor *input1, ICLTensor *input2, const ICLTensor *output)
+{
+    if(output->info()->dimension(0) > 1)
+    {
+        ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+        if(broadcasted_info->info()->dimension(0) == 1)
+        {
+            border_handler.configure(broadcasted_info, border_size, BorderMode::REPLICATE);
+        }
+    }
+}
+} // namespace
+
+void CLArithmeticAddition::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLSaturatedArithmeticOperationKernel>();
+    k->configure(ArithmeticOperation::ADD, input1, input2, output, policy);
+    _kernel = std::move(k);
+    configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+{
+    return CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, input1, input2, output, policy);
+}
+
+void CLArithmeticSubtraction::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLSaturatedArithmeticOperationKernel>();
+    k->configure(ArithmeticOperation::SUB, input1, input2, output, policy);
+    _kernel = std::move(k);
+    configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+{
+    ARM_COMPUTE_UNUSED(policy);
+    return CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::SUB, input1, input2, output, policy);
+}
+
+void CLArithmeticDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
+    k->configure(ArithmeticOperation::DIV, input1, input2, output);
+    _kernel = std::move(k);
+    configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLArithmeticDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    return CLArithmeticOperationKernel::validate(ArithmeticOperation::DIV, input1, input2, output);
+}
+
+void CLElementwiseMax::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
+    k->configure(ArithmeticOperation::MAX, input1, input2, output);
+    _kernel = std::move(k);
+    configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    return CLArithmeticOperationKernel::validate(ArithmeticOperation::MAX, input1, input2, output);
+}
+
+void CLElementwiseMin::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
+    k->configure(ArithmeticOperation::MIN, input1, input2, output);
+    _kernel = std::move(k);
+    configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    return CLArithmeticOperationKernel::validate(ArithmeticOperation::MIN, input1, input2, output);
+}
+
+void CLElementwiseSquaredDiff::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
+    k->configure(ArithmeticOperation::SQUARED_DIFF, input1, input2, output);
+    _kernel = std::move(k);
+    configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    return CLArithmeticOperationKernel::validate(ArithmeticOperation::SQUARED_DIFF, input1, input2, output);
+}
+} // namespace arm_compute
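
configure_border_handler() above only attaches a border to an operand when the output is wider than one element and that operand has an x-dimension of 1, i.e. when it is broadcast along x. A standalone restatement of that decision follows; it is illustrative only, the real helper operates on ICLTensor objects and configures a CLFillBorderKernel with BorderMode::REPLICATE.

// Standalone sketch of the broadcast check done by configure_border_handler() above.
// Returns which operand (0 or 1) needs a REPLICATE border, or -1 if none does.
#include <cstddef>

int broadcast_operand_needing_border(std::size_t in0_x, std::size_t in1_x, std::size_t out_x)
{
    if(out_x <= 1)
    {
        return -1; // nothing is broadcast along x
    }
    // The width-1 operand is the broadcast one; the library attaches a REPLICATE border to it.
    if(in0_x == 1)
    {
        return 0;
    }
    if(in1_x == 1)
    {
        return 1;
    }
    return -1;
}

int main()
{
    // Example: operand 0 has width 1 and is broadcast against a width-64 operand/output.
    return broadcast_operand_needing_border(1, 64, 64) == 0 ? 0 : 1;
}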
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index baa0cf4..e91038f 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,32 +33,42 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfiguration.h"
 #include "arm_compute/runtime/ITensorAllocator.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
 using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::cl_gemm;
 
 namespace
 {
-inline bool is_interleaved_transposed(int m, int n, int k, DataType data_type, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
+inline bool is_interleaved_transposed(unsigned int m, unsigned int n, unsigned int k, DataType data_type, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
 {
     bool flag = true;
 
     if(gpu_target_is_in(gpu_target, GPUTarget::G52, GPUTarget::G52LIT, GPUTarget::G71, GPUTarget::G72, GPUTarget::G76))
     {
-        // COMPMID-852
-        if(k > 256 && m > 4 && is_data_type_float(data_type) && reshape_b_only_on_first_run)
+        if((m > 1) && n < 16)
         {
-            constexpr float alpha = 3.2f;
-            constexpr float fact0 = 1.51f;
-            constexpr float fact1 = 1.66f;
-            constexpr float ops   = 12.0f;
-            const float     scale = k > 1024 ? 1.07f : 1.0f;
-            flag                  = alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops);
+            flag = true;
         }
         else
         {
-            flag = false;
+            // COMPMID-852
+            if(k > 256 && m > 4 && is_data_type_float(data_type) && reshape_b_only_on_first_run)
+            {
+                constexpr float alpha = 3.2f;
+                constexpr float fact0 = 1.51f;
+                constexpr float fact1 = 1.66f;
+                constexpr float ops   = 12.0f;
+                const float     scale = k > 1024 ? 1.07f : 1.0f;
+                flag                  = alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops);
+            }
+            else
+            {
+                flag = false;
+            }
         }
     }
     else
@@ -73,17 +83,19 @@
 
 CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)),
-      _interleave_kernel(),
-      _transpose_kernel(),
       _mm_kernel(),
       _ma_kernel(),
+      _reshape_lhs_kernel(),
+      _reshape_rhs_kernel(),
+      _mm_reshaped_kernel(),
       _tmp_a(),
       _tmp_b(),
       _original_b(nullptr),
       _is_interleaved_transposed(false),
       _run_addition(false),
       _reshape_b_only_on_first_run(false),
-      _is_prepared(false)
+      _is_prepared(false),
+      _is_new_gemm_reshaped(false)
 {
 }
 
@@ -106,29 +118,52 @@
     const GPUTarget gpu_target = CLScheduler::get().target();
 
     // Set the target for the kernels
-    _interleave_kernel.set_target(gpu_target);
+    _reshape_lhs_kernel.set_target(gpu_target);
     _mm_kernel.set_target(gpu_target);
 
     // Arguments used by GEMMReshapeInfo
     // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
     // in order to know how the matrices have been reshaped
-    bool      reinterpret_input_as_3d   = gemm_info.reinterpret_input_as_3d();
-    const int m                         = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
-    const int n                         = b->info()->dimension(0);
-    const int k                         = a->info()->dimension(0);
-    const int depth_output_gemm3d       = gemm_info.depth_output_gemm3d();
-    int       mult_transpose1xW_width   = 1;
-    int       mult_interleave4x4_height = 1;
+    DataType           data_type                 = a->info()->data_type();
+    bool               reinterpret_input_as_3d   = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m                         = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
+    const unsigned int n                         = b->info()->dimension(0);
+    const unsigned int k                         = a->info()->dimension(0);
+    const unsigned int batch_size                = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
+    const int          depth_output_gemm3d       = gemm_info.depth_output_gemm3d();
+    int                mult_transpose1xW_width   = 1;
+    int                mult_interleave4x4_height = 1;
 
     if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
     {
         mult_transpose1xW_width   = 4;
         mult_interleave4x4_height = 2;
     }
+    GEMMRHSMatrixInfo rhs_info;
+    rhs_info.n0         = 16 / b->info()->element_size();
+    rhs_info.k0         = 1;
+    rhs_info.h0         = mult_transpose1xW_width;
+    rhs_info.interleave = false;
+    rhs_info.transpose  = false;
+
+    GEMMLHSMatrixInfo lhs_info;
+    lhs_info.m0         = 4;
+    lhs_info.k0         = 4;
+    lhs_info.v0         = mult_interleave4x4_height;
+    lhs_info.interleave = true;
+    lhs_info.transpose  = true;
 
     // Check if we need to reshape the matrix A and matrix B
     _is_interleaved_transposed = is_interleaved_transposed(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run, gpu_target);
 
+    // Check if we can run the new reshaped GEMM
+    const auto workload   = static_cast<float>((m * n) / 20.0f);
+    _is_new_gemm_reshaped = (workload > 1600.0f) && (get_arch_from_target(gpu_target) == GPUTarget::BIFROST) && _is_interleaved_transposed && (data_type == DataType::F32);
+
+    const bool add_matrix_c  = (beta != 0.f && c != nullptr);
+    const bool is_beta_one   = std::abs(1.0f - beta) < 0.00001f;
+    const bool use_fused_add = is_beta_one && (c != nullptr && c->info()->num_dimensions() == 1) && !_is_new_gemm_reshaped;
+
     // if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
     if(_is_interleaved_transposed)
     {
@@ -145,19 +180,37 @@
         }
         // _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
 
-        // Configure interleave kernel
-        _interleave_kernel.configure(a, &_tmp_a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d());
+        if(_is_new_gemm_reshaped)
+        {
+            GEMMLHSMatrixInfo lhs_info;
 
-        // Configure transpose kernel
-        _transpose_kernel.configure(b, &_tmp_b, mult_transpose1xW_width);
+            // Pick up the GEMM configuration
+            std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, data_type);
+
+            _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
+            _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+
+            // Configure and tune matrix multiply kernel
+            _mm_reshaped_kernel.configure(matrix_a, matrix_b, output, alpha, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1,
+                                                                                                                 depth_output_gemm3d, reinterpret_input_as_3d));
+        }
+        else
+        {
+            // Configure interleave kernel
+            _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
+            // Configure transpose kernel
+            _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+        }
     }
 
-    // Configure and tune matrix multiply kernel
-    _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k,
-                                                                                                        mult_transpose1xW_width, mult_interleave4x4_height,
-                                                                                                        depth_output_gemm3d, reinterpret_input_as_3d),
-                         gemm_info.fp_mixed_precision());
-    CLScheduler::get().tune_kernel_static(_mm_kernel);
+    if(!_is_new_gemm_reshaped)
+    {
+        // Configure and tune matrix multiply kernel
+        _mm_kernel.configure(matrix_a, matrix_b, (add_matrix_c && !use_fused_add) ? nullptr : c, output, alpha, beta, _is_interleaved_transposed,
+                             GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, reinterpret_input_as_3d),
+                             gemm_info.fp_mixed_precision());
+        CLScheduler::get().tune_kernel_static(_mm_kernel);
+    }
 
     if(_is_interleaved_transposed)
     {
@@ -170,7 +223,7 @@
     }
 
     // Configure matrix addition kernel
-    if(beta != 0 && c != nullptr)
+    if(add_matrix_c && !use_fused_add)
     {
         _ma_kernel.configure(c, output, beta);
         _run_addition = true;
@@ -197,13 +250,15 @@
     // Arguments used by GEMMReshapeInfo
     // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
     // in order to know how the matrices have been reshaped
-    bool      reinterpret_input_as_3d   = gemm_info.reinterpret_input_as_3d();
-    const int m                         = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
-    const int n                         = b->dimension(0);
-    const int k                         = a->dimension(0);
-    int       mult_transpose1xW_width   = 1;
-    int       mult_interleave4x4_height = 1;
-    const int depth_output_gemm3d       = gemm_info.depth_output_gemm3d();
+    DataType           data_type                 = a->data_type();
+    bool               reinterpret_input_as_3d   = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m                         = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n                         = b->dimension(0);
+    const unsigned int k                         = a->dimension(0);
+    const unsigned int batch_size                = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+    int                mult_transpose1xW_width   = 1;
+    int                mult_interleave4x4_height = 1;
+    const int          depth_output_gemm3d       = gemm_info.depth_output_gemm3d();
 
     if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
     {
@@ -211,9 +266,31 @@
         mult_interleave4x4_height = 2;
     }
 
+    GEMMRHSMatrixInfo rhs_info;
+    rhs_info.n0         = 16 / b->element_size();
+    rhs_info.k0         = 1;
+    rhs_info.h0         = mult_transpose1xW_width;
+    rhs_info.interleave = false;
+    rhs_info.transpose  = false;
+
+    GEMMLHSMatrixInfo lhs_info;
+    lhs_info.m0         = 4;
+    lhs_info.k0         = 4;
+    lhs_info.v0         = mult_interleave4x4_height;
+    lhs_info.interleave = true;
+    lhs_info.transpose  = true;
+
     // Check if we need to reshape the matrix A and matrix B
     const bool run_interleave_transpose = is_interleaved_transposed(m, n, k, a->data_type(), reshape_b_only_on_first_run, gpu_target);
 
+    // Check if we can run the new reshaped GEMM
+    const auto workload             = static_cast<float>((m * n) / 20.0f);
+    const bool is_new_gemm_reshaped = (workload > 1600.f) && (get_arch_from_target(gpu_target) == GPUTarget::BIFROST) && run_interleave_transpose && (data_type == DataType::F32);
+
+    const bool add_matrix_c  = (beta != 0.f && c != nullptr);
+    const bool is_beta_one   = std::abs(1.0f - beta) < 0.00001f;
+    const bool use_fused_add = is_beta_one && (c != nullptr && c->num_dimensions() == 1) && !is_new_gemm_reshaped;
+
     // if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
     if(run_interleave_transpose)
     {
@@ -227,19 +304,42 @@
         matrix_a_info = &tmp_a_info;
         matrix_b_info = &tmp_b_info;
 
-        // Validate interleave kernel
-        auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d())));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &tmp_a_info, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d()));
+        if(is_new_gemm_reshaped)
+        {
+            GEMMLHSMatrixInfo lhs_info;
 
-        // Validate transpose kernel
-        auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width)));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, &tmp_b_info, mult_transpose1xW_width));
+            // Pick up the GEMM configuration
+            std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, data_type);
+
+            auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
+
+            auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
+
+            // Validate matrix multiply
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedKernel::validate(matrix_a_info, matrix_b_info, output, alpha, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1,
+                                                                                     depth_output_gemm3d, reinterpret_input_as_3d)));
+        }
+        else
+        {
+            // Validate interleave kernel
+            auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
+            // Validate transpose kernel
+            auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
+        }
     }
 
-    // Validate matrix multiply
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, alpha, run_interleave_transpose, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
+    if(!is_new_gemm_reshaped)
+    {
+        // Validate matrix multiply
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, (add_matrix_c && !use_fused_add) ? nullptr : c, output, alpha, beta,
+                                                                         run_interleave_transpose, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
+    }
 
-    if(beta != 0 && c != nullptr)
+    if(add_matrix_c && !use_fused_add)
     {
         // Validate matrix addition kernel
         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
@@ -257,17 +357,24 @@
     if(_is_interleaved_transposed)
     {
         // Run interleave kernel
-        CLScheduler::get().enqueue(_interleave_kernel, false);
+        CLScheduler::get().enqueue(_reshape_lhs_kernel, false);
 
         if(!_reshape_b_only_on_first_run)
         {
             // Run transpose kernel
-            CLScheduler::get().enqueue(_transpose_kernel, false);
+            CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
         }
     }
 
     // Run matrix multiply kernel
-    CLScheduler::get().enqueue(_mm_kernel, !_run_addition);
+    if(_is_new_gemm_reshaped)
+    {
+        CLScheduler::get().enqueue(_mm_reshaped_kernel, !_run_addition);
+    }
+    else
+    {
+        CLScheduler::get().enqueue(_mm_kernel, !_run_addition);
+    }
 
     // Run matrix addition kernel
     if(_run_addition)
@@ -286,10 +393,11 @@
         {
             // Run transpose kernel and mark original weights tensor as unused
             _tmp_b.allocator()->allocate();
-            CLScheduler::get().enqueue(_transpose_kernel, false);
+            CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
             _original_b->mark_as_unused();
         }
         CLScheduler::get().queue().finish();
         _is_prepared = true;
     }
 }
+} // namespace arm_compute
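
With this change CLGEMM chooses between three paths: the unreshaped CLGEMMMatrixMultiplyKernel, the legacy interleave/transpose reshape, and the new reshaped-LHS/RHS kernel pair, which is only used for sufficiently large F32 workloads on Bifrost. The sketch below restates the selection logic with the thresholds copied from the code above, assuming floating-point data on a G52/G71/G72/G76-class target; it is a standalone illustration (on other targets reshaping simply follows m > 1).

// Standalone sketch of the CLGEMM path selection above. Mirrors the thresholds
// in the diff; illustrative only.
#include <cstdio>

enum class Arch
{
    Midgard,
    Bifrost
};

// Simplified stand-in for is_interleaved_transposed() on G52/G71/G72/G76-class GPUs.
bool reshape_a_and_b(unsigned int m, unsigned int n, unsigned int k, bool reshape_b_only_on_first_run)
{
    if(m > 1 && n < 16)
    {
        return true;
    }
    if(k > 256 && m > 4 && reshape_b_only_on_first_run) // COMPMID-852 cost model (float data)
    {
        const float alpha = 3.2f, fact0 = 1.51f, fact1 = 1.66f, ops = 12.0f;
        const float scale = (k > 1024) ? 1.07f : 1.0f;
        return alpha + (n * fact0) / ops < (fact1 * n * scale) / ops;
    }
    return false;
}

// The new reshaped kernel is used only for big-enough F32 workloads on Bifrost.
bool use_new_reshaped_gemm(unsigned int m, unsigned int n, bool reshaped, Arch arch, bool is_f32)
{
    const float workload = (m * n) / 20.0f;
    return workload > 1600.0f && arch == Arch::Bifrost && reshaped && is_f32;
}

int main()
{
    const unsigned int m = 256, n = 256, k = 2048;
    const bool reshaped = reshape_a_and_b(m, n, k, /*reshape_b_only_on_first_run=*/true);
    std::printf("reshape A/B: %d, new reshaped kernel: %d\n",
                reshaped, use_new_reshaped_gemm(m, n, reshaped, Arch::Bifrost, /*is_f32=*/true));
    return 0;
}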
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index 4694aa7..7105e85 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -93,7 +93,7 @@
 CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _col2im_kernel(), _activationlayer_function(), _add_bias_kernel(),
       _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _data_layout(DataLayout::NCHW), _append_bias(false), _skip_im2col(false), _skip_col2im(false), _is_quantized(false),
-      _is_activationlayer_enabled(false), _is_prepared(false)
+      _is_activationlayer_enabled(false), _is_prepared(false), _run_addition(true)
 {
 }
 
@@ -101,7 +101,8 @@
                                           int gemm_3d_depth)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), gemmlowp_output_stage, gemm_3d_depth, _skip_im2col));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), gemmlowp_output_stage, gemm_3d_depth, _skip_im2col,
+                                           _run_addition));
 
     const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
                                          gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
@@ -125,13 +126,15 @@
     }
     else
     {
+        // Bias does not need to be added in GEMM if im2col is being used or the Matrix Addition kernel needs to be run
+        const bool skip_bias_in_gemm = _run_addition || !_skip_im2col;
         // Configure matrix multiply function
-        _mm_gemm.configure(input, weights, nullptr, output, 1.0f, 0.0f, gemm_info);
+        _mm_gemm.configure(input, weights, (skip_bias_in_gemm) ? nullptr : biases, output, 1.0f, 1.0f, gemm_info);
     }
 }
 
 Status CLGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
-                                           const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col)
+                                           const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col, bool run_addition)
 {
     const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
 
@@ -156,8 +159,10 @@
     }
     else
     {
+        // Bias does not need to be added in GEMM if im2col is being used or the Matrix Addition kernel needs to be run
+        const bool skip_bias_in_gemm = run_addition || !skip_im2col;
         // Perform validation step on Matrix multiply function
-        return CLGEMM::validate(input, weights, nullptr, output, 1.0f, 0.0f, gemm_info);
+        return CLGEMM::validate(input, weights, (skip_bias_in_gemm) ? nullptr : biases, output, 1.0f, 1.0f, gemm_info);
     }
 }
 
@@ -193,6 +198,8 @@
     _skip_col2im                = data_layout == DataLayout::NHWC;
     _append_bias                = (biases != nullptr) && (!_is_quantized);
     _is_activationlayer_enabled = act_info.enabled();
+    // In case of F16, fused bias will be used in GEMM
+    _run_addition = (_skip_im2col) && (_append_bias) && (data_type != DataType::F16);
 
     // Set the GPU target for im2col and col2im
     _im2col_kernel.set_target(CLScheduler::get().target());
@@ -242,7 +249,7 @@
     else if(_append_bias)
     {
         // Configure add bias kernel
-        _add_bias_kernel.configure(output, biases, output, ConvertPolicy::SATURATE);
+        _add_bias_kernel.configure(ArithmeticOperation::ADD, output, biases, output, ConvertPolicy::SATURATE);
     }
 
     // Create GEMM output tensor
@@ -276,9 +283,9 @@
     {
         const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
 
-        const float multiplier  = (input->info()->quantization_info().scale * weights->info()->quantization_info().scale) / output_quant_info.scale;
-        int   output_multiplier = 0;
-        int   output_shift      = 0;
+        const float multiplier        = (input->info()->quantization_info().scale * weights->info()->quantization_info().scale) / output_quant_info.scale;
+        int         output_multiplier = 0;
+        int         output_shift      = 0;
         quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
 
         int min_activation = 0;
@@ -375,6 +382,8 @@
     const bool skip_im2col                = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
     const bool skip_col2im                = data_layout == DataLayout::NHWC;
     bool       is_activationlayer_enabled = act_info.enabled();
+    // In case of F16, fused bias will be used in GEMM
+    const bool run_addition = (skip_im2col) && (append_bias) && (data_type != DataType::F16);
 
     ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * num_groups) != input->dimension(idx_channel));
     ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
@@ -429,10 +438,10 @@
         ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &im2col_reshaped_info, kernel_dims, conv_info, append_bias, dilation, num_groups));
         gemm_input_to_use = &im2col_reshaped_info;
     }
-    else if(append_bias)
+    else if(run_addition)
     {
         // Validate add bias kernel
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAdditionKernel::validate(output, biases, output, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, output, biases, output, ConvertPolicy::SATURATE));
     }
 
     // Create GEMM output tensor
@@ -459,9 +468,9 @@
     {
         const QuantizationInfo output_quant_info = (output->total_size() == 0) ? input->quantization_info() : output->quantization_info();
 
-        const float multiplier  = (input->quantization_info().scale * weights->quantization_info().scale) / output_quant_info.scale;
-        int   output_multiplier = 0;
-        int   output_shift      = 0;
+        const float multiplier        = (input->quantization_info().scale * weights->quantization_info().scale) / output_quant_info.scale;
+        int         output_multiplier = 0;
+        int         output_shift      = 0;
 
         ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift));
 
@@ -496,7 +505,7 @@
     // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
     const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
 
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col, run_addition));
 
     // Validate Col2Im
     if(!skip_col2im)
@@ -537,7 +546,7 @@
         _mm_gemm.run();
     }
 
-    if(_skip_im2col && _append_bias)
+    if(_run_addition)
     {
         CLScheduler::get().enqueue(_add_bias_kernel);
     }
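
For quantized convolutions the output stage needs the combined rescale factor multiplier = (input_scale * weights_scale) / output_scale expressed as a fixed-point multiplier plus a right shift, which is what quantization::calculate_quantized_multiplier_less_than_one derives from the float value computed above. The sketch below shows the usual frexp-based decomposition for a multiplier smaller than one; it is an approximation of what the helper does, and the library's exact rounding may differ.

// Sketch of decomposing a real-valued rescale factor (< 1) into a Q0.31
// fixed-point multiplier and a right shift, in the spirit of
// quantization::calculate_quantized_multiplier_less_than_one. Illustrative only.
#include <cmath>
#include <cstdint>
#include <cstdio>

void quantize_multiplier_lt_one(double multiplier, std::int32_t *quantized_multiplier, int *right_shift)
{
    int          exponent = 0;
    const double mantissa = std::frexp(multiplier, &exponent); // multiplier = mantissa * 2^exponent, mantissa in [0.5, 1)
    *right_shift          = -exponent;
    auto q = static_cast<std::int64_t>(std::llround(mantissa * (1ll << 31)));
    if(q == (1ll << 31)) // mantissa rounded up to 1.0: halve it and shift one bit less
    {
        q /= 2;
        --*right_shift;
    }
    *quantized_multiplier = static_cast<std::int32_t>(q);
}

int main()
{
    // Assumed example scales; multiplier = (0.5 * 0.25) / 0.75 = 0.1666...
    const double input_scale = 0.5, weights_scale = 0.25, output_scale = 0.75;
    const double multiplier  = (input_scale * weights_scale) / output_scale;

    std::int32_t qmul  = 0;
    int          shift = 0;
    quantize_multiplier_lt_one(multiplier, &qmul, &shift);
    std::printf("multiplier=%f -> qmul=%d, right_shift=%d\n", multiplier, static_cast<int>(qmul), shift);
    return 0;
}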
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 2d4d231..2a01db7 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,42 +31,25 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfiguration.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
 using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::cl_gemm;
 
 namespace
 {
-inline bool is_interleaved_transposed(int m, int n, int k, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
+inline bool is_gemm_reshaped(unsigned int m, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
 {
-    bool flag = true;
-
-    if(gpu_target_is_in(gpu_target,
-                        GPUTarget::G71, GPUTarget::G72,
-                        GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT))
-    {
-        // COMPMID-852
-        if(k > 256 && m > 4 && reshape_b_only_on_first_run)
-        {
-            flag = ((0.72f + n * 0.10766f) < (n * 0.1284f));
-        }
-        else
-        {
-            flag = false;
-        }
-    }
-    else
-    {
-        flag = m > 1;
-    }
-
-    return flag;
+    return (get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) && (m > 1) && (reshape_b_only_on_first_run);
 }
 } // namespace
 
 CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)),
       _mm_kernel(),
+      _mm_reshaped_kernel(),
       _mtx_a_reshape_kernel(),
       _mtx_b_reshape_kernel(),
       _mtx_a_reduction_kernel(),
@@ -81,7 +64,7 @@
       _original_b(nullptr),
       _a_offset(0),
       _b_offset(0),
-      _is_interleaved_transposed(true),
+      _is_gemm_reshaped(true),
       _reshape_b_only_on_first_run(false),
       _is_prepared(false),
       _fuse_output_stage(false)
@@ -108,23 +91,23 @@
 
     const ICLTensor *matrix_a = a;
     const ICLTensor *matrix_b = b;
+    GEMMRHSMatrixInfo rhs_info;
+    GEMMLHSMatrixInfo lhs_info;
 
     // Arguments used by GEMMReshapeInfo
     // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
     // in order to know how the matrices have been reshaped
-    bool          reinterpret_input_as_3d   = gemm_info.reinterpret_input_as_3d();
-    const bool    unroll_block              = dot8_supported(CLKernelLibrary::get().get_device());
-    const int     m                         = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
-    const int     n                         = b->info()->dimension(0);
-    const int     k                         = a->info()->dimension(0);
-    const int     depth_output_gemm3d       = gemm_info.depth_output_gemm3d();
-    constexpr int mult_transpose1xW_width   = 1;
-    constexpr int mult_interleave4x4_height = 1;
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m                       = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
+    const unsigned int n                       = b->info()->dimension(0);
+    const unsigned int k                       = a->info()->dimension(0);
+    const unsigned int batch_size              = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
+    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
 
     // Check if we need to reshape the matrix A and matrix B
-    _is_interleaved_transposed = is_interleaved_transposed(m, n, k, _reshape_b_only_on_first_run, gpu_target);
+    _is_gemm_reshaped = is_gemm_reshaped(m, _reshape_b_only_on_first_run, gpu_target);
 
-    if(_is_interleaved_transposed)
+    if(_is_gemm_reshaped)
     {
         // if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
         reinterpret_input_as_3d = false;
@@ -138,11 +121,14 @@
             _memory_group.manage(&_tmp_b);
         }
 
+        // Pick up the GEMM configuration
+        std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, DataType::QASYMM8);
+
         // Configure interleave kernel
-        _mtx_a_reshape_kernel.configure(a, &_tmp_a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d(), unroll_block);
+        _mtx_a_reshape_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
 
         // Configure transpose kernel
-        _mtx_b_reshape_kernel.configure(b, &_tmp_b, mult_transpose1xW_width);
+        _mtx_b_reshape_kernel.configure(b, &_tmp_b, rhs_info);
     }
 
     // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
@@ -177,10 +163,16 @@
 
         _memory_group.manage(&_mm_result_s32);
 
-        // Configure matrix multiply kernel
-        _mm_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k,
-                                                                                                              mult_transpose1xW_width, mult_interleave4x4_height,
-                                                                                                              depth_output_gemm3d, reinterpret_input_as_3d));
+        if(_is_gemm_reshaped)
+        {
+            // Configure and tune matrix multiply kernel
+            _mm_reshaped_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+        }
+        else
+        {
+            // Configure matrix multiply kernel
+            _mm_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, false, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+        }
 
         // Configure offset contribution kernel
         _offset_contribution_output_stage_kernel.configure(&_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, a->info()->dimension(0),
@@ -190,17 +182,23 @@
     }
     else
     {
-        // Configure matrix multiply kernel
-        _mm_kernel.configure(matrix_a, matrix_b, output, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k,
-                                                                                                     mult_transpose1xW_width, mult_interleave4x4_height,
-                                                                                                     depth_output_gemm3d, reinterpret_input_as_3d));
+        if(_is_gemm_reshaped)
+        {
+            // Configure and tune matrix multiply kernel
+            _mm_reshaped_kernel.configure(matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+        }
+        else
+        {
+            // Configure matrix multiply kernel
+            _mm_kernel.configure(matrix_a, matrix_b, output, false, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+        }
 
         // Configure offset contribution kernel
         _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, a->info()->dimension(0), _a_offset, _b_offset);
     }
 
     // Allocate tensors
-    if(_is_interleaved_transposed)
+    if(_is_gemm_reshaped)
     {
         _tmp_a.allocator()->allocate();
         if(!_reshape_b_only_on_first_run)
@@ -233,18 +231,19 @@
     const ITensorInfo *matrix_a_info = a;
     const ITensorInfo *matrix_b_info = b;
 
-    TensorInfo tmp_a_info{};
-    TensorInfo tmp_b_info{};
+    TensorInfo        tmp_a_info{};
+    TensorInfo        tmp_b_info{};
+    GEMMRHSMatrixInfo rhs_info;
+    GEMMLHSMatrixInfo lhs_info;
 
-    bool          reinterpret_input_as_3d   = gemm_info.reinterpret_input_as_3d();
-    const int     m                         = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
-    const int     n                         = b->dimension(0);
-    const int     k                         = a->dimension(0);
-    constexpr int mult_transpose1xW_width   = 1;
-    constexpr int mult_interleave4x4_height = 1;
-    const int     depth_output_gemm3d       = gemm_info.depth_output_gemm3d();
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n                       = b->dimension(0);
+    const unsigned int k                       = a->dimension(0);
+    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
 
-    bool reshape_matrices = is_interleaved_transposed(m, n, k, gemm_info.reshape_b_only_on_first_run(), CLScheduler::get().target());
+    bool reshape_matrices = is_gemm_reshaped(m, gemm_info.reshape_b_only_on_first_run(), CLScheduler::get().target());
 
     // if reshape_matrices is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
     if(reshape_matrices)
@@ -252,20 +251,24 @@
         reinterpret_input_as_3d = false;
     }
 
-    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, reinterpret_input_as_3d);
+    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
 
     if(reshape_matrices)
     {
         matrix_a_info = &tmp_a_info;
         matrix_b_info = &tmp_b_info;
 
+        // Pick up the GEMM configuration
+        std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, DataType::QASYMM8);
+
         // Validate interleave kernel
-        auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d())));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &tmp_a_info, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d()));
+        auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
 
         // Validate transpose kernel
-        auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width)));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, &tmp_b_info, mult_transpose1xW_width));
+
+        auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
     }
 
     TensorInfo info_vector_sum_col, info_vector_sum_row;
@@ -292,12 +295,22 @@
     {
         TensorInfo mm_result_s32_info{};
 
-        // Output tensor auto inizialitation if not yet initialized
-        auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_matrices, reshape_info)).set_data_type(DataType::S32));
+        if(reshape_matrices)
+        {
+            // Output tensor auto initialization if not yet initialized
+            auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32));
 
-        // Validate matrix multiply
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, reshape_matrices, reshape_info));
+            // Validate matrix multiply
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
+        }
+        else
+        {
+            // Output tensor auto initialization if not yet initialized
+            auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32));
 
+            // Validate matrix multiply
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, false, reshape_info));
+        }
         // Validate offset contribution kernel
         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
                                                                                             a_offset == 0 ? nullptr : &info_vector_sum_col,
@@ -309,9 +322,16 @@
     }
     else
     {
-        // Validate matrix multiply
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, reshape_matrices, reshape_info));
-
+        if(reshape_matrices)
+        {
+            // Validate matrix multiply
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
+        }
+        else
+        {
+            // Validate matrix multiply
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, false, reshape_info));
+        }
         // Validate offset contribution kernel
         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionKernel::validate(output,
                                                                                  a_offset == 0 ? nullptr : &info_vector_sum_col,
@@ -329,7 +349,7 @@
 
     _memory_group.acquire();
 
-    if(_is_interleaved_transposed)
+    if(_is_gemm_reshaped)
     {
         // Run reshape matrix A
         CLScheduler::get().enqueue(_mtx_a_reshape_kernel, false);
@@ -348,7 +368,14 @@
     }
 
     // Run matrix multiply
-    CLScheduler::get().enqueue(_mm_kernel, false);
+    if(_is_gemm_reshaped)
+    {
+        CLScheduler::get().enqueue(_mm_reshaped_kernel, false);
+    }
+    else
+    {
+        CLScheduler::get().enqueue(_mm_kernel, false);
+    }
 
     // Run matrix A reduction kernel only if _b_offset is not equal to 0
     if(_b_offset != 0)
@@ -374,7 +401,7 @@
 {
     if(!_is_prepared)
     {
-        if(_is_interleaved_transposed && _reshape_b_only_on_first_run)
+        if(_is_gemm_reshaped && _reshape_b_only_on_first_run)
         {
             ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
 
@@ -395,3 +422,4 @@
         _is_prepared = true;
     }
 }
+} // namespace arm_compute
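
The per-target cost model for reshaping in GEMMLowp is replaced by a much simpler rule: reshape whenever the target is not Midgard, the LHS has more than one row, and matrix B is only reshaped on the first run (so the reshape cost is amortised across invocations). Restated as a standalone predicate for illustration:

// Standalone restatement of the simplified is_gemm_reshaped() rule above; illustrative only.
enum class Arch
{
    Midgard,
    Bifrost
};

bool is_gemm_reshaped(unsigned int m, bool reshape_b_only_on_first_run, Arch arch)
{
    // Non-Midgard target, more than one LHS row, and a cacheable RHS reshape.
    return arch != Arch::Midgard && m > 1 && reshape_b_only_on_first_run;
}

int main()
{
    return is_gemm_reshaped(64, true, Arch::Bifrost) ? 0 : 1;
}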
diff --git a/src/runtime/CL/functions/CLGather.cpp b/src/runtime/CL/functions/CLGather.cpp
new file mode 100644
index 0000000..459438e
--- /dev/null
+++ b/src/runtime/CL/functions/CLGather.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLGather.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLGatherKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CLGather::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLGatherKernel>();
+    k->configure(input, indices, output, axis);
+    _kernel = std::move(k);
+}
+
+Status CLGather::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
+{
+    return CLGatherKernel::validate(input, indices, output, axis);
+}
+} // namespace arm_compute
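
CLGather is a straightforward ICLSimpleFunction wrapper around CLGatherKernel. A minimal usage sketch follows; the shapes, the U32 index type and the axis value are assumptions made for illustration.

// Hedged usage sketch for CLGather; shapes, index type and axis are assumptions.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGather.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor input, indices, output;
    input.allocator()->init(TensorInfo(TensorShape(10U, 4U), 1, DataType::F32));
    indices.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::U32));
    output.allocator()->init(TensorInfo(TensorShape(3U, 4U), 1, DataType::F32));

    CLGather gather;
    gather.configure(&input, &indices, &output, /*axis=*/0); // pick 3 rows along dimension 0

    input.allocator()->allocate();
    indices.allocator()->allocate();
    output.allocator()->allocate();

    // ... fill input and indices ...
    gather.run();
    CLScheduler::get().sync();
    return 0;
}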
diff --git a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
index 5dd1202..c50132e 100644
--- a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
+++ b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -40,6 +40,7 @@
       _memset_kernel(),
       _padded_copy_kernel(),
       _cpp_nms_kernel(),
+      _is_nhwc(false),
       _deltas_permuted(),
       _deltas_flattened(),
       _scores_permuted(),
@@ -60,10 +61,11 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
     ARM_COMPUTE_ERROR_THROW_ON(CLGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info));
 
+    _is_nhwc                         = scores->info()->data_layout() == DataLayout::NHWC;
     const DataType data_type         = deltas->info()->data_type();
-    const int      num_anchors       = scores->info()->dimension(2);
-    const int      feat_width        = scores->info()->dimension(0);
-    const int      feat_height       = scores->info()->dimension(1);
+    const int      num_anchors       = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL));
+    const int      feat_width        = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH));
+    const int      feat_height       = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT));
     const int      total_num_anchors = num_anchors * feat_width * feat_height;
     const int      pre_nms_topN      = info.pre_nms_topN();
     const int      post_nms_topN     = info.post_nms_topN();
@@ -77,21 +79,37 @@
     _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, data_type));
 
     // Permute and reshape deltas
-    _memory_group.manage(&_deltas_permuted);
-    _memory_group.manage(&_deltas_flattened);
-    _permute_deltas_kernel.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
-    _flatten_deltas_kernel.configure(&_deltas_permuted, &_deltas_flattened);
-    _deltas_permuted.allocator()->allocate();
+    if(!_is_nhwc)
+    {
+        _memory_group.manage(&_deltas_permuted);
+        _memory_group.manage(&_deltas_flattened);
+        _permute_deltas_kernel.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
+        _flatten_deltas_kernel.configure(&_deltas_permuted, &_deltas_flattened);
+        _deltas_permuted.allocator()->allocate();
+    }
+    else
+    {
+        _memory_group.manage(&_deltas_flattened);
+        _flatten_deltas_kernel.configure(deltas, &_deltas_flattened);
+    }
 
     const TensorShape flatten_shape_scores(1, total_num_anchors);
     _scores_flattened.allocator()->init(TensorInfo(flatten_shape_scores, 1, data_type));
 
     // Permute and reshape scores
-    _memory_group.manage(&_scores_permuted);
-    _memory_group.manage(&_scores_flattened);
-    _permute_scores_kernel.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
-    _flatten_scores_kernel.configure(&_scores_permuted, &_scores_flattened);
-    _scores_permuted.allocator()->allocate();
+    if(!_is_nhwc)
+    {
+        _memory_group.manage(&_scores_permuted);
+        _memory_group.manage(&_scores_flattened);
+        _permute_scores_kernel.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
+        _flatten_scores_kernel.configure(&_scores_permuted, &_scores_flattened);
+        _scores_permuted.allocator()->allocate();
+    }
+    else
+    {
+        _memory_group.manage(&_scores_flattened);
+        _flatten_scores_kernel.configure(scores, &_scores_flattened);
+    }
 
     // Bounding box transform
     _memory_group.manage(&_all_proposals);
@@ -141,11 +159,12 @@
                                           const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(scores, DataLayout::NCHW);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(scores, DataLayout::NCHW, DataLayout::NHWC);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(scores, deltas);
 
-    const int num_anchors       = scores->dimension(2);
-    const int feat_width        = scores->dimension(0);
-    const int feat_height       = scores->dimension(1);
+    const int num_anchors       = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL));
+    const int feat_width        = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH));
+    const int feat_height       = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT));
     const int num_images        = scores->dimension(3);
     const int total_num_anchors = num_anchors * feat_width * feat_height;
     const int values_per_roi    = info.values_per_roi();
@@ -156,14 +175,21 @@
     ARM_COMPUTE_RETURN_ON_ERROR(CLComputeAllAnchorsKernel::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())));
 
     TensorInfo deltas_permuted_info = deltas->clone()->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)).set_is_resizable(true);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 }));
+    TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true);
+    if(scores->data_layout() == DataLayout::NHWC)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(deltas, &deltas_permuted_info);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(scores, &scores_permuted_info);
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 }));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 }));
+    }
 
     TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
     ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(&deltas_permuted_info, &deltas_flattened_info));
 
-    TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 }));
-
     TensorInfo scores_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true));
     TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
 
@@ -236,9 +262,12 @@
     CLScheduler::get().enqueue(_compute_anchors_kernel, false);
 
     // Transpose and reshape the inputs
-    CLScheduler::get().enqueue(_permute_deltas_kernel, false);
+    if(!_is_nhwc)
+    {
+        CLScheduler::get().enqueue(_permute_deltas_kernel, false);
+        CLScheduler::get().enqueue(_permute_scores_kernel, false);
+    }
     CLScheduler::get().enqueue(_flatten_deltas_kernel, false);
-    CLScheduler::get().enqueue(_permute_scores_kernel, false);
     CLScheduler::get().enqueue(_flatten_scores_kernel, false);
 
     // Build the boxes
diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
index 4f709d5..2e3c6d7 100644
--- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
+++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
@@ -32,8 +32,8 @@
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "support/ToolchainSupport.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 CLL2NormalizeLayer::CLL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq()
 {
@@ -81,3 +81,4 @@
 
     _memory_group.release();
 }
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp
index a89c4e3..f01b1b8 100644
--- a/src/runtime/CL/functions/CLLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -110,9 +110,9 @@
     _gemm_forget_gate.configure(output_state_in, &_forget_gate_out2, nullptr, &_forget_gate_out3, 1.f, 0.f);
     _forget_gate_out2.allocator()->allocate();
     _memory_group.manage(&_forget_gate_out5);
-    _accum_forget_gate1.configure(&_forget_gate_out1, &_forget_gate_out3, &_forget_gate_out5, ConvertPolicy::SATURATE);
+    _accum_forget_gate1.configure(ArithmeticOperation::ADD, &_forget_gate_out1, &_forget_gate_out3, &_forget_gate_out5, ConvertPolicy::SATURATE);
+    _forget_gate_out1.allocator()->allocate();
     CLTensor *forget_gate_out = &_forget_gate_out5;
-
     if(lstm_params.has_peephole_opt())
     {
         _forget_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
@@ -129,17 +129,18 @@
     {
         _forget_gate_out3.allocator()->allocate();
     }
-    _activation_forget_gate.configure(forget_gate_out, &_forget_gate_out1, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    _activation_forget_gate.configure(forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
 
     // Configure block that calculates the input gate
     // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
     // input_gate = 1 - forget_gate, with CIFG
     _input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    CLTensor *input_gate_out = &_input_gate_out1;
     if(lstm_params.has_cifg_opt())
     {
         _memory_group.manage(&_input_gate_out1);
         _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
-        _subtract_input_gate.configure(&_ones, &_forget_gate_out1, &_input_gate_out1, ConvertPolicy::SATURATE);
+        _subtract_input_gate.configure(ArithmeticOperation::SUB, &_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE);
         _ones.allocator()->allocate();
         _run_cifg_opt = true;
     }
@@ -160,17 +161,23 @@
         _gemm_input_gate.configure(output_state_in, &_input_gate_out2, nullptr, &_input_gate_out3, 1.f, 0.f);
         _input_gate_out2.allocator()->allocate();
         _memory_group.manage(&_input_gate_out4);
-        _accum_input_gate1.configure(&_input_gate_out1, &_input_gate_out3, &_input_gate_out4, ConvertPolicy::SATURATE);
+        _accum_input_gate1.configure(ArithmeticOperation::ADD, &_input_gate_out1, &_input_gate_out3, &_input_gate_out4, ConvertPolicy::SATURATE);
+        _input_gate_out3.allocator()->allocate();
+        input_gate_out = &_input_gate_out4;
         if(_run_peephole_opt)
         {
             _memory_group.manage(&_input_gate_out5);
             _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
             _accum_input_gate2.configure(&_input_gate_out4, &_input_gate_out5, &_input_gate_out1, ConvertPolicy::SATURATE);
+            _input_gate_out4.allocator()->allocate();
             _input_gate_out5.allocator()->allocate();
+            input_gate_out = &_input_gate_out1;
         }
-        _input_gate_out3.allocator()->allocate();
-        _input_gate_out4.allocator()->allocate();
-        _activation_input_gate.configure(&_input_gate_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+        else
+        {
+            _input_gate_out1.allocator()->allocate();
+        }
+        _activation_input_gate.configure(input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
     }
 
     // Configure block that calculates the cell state
@@ -190,14 +197,13 @@
     _gemm_cell_state1.configure(output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f);
     _cell_state_out2.allocator()->allocate();
     _memory_group.manage(&_cell_state_out4);
-    _accum_cell_state1.configure(&_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
+    _accum_cell_state1.configure(ArithmeticOperation::ADD, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
     _activation_cell_state.configure(&_cell_state_out4, nullptr, activation_info);
     _memory_group.manage(&_cell_state_out5);
-    _pixelwise_mul_cell_state1.configure(&_cell_state_out4, &_input_gate_out1, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+    _pixelwise_mul_cell_state1.configure(&_cell_state_out4, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
     _cell_state_out4.allocator()->allocate();
-    _pixelwise_mul_cell_state2.configure(&_forget_gate_out1, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
-    _forget_gate_out1.allocator()->allocate();
-    _accum_cell_state2.configure(&_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
+    _pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+    _accum_cell_state2.configure(ArithmeticOperation::ADD, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
     _cell_state_out3.allocator()->allocate();
     _cell_state_out5.allocator()->allocate();
     // Perform clipping
@@ -223,7 +229,7 @@
     _gemm_output.configure(output_state_in, &_output2, nullptr, &_output3, 1.f, 0.f);
     _output2.allocator()->allocate();
     _memory_group.manage(&_output5);
-    _accum_output1.configure(&_output1, &_output3, &_output5, ConvertPolicy::SATURATE);
+    _accum_output1.configure(ArithmeticOperation::ADD, &_output1, &_output3, &_output5, ConvertPolicy::SATURATE);
     _output3.allocator()->allocate();
     CLTensor *output_gate_out = &_output5;
     if(lstm_params.has_peephole_opt())
@@ -284,13 +290,13 @@
     std::vector<ICLTensor *> scratch_inputs;
     if(!lstm_params.has_cifg_opt())
     {
-        scratch_inputs.emplace_back(&_input_gate_out1);
+        scratch_inputs.emplace_back(input_gate_out);
     }
     scratch_inputs.emplace_back(&_cell_state_out1);
     scratch_inputs.emplace_back(forget_gate_out);
     scratch_inputs.emplace_back(output_gate_out);
     _concat_scratch_buffer.configure(scratch_inputs, scratch_buffer);
-    _input_gate_out1.allocator()->allocate();
+    input_gate_out->allocator()->allocate();
     _cell_state_out1.allocator()->allocate();
     forget_gate_out->allocator()->allocate();
     output_gate_out->allocator()->allocate();
@@ -364,7 +370,7 @@
     // Validate forget gate
     ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_forget_weights, forget_gate_bias, &forget_gate));
     ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &forget_gate, 1.f, 0.f, GEMMInfo()));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAdditionKernel::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
     if(lstm_params.has_peephole_opt())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
@@ -396,7 +402,7 @@
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtractionKernel::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::SUB, &forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
     }
 
     // Validate cell state
@@ -544,4 +550,4 @@
     _concat_scratch_buffer.run();
 
     _memory_group.release();
-}
\ No newline at end of file
+}
diff --git a/src/runtime/CL/functions/CLLaplacianPyramid.cpp b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
index 7e5278f..559b57f 100644
--- a/src/runtime/CL/functions/CLLaplacianPyramid.cpp
+++ b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,8 +28,8 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h"
 #include "arm_compute/runtime/CL/functions/CLDepthConvertLayer.h"
+#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
 #include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
 #include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h"
 #include "support/ToolchainSupport.h"
diff --git a/src/runtime/CL/functions/CLNormalizationLayer.cpp b/src/runtime/CL/functions/CLNormalizationLayer.cpp
index 32d8f15..8489fab 100644
--- a/src/runtime/CL/functions/CLNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,7 +45,7 @@
     _norm_kernel.configure(input, output, norm_info);
 
     // Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel
-    _border_handler.configure(input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
+    _border_handler.configure(input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue());
 }
 
 Status CLNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info)
diff --git a/src/runtime/CL/functions/CLPadLayer.cpp b/src/runtime/CL/functions/CLPadLayer.cpp
index de43c7d..3aa1b1e 100644
--- a/src/runtime/CL/functions/CLPadLayer.cpp
+++ b/src/runtime/CL/functions/CLPadLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,21 +34,21 @@
 {
 }
 
-void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding)
+void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value)
 {
     // Copy the input to the output
     _copy_kernel.configure(input, output, padding);
 
     // Fill the output with the constant value
-    _memset_kernel.configure(output, PixelValue());
+    _memset_kernel.configure(output, constant_value);
 
     // Fill padding on the first two dimensions with the constant value
-    _fillborder_kernel.configure(input, input->info()->padding(), BorderMode::CONSTANT);
+    _fillborder_kernel.configure(input, input->info()->padding(), BorderMode::CONSTANT, constant_value);
 }
 
-Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding)
+Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(input, PixelValue()));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(input, constant_value));
     ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(input, output, padding));
 
     return Status{};
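A hedged sketch of the new constant_value parameter (hypothetical tensors src and dst, prepared elsewhere), reusing the PixelValue(value, data_type) form that appears later in this patch:

    CLPadLayer pad;
    // Pad one element on each side of the first two dimensions and fill the
    // padded area with 1 in the input's data type instead of the default 0.
    pad.configure(&src, &dst, PaddingList{ { 1, 1 }, { 1, 1 } }, PixelValue(1, src.info()->data_type()));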
diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp
index 1809e6e..63f00ac 100644
--- a/src/runtime/CL/functions/CLRNNLayer.cpp
+++ b/src/runtime/CL/functions/CLRNNLayer.cpp
@@ -60,7 +60,7 @@
 
     ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info));
     ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAdditionKernel::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
     ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&shape_info, &shape_info, info));
 
     return Status{};
@@ -90,7 +90,7 @@
     _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
     _memory_group.manage(&_add_output);
 
-    _add_kernel.configure(&_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE);
+    _add_kernel.configure(ArithmeticOperation::ADD, &_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE);
 
     _fully_connected_out.allocator()->allocate();
     _gemm_output.allocator()->allocate();
@@ -127,4 +127,4 @@
 
         _is_prepared = true;
     }
-}
\ No newline at end of file
+}
diff --git a/src/runtime/CL/functions/CLROIPoolingLayer.cpp b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
index 0f480ee..7bb4178 100644
--- a/src/runtime/CL/functions/CLROIPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,7 +30,7 @@
 
 using namespace arm_compute;
 
-void CLROIPoolingLayer::configure(const ICLTensor *input, const ICLROIArray *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIPoolingLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
 {
     // Configure ROI pooling kernel
     auto k = arm_compute::support::cpp14::make_unique<CLROIPoolingLayerKernel>();
diff --git a/src/runtime/CL/functions/CLRange.cpp b/src/runtime/CL/functions/CLRange.cpp
new file mode 100644
index 0000000..b2cd472
--- /dev/null
+++ b/src/runtime/CL/functions/CLRange.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLRange.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLRangeKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CLRange::configure(ICLTensor *output, const float start, const float end, const float step)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLRangeKernel>();
+    k->set_target(CLScheduler::get().target());
+    k->configure(output, start, end, step);
+    _kernel = std::move(k);
+
+    // Tune kernels
+    CLScheduler::get().tune_kernel_static(*_kernel);
+}
+
+Status CLRange::validate(const ITensorInfo *output, const float start, const float end, const float step)
+{
+    return CLRangeKernel::validate(output, start, end, step);
+}
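A minimal sketch of the new CLRange function (hypothetical tensor dst, assumed to have been initialised as a 1D tensor with at least 10 elements):

    CLRange range;
    // Fill dst with the sequence 0, 2, 4, ..., 18.
    range.configure(&dst, 0.f, 20.f, 2.f);
    dst.allocator()->allocate();
    range.run();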
diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp
index 1016ff7..b2d0f81 100644
--- a/src/runtime/CL/functions/CLReduceMean.cpp
+++ b/src/runtime/CL/functions/CLReduceMean.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,22 +45,31 @@
     _reduced_outs      = arm_compute::support::cpp14::make_unique<CLTensor[]>(_reduction_ops - (keep_dims ? 1 : 0));
     _keep_dims         = keep_dims;
 
+    Coordinates axis_local = reduction_axis;
+    const int   input_dims = input->info()->num_dimensions();
+
+    // Convert negative axis
+    for(unsigned int i = 0; i < _reduction_ops; ++i)
+    {
+        axis_local[i] = wrap_around(axis_local[i], input_dims);
+    }
+
     // Perform reduction for every axis
     for(unsigned int i = 0; i < _reduction_ops; ++i)
     {
         TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (_reduced_outs.get() + i - 1)->info()->tensor_shape();
-        out_shape.set(reduction_axis[i], 1);
+        out_shape.set(axis_local[i], 1);
         auto in = (i == 0) ? input : (_reduced_outs.get() + i - 1);
 
         if(i == _reduction_ops - 1 && keep_dims)
         {
-            _reduction_kernels[i].configure(in, output, reduction_axis[i], ReductionOperation::MEAN_SUM);
+            _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::MEAN_SUM);
         }
         else
         {
             _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type(), input->info()->quantization_info()));
             _memory_group.manage(_reduced_outs.get() + i);
-            _reduction_kernels[i].configure(in, _reduced_outs.get() + i, reduction_axis[i], ReductionOperation::MEAN_SUM);
+            _reduction_kernels[i].configure(in, _reduced_outs.get() + i, axis_local[i], ReductionOperation::MEAN_SUM);
         }
     }
 
@@ -77,11 +86,10 @@
 
         // We have to sort the reduction axis vectors in order for remove_dimension
         // to work properly
-        Coordinates axis_copy = reduction_axis;
-        std::sort(axis_copy.begin(), axis_copy.begin() + _reduction_ops);
+        std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
         for(unsigned int i = 0; i < _reduction_ops; ++i)
         {
-            out_shape.remove_dimension(axis_copy[i] - i);
+            out_shape.remove_dimension(axis_local[i] - i);
         }
         auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
         _reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output);
@@ -90,22 +98,43 @@
 
 Status CLReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
 {
-    ARM_COMPUTE_UNUSED(keep_dims);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
     ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
 
-    for(unsigned int i = 0; i < reduction_axis.num_dimensions(); ++i)
+    TensorShape out_shape = input->tensor_shape();
+
+    Coordinates        axis_sorted   = reduction_axis;
+    const unsigned int reduction_ops = reduction_axis.num_dimensions();
+    const int          input_dims    = input->num_dimensions();
+
+    // Convert negative axis
+    for(unsigned int i = 0; i < reduction_ops; ++i)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis[i] > 3);
-        ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(reduction_axis[i]) > input->num_dimensions() - 1);
+        axis_sorted[i] = wrap_around(axis_sorted[i], input_dims);
+    }
+
+    std::sort(axis_sorted.begin(), axis_sorted.begin() + reduction_ops);
+    for(unsigned int i = 0; i < reduction_ops; ++i)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(axis_sorted[i] > 3);
+        ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_sorted[i]) > input->num_dimensions() - 1);
         if(output->total_size() > 0 && keep_dims)
         {
-            ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(reduction_axis[i]) != 1);
+            ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_sorted[i]) != 1);
         }
-
-        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperation::validate(input, output, reduction_axis[i], ReductionOperation::MEAN_SUM));
+        if(keep_dims)
+        {
+            out_shape.set(axis_sorted[i], 1);
+        }
+        else
+        {
+            out_shape.remove_dimension(axis_sorted[i] - i);
+        }
     }
 
+    const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
+
     return Status{};
 }
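A hedged illustration of the negative-axis handling added above (hypothetical tensors src and dst; for a rank-4 input, wrap_around maps -1 to 3, -2 to 2, and so on):

    CLReduceMean reduce_mean;
    // Mean over the last dimension: Coordinates(-1) now behaves like
    // Coordinates(3) for a 4D input, and keep_dims keeps that dimension as 1.
    reduce_mean.configure(&src, Coordinates(-1), true /* keep_dims */, &dst);
    reduce_mean.run();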
 
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index c5447ff..3d82e3f 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -56,15 +56,19 @@
 } // namespace
 
 CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _sums_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages(), _reduction_axis(), _is_quantized()
+    : _memory_group(std::move(memory_manager)), _results_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages(), _reduction_axis(), _is_serial()
 {
 }
 
 Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
 {
     const unsigned int num_of_stages = calculate_number_of_stages(input, axis);
-
-    if(axis == 0 && !is_data_type_quantized(input->data_type()))
+    bool               is_serial     = is_data_type_quantized(input->data_type()) || axis != 0;
+    if(is_serial)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output, axis, op));
+    }
+    else
     {
         // Create temporary tensor infos
         auto sums_vector = arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_stages - 1);
@@ -81,17 +85,25 @@
         }
 
         ReductionOperation first_kernel_op;
+        ReductionOperation intermediate_kernel_op;
         ReductionOperation last_kernel_op;
         switch(op)
         {
             case ReductionOperation::SUM:
             case ReductionOperation::MEAN_SUM:
-                first_kernel_op = ReductionOperation::SUM;
-                last_kernel_op  = op;
+                first_kernel_op        = ReductionOperation::SUM;
+                intermediate_kernel_op = ReductionOperation::SUM;
+                last_kernel_op         = op;
                 break;
             case ReductionOperation::SUM_SQUARE:
-                first_kernel_op = ReductionOperation::SUM_SQUARE;
-                last_kernel_op  = ReductionOperation::SUM;
+                first_kernel_op        = ReductionOperation::SUM_SQUARE;
+                intermediate_kernel_op = ReductionOperation::SUM;
+                last_kernel_op         = ReductionOperation::SUM;
+                break;
+            case ReductionOperation::PROD:
+                first_kernel_op        = ReductionOperation::PROD;
+                intermediate_kernel_op = ReductionOperation::PROD;
+                last_kernel_op         = ReductionOperation::PROD;
                 break;
             default:
                 ARM_COMPUTE_ERROR("Not supported");
@@ -103,17 +115,13 @@
         // Validate ReductionOperation on intermediate stages
         for(unsigned int i = 1; i < num_of_stages - 1; ++i)
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + i - 1, sums_vector.get() + i, axis, ReductionOperation::SUM));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + i - 1, sums_vector.get() + i, axis, intermediate_kernel_op));
         }
 
         // Validate ReductionOperation on the last stage
         const unsigned int last_stage = num_of_stages - 1;
         ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + last_stage - 1, output, axis, last_kernel_op, input->dimension(0)));
     }
-    else
-    {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output, axis, op));
-    }
 
     return Status{};
 }
@@ -122,65 +130,77 @@
 {
     _num_of_stages  = calculate_number_of_stages(input->info(), axis);
     _reduction_axis = axis;
-    _is_quantized   = is_data_type_quantized(input->info()->data_type());
+    _is_serial      = is_data_type_quantized(input->info()->data_type()) || axis != 0;
 
     // Configure reduction operation kernels
     _reduction_kernels_vector = arm_compute::support::cpp14::make_unique<CLReductionOperationKernel[]>(_num_of_stages);
 
     // Create temporary tensors
-    if(axis == 0 && !_is_quantized)
+    if(_is_serial)
+    {
+        _reduction_kernels_vector[0].configure(input, output, axis, op, 0);
+    }
+    else
     {
         _border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_of_stages);
-        _sums_vector            = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_of_stages - 1);
+        _results_vector         = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_of_stages - 1);
         TensorShape shape{ input->info()->tensor_shape() };
         for(unsigned int i = 0; i < _num_of_stages - 1; i++)
         {
             shape.set(0, ceil(shape.x() / 128.f));
-            _sums_vector[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape));
+            _results_vector[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape));
         }
 
         // Apply ReductionOperation only on first kernel
-        _memory_group.manage(_sums_vector.get());
+        _memory_group.manage(_results_vector.get());
 
         ReductionOperation first_kernel_op;
+        ReductionOperation intermediate_kernel_op;
         ReductionOperation last_kernel_op;
+        PixelValue         pixelValue;
         switch(op)
         {
             case ReductionOperation::SUM:
             case ReductionOperation::MEAN_SUM:
-                first_kernel_op = ReductionOperation::SUM;
-                last_kernel_op  = op;
+                first_kernel_op        = ReductionOperation::SUM;
+                intermediate_kernel_op = ReductionOperation::SUM;
+                last_kernel_op         = op;
+                pixelValue             = PixelValue();
                 break;
             case ReductionOperation::SUM_SQUARE:
-                first_kernel_op = ReductionOperation::SUM_SQUARE;
-                last_kernel_op  = ReductionOperation::SUM;
+                first_kernel_op        = ReductionOperation::SUM_SQUARE;
+                intermediate_kernel_op = ReductionOperation::SUM;
+                last_kernel_op         = ReductionOperation::SUM;
+                pixelValue             = PixelValue();
+                break;
+            case ReductionOperation::PROD:
+                first_kernel_op        = ReductionOperation::PROD;
+                intermediate_kernel_op = ReductionOperation::PROD;
+                last_kernel_op         = ReductionOperation::PROD;
+                pixelValue             = PixelValue(1, input->info()->data_type());
                 break;
             default:
                 ARM_COMPUTE_ERROR("Not supported");
         }
 
-        _reduction_kernels_vector[0].configure(input, _sums_vector.get(), axis, first_kernel_op);
-        _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, PixelValue(0));
+        _reduction_kernels_vector[0].configure(input, _results_vector.get(), axis, first_kernel_op);
+        _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, pixelValue);
 
         // Apply ReductionOperation on intermediate stages
         for(unsigned int i = 1; i < _num_of_stages - 1; ++i)
         {
-            _memory_group.manage(_sums_vector.get() + i);
-            _reduction_kernels_vector[i].configure(_sums_vector.get() + i - 1, _sums_vector.get() + i, axis, ReductionOperation::SUM);
-            _border_handlers_vector[i].configure(_sums_vector.get() + i - 1, _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
-            _sums_vector[i - 1].allocator()->allocate();
+            _memory_group.manage(_results_vector.get() + i);
+            _reduction_kernels_vector[i].configure(_results_vector.get() + i - 1, _results_vector.get() + i, axis, intermediate_kernel_op);
+            _border_handlers_vector[i].configure(_results_vector.get() + i - 1, _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, pixelValue);
+            _results_vector[i - 1].allocator()->allocate();
         }
 
         // Apply ReductionOperation on the last stage
         const unsigned int last_stage  = _num_of_stages - 1;
         const unsigned int input_width = input->info()->dimension(0);
-        _reduction_kernels_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, output, axis, last_kernel_op, input_width);
-        _border_handlers_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, PixelValue(0));
-        _sums_vector[last_stage - 1].allocator()->allocate();
-    }
-    else
-    {
-        _reduction_kernels_vector[0].configure(input, output, axis, op, 0);
+        _reduction_kernels_vector[last_stage].configure(_results_vector.get() + last_stage - 1, output, axis, last_kernel_op, input_width);
+        _border_handlers_vector[last_stage].configure(_results_vector.get() + last_stage - 1, _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, pixelValue);
+        _results_vector[last_stage - 1].allocator()->allocate();
     }
 }
 
@@ -188,7 +208,11 @@
 {
     _memory_group.acquire();
 
-    if(_reduction_axis == 0 && !_is_quantized)
+    if(_is_serial)
+    {
+        CLScheduler::get().enqueue(_reduction_kernels_vector[0], false);
+    }
+    else
     {
         for(unsigned int i = 0; i < _num_of_stages; ++i)
         {
@@ -196,10 +220,6 @@
             CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
         }
     }
-    else
-    {
-        CLScheduler::get().enqueue(_reduction_kernels_vector[0], false);
-    }
 
     _memory_group.release();
 }
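A hedged usage sketch of the PROD reduction enabled above (hypothetical F32 tensors; with a non-quantized type and axis 0 the multi-stage path is taken, and borders are filled with 1 so that padded lanes do not affect the product):

    CLReductionOperation reduce;
    // Product along axis 0, assuming configure mirrors the validate signature.
    reduce.configure(&src, &dst, 0, ReductionOperation::PROD);
    reduce.run();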
diff --git a/src/runtime/CL/functions/CLReverse.cpp b/src/runtime/CL/functions/CLReverse.cpp
new file mode 100644
index 0000000..0f86b9f
--- /dev/null
+++ b/src/runtime/CL/functions/CLReverse.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLReverse.h"
+
+#include "arm_compute/core/CL/kernels/CLReverseKernel.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CLReverse::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLReverseKernel>();
+    k->configure(input, output, axis);
+    _kernel = std::move(k);
+}
+
+Status CLReverse::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis)
+{
+    return CLReverseKernel::validate(input, output, axis);
+}
+} // namespace arm_compute
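A minimal sketch of the new CLReverse function (hypothetical tensors; axis is assumed to be a small 1D tensor listing the dimensions to flip):

    CLReverse reverse;
    // Flip src along the dimensions listed in the axis tensor, e.g. { 0, 2 }.
    reverse.configure(&src, &dst, &axis);
    reverse.run();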
diff --git a/src/runtime/CL/functions/CLSelect.cpp b/src/runtime/CL/functions/CLSelect.cpp
new file mode 100644
index 0000000..90c368e
--- /dev/null
+++ b/src/runtime/CL/functions/CLSelect.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLSelect.h"
+
+#include "arm_compute/core/CL/kernels/CLSelectKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+void CLSelect::configure(const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLSelectKernel>();
+    k->configure(c, x, y, output);
+    _kernel = std::move(k);
+}
+
+Status CLSelect::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
+{
+    return CLSelectKernel::validate(c, x, y, output);
+}
+} // namespace arm_compute
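A minimal sketch of the new CLSelect function (hypothetical tensors; c is assumed to be a U8 condition tensor matching the shape of x and y):

    CLSelect select;
    // Element-wise select: dst[i] = c[i] ? x[i] : y[i].
    select.configure(&c, &x, &y, &dst);
    select.run();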
diff --git a/src/runtime/CL/functions/CLSlice.cpp b/src/runtime/CL/functions/CLSlice.cpp
index bef7eca..f630853 100644
--- a/src/runtime/CL/functions/CLSlice.cpp
+++ b/src/runtime/CL/functions/CLSlice.cpp
@@ -36,10 +36,10 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
 
     // Get absolute end coordinates
-    const Coordinates ends_abs = arm_compute::helpers::tensor_transform::slice_absolute_end_coords(input->info()->tensor_shape(), ends);
+    const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends);
 
     auto k = arm_compute::support::cpp14::make_unique<CLStridedSliceKernel>();
-    k->configure(input, output, starts, ends_abs, BiStrides(), 0, 0, 0);
+    k->configure(input, output, starts, ends, BiStrides(), 0, slice_end_mask, 0);
     _kernel = std::move(k);
 }
 
@@ -54,8 +54,8 @@
     }));
 
     // Get absolute end coordinates
-    const Coordinates ends_abs = arm_compute::helpers::tensor_transform::slice_absolute_end_coords(input->tensor_shape(), ends);
+    const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends);
 
-    return CLStridedSliceKernel::validate(input, output, starts, ends_abs, BiStrides(), 0, 0, 0);
+    return CLStridedSliceKernel::validate(input, output, starts, ends, BiStrides(), 0, slice_end_mask, 0);
 }
 } // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
index 76c1e18..a24b72e 100644
--- a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
+++ b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
@@ -33,20 +33,19 @@
 namespace arm_compute
 {
 CLSpaceToBatchLayer::CLSpaceToBatchLayer()
-    : _space_to_batch_kernel(), _output(nullptr), _has_padding(false)
+    : _space_to_batch_kernel(), _memset_kernel(), _has_padding(false)
 {
 }
 
 void CLSpaceToBatchLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
 
     if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
     {
         _has_padding = true;
+        _memset_kernel.configure(output, PixelValue());
     }
-
-    _output = output;
     _space_to_batch_kernel.configure(input, block_shape, paddings, output);
 }
 
@@ -57,42 +56,35 @@
     if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
     {
         _has_padding = true;
+        _memset_kernel.configure(output, PixelValue());
     }
-
-    _output = output;
     _space_to_batch_kernel.configure(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
 }
 
 Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
 {
-    return CLSpaceToBatchLayerKernel::validate(input, block_shape, paddings, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(output, PixelValue()));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape, paddings, output));
+
+    return Status{};
 }
 
 Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
                                      const ITensorInfo *output)
 {
-    return CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(output, PixelValue()));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+
+    return Status{};
 }
 
 void CLSpaceToBatchLayer::run()
 {
     // Zero out output only if we have paddings
-    // TODO(micspy01): replace with memset once ready
     if(_has_padding)
     {
-        _output->map(CLScheduler::get().queue(), true);
-        if(is_data_type_quantized_asymmetric(_output->info()->data_type()))
-        {
-            const uint8_t quantized_zero = _output->info()->quantization_info().offset;
-            std::fill_n(_output->buffer(), _output->info()->total_size(), quantized_zero);
-        }
-        else
-        {
-            memset(_output->buffer(), 0, _output->info()->total_size());
-        }
-        _output->unmap(CLScheduler::get().queue());
+        CLScheduler::get().enqueue(_memset_kernel, true);
     }
-
     CLScheduler::get().enqueue(_space_to_batch_kernel, true);
 }
 } // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLStackLayer.cpp b/src/runtime/CL/functions/CLStackLayer.cpp
new file mode 100644
index 0000000..71327fe
--- /dev/null
+++ b/src/runtime/CL/functions/CLStackLayer.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <complex>
+
+#include "arm_compute/runtime/CL/functions/CLStackLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLStackLayer::CLStackLayer() // NOLINT
+    : _input(),
+      _stack_kernels(),
+      _num_inputs(0)
+{
+}
+
+void CLStackLayer::configure(const std::vector<ICLTensor *> &input, int axis, ICLTensor *output)
+{
+    _num_inputs    = input.size();
+    _stack_kernels = arm_compute::support::cpp14::make_unique<CLStackLayerKernel[]>(_num_inputs);
+
+    // Wrap around negative values
+    const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1));
+
+    for(unsigned int i = 0; i < _num_inputs; i++)
+    {
+        _stack_kernels[i].configure(input[i], axis_u, i, _num_inputs, output);
+    }
+}
+
+Status CLStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input.empty());
+
+    // Wrap around negative values
+    const size_t       rank   = input[0]->num_dimensions();
+    const unsigned int axis_u = wrap_around(axis, static_cast<int>(rank + 1));
+
+    const unsigned int num_inputs = input.size();
+
+    for(unsigned int i = 0; i < num_inputs; i++)
+    {
+        // All the tensors must have the same rank
+        ARM_COMPUTE_RETURN_ERROR_ON(input[i]->num_dimensions() != rank);
+        // Validate Kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(CLStackLayerKernel::validate(input[i], axis_u, i, num_inputs, output));
+    }
+
+    return Status{};
+}
+
+void CLStackLayer::run()
+{
+    for(unsigned i = 0; i < _num_inputs; i++)
+    {
+        CLScheduler::get().enqueue(_stack_kernels[i], false);
+    }
+}
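A minimal sketch of the new CLStackLayer function (hypothetical tensors of identical shape; negative axes are wrapped inside configure as shown above):

    std::vector<ICLTensor *> inputs = { &in0, &in1, &in2 };
    CLStackLayer stack;
    // Stack the three inputs along a new leading axis; the output rank is one
    // higher than the input rank.
    stack.configure(inputs, 0, &dst);
    stack.run();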
diff --git a/src/runtime/CL/functions/CLTile.cpp b/src/runtime/CL/functions/CLTile.cpp
new file mode 100644
index 0000000..ec6a4ab
--- /dev/null
+++ b/src/runtime/CL/functions/CLTile.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLTile.h"
+
+#include "arm_compute/core/CL/kernels/CLTileKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CLTile::configure(const ICLTensor *input, ICLTensor *output, const Multiples &multiples)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLTileKernel>();
+    k->configure(input, output, multiples);
+    _kernel = std::move(k);
+}
+
+Status CLTile::validate(const ITensorInfo *input, const ITensorInfo *output, const Multiples &multiples)
+{
+    return CLTileKernel::validate(input, output, multiples);
+}
+} // namespace arm_compute
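A minimal sketch of the new CLTile function (hypothetical tensors; Multiples holds one repeat count per dimension):

    CLTile tile;
    // Repeat src twice along dimension 0 and three times along dimension 1.
    tile.configure(&src, &dst, Multiples{ 2, 3 });
    tile.run();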
diff --git a/src/runtime/CL/functions/CLUnstack.cpp b/src/runtime/CL/functions/CLUnstack.cpp
new file mode 100644
index 0000000..428d091
--- /dev/null
+++ b/src/runtime/CL/functions/CLUnstack.cpp
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLUnstack.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+namespace arm_compute
+{
+namespace
+{
+inline unsigned int wrap_axis(int axis, const ITensorInfo *const tensor)
+{
+    return wrap_around(axis, static_cast<int>(tensor->num_dimensions()));
+}
+
+inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &slice_end_mask, const unsigned int input_num_dimensions)
+{
+    // Sets up the slicing coordinates: all start coordinates are zeroed and every end coordinate is set to -1, from which the end mask is built; the caller then sets the start of the unstacking axis to slice one 2D tensor at a time.
+    Coordinates slice_end;
+    slice_start.set_num_dimensions(input_num_dimensions);
+    slice_end.set_num_dimensions(input_num_dimensions);
+    for(size_t k = 0; k < input_num_dimensions; ++k)
+    {
+        slice_start.set(k, 0);
+        slice_end.set(k, -1);
+    }
+    slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(slice_end);
+}
+} // namespace
+
+CLUnstack::CLUnstack() // NOLINT
+    : _num_slices(0),
+      _strided_slice_vector()
+{
+}
+
+void CLUnstack::configure(const ICLTensor *input, const std::vector<ICLTensor *> &output_vector, int axis)
+{
+    std::vector<ITensorInfo *> outputs_vector_info(output_vector.size());
+    std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), [](ICLTensor * t)
+    {
+        ARM_COMPUTE_ERROR_ON_NULLPTR(t);
+        return t->info();
+    });
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+    ARM_COMPUTE_ERROR_THROW_ON(CLUnstack::validate(input->info(), outputs_vector_info, axis));
+
+    // Wrap around negative values
+    const unsigned int axis_u = wrap_axis(axis, input->info());
+    _num_slices               = std::min(outputs_vector_info.size(), input->info()->dimension(axis_u));
+    _strided_slice_vector     = arm_compute::support::cpp14::make_unique<CLStridedSlice[]>(_num_slices);
+
+    Coordinates slice_start;
+    int32_t     slice_end_mask;
+    setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->info()->tensor_shape().num_dimensions());
+    for(unsigned int slice = 0; slice < _num_slices; ++slice)
+    {
+        // Adjust the start coordinate on the unstacking axis to take one 2D slice at a time (the end is covered by slice_end_mask)
+        slice_start.set(axis_u, slice);
+        _strided_slice_vector[slice].configure(input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u));
+    }
+}
+
+Status CLUnstack::validate(const ITensorInfo *input, const std::vector<ITensorInfo *> &output_vector, int axis)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+    ARM_COMPUTE_RETURN_ERROR_ON(output_vector.empty());
+    ARM_COMPUTE_RETURN_ERROR_ON(axis < (-static_cast<int>(input->tensor_shape().num_dimensions())));
+    ARM_COMPUTE_RETURN_ERROR_ON(axis >= static_cast<int>(input->tensor_shape().num_dimensions()));
+    const unsigned int num_slices = std::min(output_vector.size(), input->dimension(wrap_axis(axis, input)));
+    ARM_COMPUTE_RETURN_ERROR_ON(num_slices > input->dimension(wrap_axis(axis, input)));
+    ARM_COMPUTE_RETURN_ERROR_ON(num_slices > output_vector.size());
+    Coordinates slice_start;
+    int32_t     slice_end_mask;
+    for(size_t k = 0; k < num_slices; ++k)
+    {
+        slice_start.set(wrap_axis(axis, input), k);
+        setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->tensor_shape().num_dimensions());
+        ARM_COMPUTE_RETURN_ON_ERROR(CLStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << wrap_axis(axis, input))));
+    }
+    return Status{};
+}
+
+void CLUnstack::run()
+{
+    for(unsigned i = 0; i < _num_slices; ++i)
+    {
+        _strided_slice_vector[i].run();
+    }
+}
+
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
index 46a2d80..d0801a6 100644
--- a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,8 +50,8 @@
     ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2);
 
     // Output auto inizialitation if not yet initialized
-    TensorInfo  tmp_output_info = *output->clone();
-    TensorShape output_shape    = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+    TensorInfo        tmp_output_info = *output->clone();
+    const TensorShape output_shape    = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
     auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
 
     switch(num_inputs)
@@ -90,7 +90,7 @@
     {
         inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
     }
-    TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+    const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
 
     // Output auto inizialitation if not yet initialized
     auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index 1abcb67..069196e 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -84,8 +84,8 @@
 } // namespace
 
 CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(memory_manager), _batched_mm(memory_manager), _input_transform(), _filter_transform(), _output_transform(), _activationlayer_function(), _input0(), _input1(), _batched_mm_output(),
-      _original_weights(nullptr), _is_prepared(false), _is_activationlayer_enabled(false)
+    : _memory_group(memory_manager), _batched_mm(memory_manager), _input_transform(), _filter_transform(), _output_transform(), _input0(), _input1(), _batched_mm_output(), _original_weights(nullptr),
+      _is_prepared(false)
 {
 }
 
@@ -133,14 +133,7 @@
                                                                                                  (input->info()->data_type() == DataType::F16)));
 
     // Configure output transform
-    _output_transform.configure(&_batched_mm_output, biases, output, winograd_info);
-
-    // Configure activation layer
-    _is_activationlayer_enabled = act_info.enabled();
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.configure(output, nullptr, act_info);
-    }
+    _output_transform.configure(&_batched_mm_output, biases, output, winograd_info, act_info);
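+    // The activation is now fused into the Winograd output transform, so no separate activation layer is configured or run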
 
     // Allocate temporary tensors
     _input0.allocator()->allocate();
@@ -216,11 +209,6 @@
     // Run output transform
     CLScheduler::get().enqueue(_output_transform);
 
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.run();
-    }
-
     _memory_group.release();
 }
 
diff --git a/src/runtime/CL/functions/CLWinogradInputTransform.cpp b/src/runtime/CL/functions/CLWinogradInputTransform.cpp
index 09e8456..7361eb2 100644
--- a/src/runtime/CL/functions/CLWinogradInputTransform.cpp
+++ b/src/runtime/CL/functions/CLWinogradInputTransform.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,7 +35,7 @@
     auto k = arm_compute::support::cpp14::make_unique<CLWinogradInputTransformKernel>();
     k->configure(input, output, winograd_info);
     _kernel = std::move(k);
-    _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue(0));
+    _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue());
 }
 
 Status CLWinogradInputTransform::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
diff --git a/src/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.cpp b/src/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.cpp
new file mode 100644
index 0000000..cd97849
--- /dev/null
+++ b/src/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.h"
+
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+namespace
+{
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_gemm_reshaped(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0,
+                                                                        bool lhs_interleave, bool rhs_interleave)
+{
+    GEMMLHSMatrixInfo lhs_info;
+    GEMMRHSMatrixInfo rhs_info;
+
+    // Configure GEMMLHSMatrixInfo
+    lhs_info.m0         = m0;
+    lhs_info.k0         = k0;
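+    // If m is smaller than m0 * v0 there are not enough row blocks to interleave, so fall back to v0 = 1 (the same logic caps h0 below)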
+    lhs_info.v0         = ((m / (lhs_info.m0 * v0)) == 0) ? 1 : v0;
+    lhs_info.interleave = lhs_interleave;
+    lhs_info.transpose  = false;
+
+    // Configure GEMMRHSMatrixInfo
+    rhs_info.n0         = n0;
+    rhs_info.k0         = lhs_info.k0;
+    rhs_info.h0         = ((n / (rhs_info.n0 * h0)) == 0) ? 1 : h0;
+    rhs_info.interleave = rhs_interleave;
+    rhs_info.transpose  = true;
+
+    return std::make_pair(lhs_info, rhs_info);
+}
+
+} // namespace
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+{
+    ARM_COMPUTE_ERROR_ON(data_type != DataType::F32 && data_type != DataType::QASYMM8);
+    ARM_COMPUTE_UNUSED(data_type);
+
+    const GPUTarget gpu_target = CLScheduler::get().target();
+
+    using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (CLGEMMReshapedConfigurationBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+
+    // Configurations for Mali-G76
+    static std::map<DataType, ConfigurationFunctionExecutorPtr> gemm_reshaped_configs_G76 =
+    {
+        { DataType::F32, &CLGEMMReshapedConfigurationBifrost::configure_G76_f32 },
+        { DataType::QASYMM8, &CLGEMMReshapedConfigurationBifrost::configure_G76_u8 }
+    };
+
+    // Configurations for Mali-G7x
+    static std::map<DataType, ConfigurationFunctionExecutorPtr> gemm_reshaped_configs_G7x =
+    {
+        { DataType::F32, &CLGEMMReshapedConfigurationBifrost::configure_G7x_f32 },
+        { DataType::QASYMM8, &CLGEMMReshapedConfigurationBifrost::configure_G7x_u8 }
+    };
+
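+    // Mali-G76 has its own tuned reshaped-GEMM configurations; every other Bifrost GPU falls back to the generic G7x ones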
+    switch(gpu_target)
+    {
+        case GPUTarget::G76:
+            return (this->*gemm_reshaped_configs_G76[data_type])(m, n, k, b);
+        default:
+            return (this->*gemm_reshaped_configs_G7x[data_type])(m, n, k, b);
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if(n <= 4)
+    {
+        return configure_gemm_reshaped(m, n, 4, 2, 8, 16, 16, true, false);
+    }
+    else
+    {
+        return configure_gemm_reshaped(m, n, 5, 4, 4, 2, 16, false, true);
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if(dot8_supported(CLKernelLibrary::get().get_device()))
+    {
+        if(n <= 4)
+        {
+            return configure_gemm_reshaped(m, n, 4, 2, 16, 2, 2, true, false);
+        }
+        else
+        {
+            return configure_gemm_reshaped(m, n, 4, 4, 16, 2, 2, true, false);
+        }
+    }
+    else
+    {
+        if(n <= 4)
+        {
+            return configure_gemm_reshaped(m, n, 4, 2, 8, 2, 2, true, false);
+        }
+        else
+        {
+            return configure_gemm_reshaped(m, n, 6, 4, 4, 2, 2, true, true);
+        }
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if(n <= 4)
+    {
+        return configure_gemm_reshaped(m, n, 4, 2, 8, 16, 16, true, false);
+    }
+    else
+    {
+        return configure_gemm_reshaped(m, n, 4, 4, 2, 8, 16, false, false);
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if(n <= 4)
+    {
+        return configure_gemm_reshaped(m, n, 4, 2, 16, 4, 1, false, false);
+    }
+    else
+    {
+        return configure_gemm_reshaped(m, n, 4, 4, 16, 2, 2, false, true);
+    }
+}
+} // namespace cl_gemm
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
index 2b179fd..5916bb4 100644
--- a/src/runtime/CPP/CPPScheduler.cpp
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -190,15 +190,19 @@
             return;
         }
 
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
         try
         {
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
             process_workloads(*_workloads, *_feeder, _info);
+
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
         }
         catch(...)
         {
             _current_exception = std::current_exception();
         }
-
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
         _job_complete = true;
         lock.unlock();
         _cv.notify_one();
@@ -250,18 +254,21 @@
 
     info.thread_id = t;
     process_workloads(workloads, feeder, info);
-
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
     try
     {
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
         for(auto &thread : _threads)
         {
             thread.wait();
         }
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
     }
     catch(const std::system_error &e)
     {
         std::cerr << "Caught system_error with code " << e.code() << " meaning " << e.what() << '\n';
     }
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
 }
 #endif /* DOXYGEN_SKIP_THIS */
 
diff --git a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
new file mode 100644
index 0000000..79e619c
--- /dev/null
+++ b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
@@ -0,0 +1,682 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
+
+#include <list>
+
+namespace arm_compute
+{
+namespace
+{
+Status detection_layer_validate_arguments(const ITensorInfo *input_loc, const ITensorInfo *input_conf, const ITensorInfo *input_priorbox, const ITensorInfo *output, DetectionOutputLayerInfo info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input_loc, input_conf, input_priorbox, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_loc, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_loc, input_conf, input_priorbox);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_loc->num_dimensions() > 2, "The location input tensor should be [C1, N].");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_conf->num_dimensions() > 2, "The confidence input tensor should be [C2, N].");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_priorbox->num_dimensions() > 3, "The priorbox input tensor should be [C3, 2, N].");
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.eta() <= 0.f || info.eta() > 1.f, "Eta should be between 0 and 1");
+
+    const int num_priors = input_priorbox->tensor_shape()[0] / 4;
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast<size_t>((num_priors * info.num_loc_classes() * 4)) != input_loc->tensor_shape()[0], "Number of priors must match number of location predictions.");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast<size_t>((num_priors * info.num_classes())) != input_conf->tensor_shape()[0], "Number of priors must match number of confidence predictions.");
+
+    // Validate configured output
+    if(output->total_size() != 0)
+    {
+        const unsigned int max_size = info.keep_top_k() * (input_loc->num_dimensions() > 1 ? input_loc->dimension(1) : 1);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), TensorShape(7U, max_size));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_loc, output);
+    }
+
+    return Status{};
+}
+
+/** Function used to sort pair<float, T> in descending order based on the score (first) value.
+ */
+template <typename T>
+bool SortScorePairDescend(const std::pair<float, T> &pair1,
+                          const std::pair<float, T> &pair2)
+{
+    return pair1.first > pair2.first;
+}
+
+/** Get location predictions from input_loc.
+ *
+ * @param[in]  input_loc                The input location prediction.
+ * @param[in]  num                      The number of images.
+ * @param[in]  num_priors               Number of predictions per class.
+ * @param[in]  num_loc_classes          Number of location classes. It is 1 if share_location is true,
+ *                                      otherwise it is equal to the number of classes to predict.
+ * @param[in]  share_location           If true, all classes share the same location prediction.
+ * @param[out] all_location_predictions All the location predictions.
+ *
+ */
+void retrieve_all_loc_predictions(const ITensor *input_loc, const int num,
+                                  const int num_priors, const int num_loc_classes,
+                                  const bool share_location, std::vector<LabelBBox> &all_location_predictions)
+{
+    for(int i = 0; i < num; ++i)
+    {
+        for(int c = 0; c < num_loc_classes; ++c)
+        {
+            int label = share_location ? -1 : c;
+            if(all_location_predictions[i].find(label) == all_location_predictions[i].end())
+            {
+                all_location_predictions[i][label].resize(num_priors);
+            }
+            else
+            {
+                ARM_COMPUTE_ERROR_ON(all_location_predictions[i][label].size() != static_cast<size_t>(num_priors));
+                break;
+            }
+        }
+    }
+    for(int i = 0; i < num; ++i)
+    {
+        for(int p = 0; p < num_priors; ++p)
+        {
+            for(int c = 0; c < num_loc_classes; ++c)
+            {
+                const int label    = share_location ? -1 : c;
+                const int base_ptr = i * num_priors * num_loc_classes * 4 + p * num_loc_classes * 4 + c * 4;
+                //xmin, ymin, xmax, ymax
+                all_location_predictions[i][label][p][0] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr)));
+                all_location_predictions[i][label][p][1] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 1)));
+                all_location_predictions[i][label][p][2] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 2)));
+                all_location_predictions[i][label][p][3] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 3)));
+            }
+        }
+    }
+}
+
+/** Get confidence predictions from input_conf.
+ *
+ * @param[in]  input_conf            The input confidence prediction.
+ * @param[in]  num                   The number of images.
+ * @param[in]  num_priors            Number of predictions per class.
+ * @param[in]  num_classes           Number of classes.
+ * @param[out] all_confidence_scores All the confidence scores, stored as one map from
+ *                                   class index to per-prior scores for each image.
+ *
+ */
+void retrieve_all_conf_scores(const ITensor *input_conf, const int num,
+                              const int num_priors, const int                 num_classes,
+                              std::vector<std::map<int, std::vector<float>>> &all_confidence_scores)
+{
+    std::vector<float> tmp_buffer;
+    tmp_buffer.resize(num * num_priors * num_classes);
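+    // Re-order the scores from the input layout (classes contiguous within each prior) into a
+    // per-class, prior-contiguous buffer so that each class vector can be assigned in one copy below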
+    for(int i = 0; i < num; ++i)
+    {
+        for(int c = 0; c < num_classes; ++c)
+        {
+            for(int p = 0; p < num_priors; ++p)
+            {
+                tmp_buffer[i * num_classes * num_priors + c * num_priors + p] =
+                    *reinterpret_cast<float *>(input_conf->ptr_to_element(Coordinates(i * num_classes * num_priors + p * num_classes + c)));
+            }
+        }
+    }
+    for(int i = 0; i < num; ++i)
+    {
+        for(int c = 0; c < num_classes; ++c)
+        {
+            all_confidence_scores[i][c].resize(num_priors);
+            all_confidence_scores[i][c].assign(&tmp_buffer[i * num_classes * num_priors + c * num_priors],
+                                               &tmp_buffer[i * num_classes * num_priors + c * num_priors + num_priors]);
+        }
+    }
+}
+
+/** Get prior boxes from input_priorbox.
+ *
+ * @param[in]  input_priorbox      The input prior boxes.
+ * @param[in]  num_priors          Number of priors.
+ * @param[out] all_prior_bboxes    All the prior bounding boxes, one NormalizedBBox
+ *                                 (xmin, ymin, xmax, ymax) per prior.
+ * @param[out] all_prior_variances The variances of all the prior bounding boxes,
+ *                                 four values per prior.
+ *
+ */
+void retrieve_all_priorbox(const ITensor               *input_priorbox,
+                           const int                    num_priors,
+                           std::vector<NormalizedBBox> &all_prior_bboxes,
+                           std::vector<std::array<float, 4>> &all_prior_variances)
+{
+    for(int i = 0; i < num_priors; ++i)
+    {
+        all_prior_bboxes[i] =
+        {
+            {
+                *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4))),
+                *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 1))),
+                *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 2))),
+                *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 3)))
+            }
+        };
+    }
+
+    std::array<float, 4> var({ { 0, 0, 0, 0 } });
+    for(int i = 0; i < num_priors; ++i)
+    {
+        for(int j = 0; j < 4; ++j)
+        {
+            var[j] = *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates((num_priors + i) * 4 + j)));
+        }
+        all_prior_variances[i] = var;
+    }
+}
+
+/** Decode a bbox according to a prior bbox.
+ *
+ * @param[in]  prior_bbox                 The input prior bounding boxes.
+ * @param[in]  prior_variance             The corresponding input variance.
+ * @param[in]  code_type                  The detection output code type used to decode the results.
+ * @param[in]  variance_encoded_in_target If true, the variance is encoded in target.
+ * @param[in]  clip_bbox                  If true, the results should be between 0.f and 1.f.
+ * @param[in]  bbox                       The input bbox to decode
+ * @param[out] decode_bbox                The decoded bboxes.
+ *
+ */
+void DecodeBBox(const NormalizedBBox &prior_bbox, const std::array<float, 4> &prior_variance,
+                const DetectionOutputLayerCodeType code_type, const bool variance_encoded_in_target,
+                const bool clip_bbox, const NormalizedBBox &bbox, NormalizedBBox &decode_bbox)
+{
+    // if the variance is encoded in target, we simply need to add the offset predictions
+    // otherwise we need to scale the offset accordingly.
+    switch(code_type)
+    {
+        case DetectionOutputLayerCodeType::CORNER:
+        {
+            decode_bbox[0] = prior_bbox[0] + (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]);
+            decode_bbox[1] = prior_bbox[1] + (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]);
+            decode_bbox[2] = prior_bbox[2] + (variance_encoded_in_target ? bbox[2] : prior_variance[2] * bbox[2]);
+            decode_bbox[3] = prior_bbox[3] + (variance_encoded_in_target ? bbox[3] : prior_variance[3] * bbox[3]);
+
+            break;
+        }
+        case DetectionOutputLayerCodeType::CENTER_SIZE:
+        {
+            const float prior_width  = prior_bbox[2] - prior_bbox[0];
+            const float prior_height = prior_bbox[3] - prior_bbox[1];
+
+            // Check if the prior width and height are right
+            ARM_COMPUTE_ERROR_ON(prior_width <= 0.f);
+            ARM_COMPUTE_ERROR_ON(prior_height <= 0.f);
+
+            const float prior_center_x = (prior_bbox[0] + prior_bbox[2]) / 2.;
+            const float prior_center_y = (prior_bbox[1] + prior_bbox[3]) / 2.;
+
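+            // Centre offsets are expressed relative to the prior size, while width/height offsets are in log space, hence the exp()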
+            const float decode_bbox_center_x = (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width + prior_center_x;
+            const float decode_bbox_center_y = (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height + prior_center_y;
+            const float decode_bbox_width    = (variance_encoded_in_target ? std::exp(bbox[2]) : std::exp(prior_variance[2] * bbox[2])) * prior_width;
+            const float decode_bbox_height   = (variance_encoded_in_target ? std::exp(bbox[3]) : std::exp(prior_variance[3] * bbox[3])) * prior_height;
+
+            decode_bbox[0] = (decode_bbox_center_x - decode_bbox_width / 2.f);
+            decode_bbox[1] = (decode_bbox_center_y - decode_bbox_height / 2.f);
+            decode_bbox[2] = (decode_bbox_center_x + decode_bbox_width / 2.f);
+            decode_bbox[3] = (decode_bbox_center_y + decode_bbox_height / 2.f);
+
+            break;
+        }
+        case DetectionOutputLayerCodeType::CORNER_SIZE:
+        {
+            const float prior_width  = prior_bbox[2] - prior_bbox[0];
+            const float prior_height = prior_bbox[3] - prior_bbox[1];
+
+            // Check if the prior width and height are greater than 0
+            ARM_COMPUTE_ERROR_ON(prior_width <= 0.f);
+            ARM_COMPUTE_ERROR_ON(prior_height <= 0.f);
+
+            decode_bbox[0] = prior_bbox[0] + (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width;
+            decode_bbox[1] = prior_bbox[1] + (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height;
+            decode_bbox[2] = prior_bbox[2] + (variance_encoded_in_target ? bbox[2] : prior_variance[2] * bbox[2]) * prior_width;
+            decode_bbox[3] = prior_bbox[3] + (variance_encoded_in_target ? bbox[3] : prior_variance[3] * bbox[3]) * prior_height;
+
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Unsupported Detection Output Code Type.");
+    }
+
+    if(clip_bbox)
+    {
+        for(auto &d_bbox : decode_bbox)
+        {
+            d_bbox = utility::clamp(d_bbox, 0.f, 1.f);
+        }
+    }
+}
+
+/** Do non maximum suppression given bboxes and scores.
+ *
+ * @param[in]  bboxes          The input bounding boxes.
+ * @param[in]  scores          The corresponding input confidence.
+ * @param[in]  score_threshold The threshold used to filter detection results.
+ * @param[in]  nms_threshold   The threshold used in non maximum suppression.
+ * @param[in]  eta             Adaptation rate for nms threshold.
+ * @param[in]  top_k           If not -1, keep at most top_k picked indices.
+ * @param[out] indices         The kept indices of bboxes after nms.
+ *
+ */
+void ApplyNMSFast(const std::vector<NormalizedBBox> &bboxes,
+                  const std::vector<float> &scores, const float score_threshold,
+                  const float nms_threshold, const float eta, const int top_k,
+                  std::vector<int> &indices)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(bboxes.size() != scores.size(), "bboxes and scores have different size.");
+
+    // Get top_k scores (with corresponding indices).
+    std::list<std::pair<float, int>> score_index_vec;
+
+    // Generate index score pairs.
+    for(size_t i = 0; i < scores.size(); ++i)
+    {
+        if(scores[i] > score_threshold)
+        {
+            score_index_vec.emplace_back(std::make_pair(scores[i], i));
+        }
+    }
+
+    // Sort the score pair according to the scores in descending order
+    score_index_vec.sort(SortScorePairDescend<int>);
+
+    // Keep top_k scores if needed.
+    const int score_index_vec_size = score_index_vec.size();
+    if(top_k > -1 && top_k < score_index_vec_size)
+    {
+        score_index_vec.resize(top_k);
+    }
+
+    // Do nms.
+    float adaptive_threshold = nms_threshold;
+    indices.clear();
+
+    while(!score_index_vec.empty())
+    {
+        const int idx  = score_index_vec.front().second;
+        bool      keep = true;
+        for(int kept_idx : indices)
+        {
+            if(keep)
+            {
+                // Compute the Jaccard (intersection over union, IoU) overlap between the two bboxes.
+                NormalizedBBox intersect_bbox = std::array<float, 4>({ { 0, 0, 0, 0 } });
+                if(bboxes[kept_idx][0] > bboxes[idx][2] || bboxes[kept_idx][2] < bboxes[idx][0] || bboxes[kept_idx][1] > bboxes[idx][3] || bboxes[kept_idx][3] < bboxes[idx][1])
+                {
+                    intersect_bbox = std::array<float, 4>({ { 0, 0, 0, 0 } });
+                }
+                else
+                {
+                    intersect_bbox = std::array<float, 4>({ {
+                            std::max(bboxes[idx][0], bboxes[kept_idx][0]),
+                            std::max(bboxes[idx][1], bboxes[kept_idx][1]),
+                            std::min(bboxes[idx][2], bboxes[kept_idx][2]),
+                            std::min(bboxes[idx][3], bboxes[kept_idx][3])
+                        }
+                    });
+                }
+
+                float intersect_width  = intersect_bbox[2] - intersect_bbox[0];
+                float intersect_height = intersect_bbox[3] - intersect_bbox[1];
+
+                float overlap = 0.f;
+                if(intersect_width > 0 && intersect_height > 0)
+                {
+                    float intersect_size = intersect_width * intersect_height;
+                    float bbox1_size     = (bboxes[idx][2] < bboxes[idx][0]
+                                            || bboxes[idx][3] < bboxes[idx][1]) ?
+                                           0.f :
+                                           (bboxes[idx][2] - bboxes[idx][0]) * (bboxes[idx][3] - bboxes[idx][1]); //BBoxSize(bboxes[idx]);
+                    float bbox2_size = (bboxes[kept_idx][2] < bboxes[kept_idx][0]
+                                        || bboxes[kept_idx][3] < bboxes[kept_idx][1]) ?
+                                       0.f :
+                                       (bboxes[kept_idx][2] - bboxes[kept_idx][0]) * (bboxes[kept_idx][3] - bboxes[kept_idx][1]); // BBoxSize(bboxes[kept_idx]);
+                    overlap = intersect_size / (bbox1_size + bbox2_size - intersect_size);
+                }
+                keep = (overlap <= adaptive_threshold);
+            }
+            else
+            {
+                break;
+            }
+        }
+        if(keep)
+        {
+            indices.push_back(idx);
+        }
+        score_index_vec.erase(score_index_vec.begin());
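+        // When eta < 1, tighten the IoU threshold after each kept box while it is still above 0.5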
+        if(keep && eta < 1.f && adaptive_threshold > 0.5f)
+        {
+            adaptive_threshold *= eta;
+        }
+    }
+}
+
+Status non_max_suppression_validate_arguments(const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *indices, unsigned int max_output_size,
+                                              const float score_threshold, const float nms_threshold)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(bboxes, scores, indices);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bboxes, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(bboxes->num_dimensions() > 2, "The bboxes tensor must be a 2-D float tensor of shape [4, num_boxes].");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(scores->num_dimensions() > 1, "The scores tensor must be a 1-D float tensor of shape [num_boxes].");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(indices->num_dimensions() > 1, "The indices must be a 1-D integer tensor of shape [M], where max_output_size <= M");
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(bboxes, scores);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(scores->num_dimensions() > 1, "Scores must be a 1D float tensor");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(indices->dimension(0) == 0, "Indices tensor must be bigger than 0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(max_output_size == 0, "Max size cannot be 0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(nms_threshold < 0.f || nms_threshold > 1.f, "Threshold must be in [0,1]");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(score_threshold < 0.f || score_threshold > 1.f, "Threshold must be in [0,1]");
+
+    return Status{};
+}
+} // namespace
+
+CPPNonMaximumSuppression::CPPNonMaximumSuppression()
+    : _bboxes(nullptr), _scores(nullptr), _indices(nullptr), _max_output_size(0), _score_threshold(0.f), _nms_threshold(0.f)
+{
+}
+
+void CPPNonMaximumSuppression::configure(
+    const ITensor *bboxes, const ITensor *scores, ITensor *indices, unsigned int max_output_size,
+    const float score_threshold, const float nms_threshold)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(bboxes, scores, indices);
+    ARM_COMPUTE_ERROR_THROW_ON(non_max_suppression_validate_arguments(bboxes->info(), scores->info(), indices->info(), max_output_size, score_threshold, nms_threshold));
+
+    // Store the input tensors; the boxes and scores are copied into vectors when run() is called
+    _bboxes  = bboxes;
+    _scores  = scores;
+    _indices = indices;
+
+    _nms_threshold   = nms_threshold;
+    _max_output_size = max_output_size;
+    _score_threshold = score_threshold;
+}
+
+Status CPPNonMaximumSuppression::validate(
+    const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *indices, unsigned int max_output_size,
+    const float score_threshold, const float nms_threshold)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(non_max_suppression_validate_arguments(bboxes, scores, indices, max_output_size, score_threshold, nms_threshold));
+    return Status{};
+}
+
+void extract_bounding_boxes_from_tensor(const ITensor *bboxes, std::vector<NormalizedBBox> &bboxes_vector)
+{
+    Window input_win;
+    input_win.use_tensor_dimensions(bboxes->info()->tensor_shape());
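+    // Step four elements along dimension 0 so that each window iteration visits one box (xmin, ymin, xmax, ymax)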
+    input_win.set_dimension_step(0U, 4U);
+    input_win.set_dimension_step(1U, 1U);
+    Iterator input(bboxes, input_win);
+    auto     f = [&bboxes_vector, &input](const Coordinates &)
+    {
+        const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
+        bboxes_vector.push_back(NormalizedBBox({ { *input_ptr, *(input_ptr + 1), *(input_ptr + 2), *(input_ptr + 3) } }));
+    };
+    execute_window_loop(input_win, f, input);
+}
+
+void extract_scores_from_tensor(const ITensor *scores, std::vector<float> &scores_vector)
+{
+    Window window;
+    window.use_tensor_dimensions(scores->info()->tensor_shape());
+    Iterator it(scores, window);
+    auto     f = [&it, &scores_vector](const Coordinates &)
+    {
+        const auto input_ptr = reinterpret_cast<const float *>(it.ptr());
+        scores_vector.push_back(*input_ptr);
+    };
+    execute_window_loop(window, f, it);
+}
+
+void CPPNonMaximumSuppression::run()
+{
+    std::vector<NormalizedBBox> bboxes_vector;
+    std::vector<float>          scores_vector;
+    std::vector<int>            indices_vector;
+    extract_bounding_boxes_from_tensor(_bboxes, bboxes_vector);
+    extract_scores_from_tensor(_scores, scores_vector);
+    ApplyNMSFast(bboxes_vector, scores_vector, _score_threshold, _nms_threshold, 1, -1 /* disable top_k */, indices_vector);
+    std::copy_n(indices_vector.begin(), std::min(indices_vector.size(), _indices->info()->dimension(0)), reinterpret_cast<int *>(_indices->ptr_to_element(Coordinates(0))));
+}
+
+CPPDetectionOutputLayer::CPPDetectionOutputLayer()
+    : _input_loc(nullptr), _input_conf(nullptr), _input_priorbox(nullptr), _output(nullptr), _info(), _num_priors(), _num(), _all_location_predictions(), _all_confidence_scores(), _all_prior_bboxes(),
+      _all_prior_variances(), _all_decode_bboxes(), _all_indices()
+{
+}
+
+void CPPDetectionOutputLayer::configure(const ITensor *input_loc, const ITensor *input_conf, const ITensor *input_priorbox, ITensor *output, DetectionOutputLayerInfo info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input_loc, input_conf, input_priorbox, output);
+    // Output auto initialization if not yet initialized
+    // Since the number of bboxes to keep is unknown before NMS, the shape is set to the maximum
+    // The maximum is keep_top_k * input_loc_size[1]
+    // Each row is a 7-element entry which stores [image_id, label, confidence, xmin, ymin, xmax, ymax]
+    const unsigned int max_size = info.keep_top_k() * (input_loc->info()->num_dimensions() > 1 ? input_loc->info()->dimension(1) : 1);
+    auto_init_if_empty(*output->info(), input_loc->info()->clone()->set_tensor_shape(TensorShape(7U, max_size)));
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(detection_layer_validate_arguments(input_loc->info(), input_conf->info(), input_priorbox->info(), output->info(), info));
+
+    _input_loc      = input_loc;
+    _input_conf     = input_conf;
+    _input_priorbox = input_priorbox;
+    _output         = output;
+    _info           = info;
+    _num_priors     = input_priorbox->info()->dimension(0) / 4;
+    _num            = (_input_loc->info()->num_dimensions() > 1 ? _input_loc->info()->dimension(1) : 1);
+
+    _all_location_predictions.resize(_num);
+    _all_confidence_scores.resize(_num);
+    _all_prior_bboxes.resize(_num_priors);
+    _all_prior_variances.resize(_num_priors);
+    _all_decode_bboxes.resize(_num);
+
+    for(int i = 0; i < _num; ++i)
+    {
+        for(int c = 0; c < _info.num_loc_classes(); ++c)
+        {
+            const int label = _info.share_location() ? -1 : c;
+            if(label == _info.background_label_id())
+            {
+                // Ignore background class.
+                continue;
+            }
+            _all_decode_bboxes[i][label].resize(_num_priors);
+        }
+    }
+    _all_indices.resize(_num);
+
+    Coordinates coord;
+    coord.set_num_dimensions(output->info()->num_dimensions());
+    output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+}
+
+Status CPPDetectionOutputLayer::validate(const ITensorInfo *input_loc, const ITensorInfo *input_conf, const ITensorInfo *input_priorbox, const ITensorInfo *output, DetectionOutputLayerInfo info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(detection_layer_validate_arguments(input_loc, input_conf, input_priorbox, output, info));
+    return Status{};
+}
+
+void CPPDetectionOutputLayer::run()
+{
+    // Retrieve all location predictions.
+    retrieve_all_loc_predictions(_input_loc, _num, _num_priors, _info.num_loc_classes(), _info.share_location(), _all_location_predictions);
+
+    // Retrieve all confidences.
+    retrieve_all_conf_scores(_input_conf, _num, _num_priors, _info.num_classes(), _all_confidence_scores);
+
+    // Retrieve all prior bboxes.
+    retrieve_all_priorbox(_input_priorbox, _num_priors, _all_prior_bboxes, _all_prior_variances);
+
+    // Decode all loc predictions to bboxes
+    const bool clip_bbox = false;
+    for(int i = 0; i < _num; ++i)
+    {
+        for(int c = 0; c < _info.num_loc_classes(); ++c)
+        {
+            const int label = _info.share_location() ? -1 : c;
+            if(label == _info.background_label_id())
+            {
+                // Ignore background class.
+                continue;
+            }
+            ARM_COMPUTE_ERROR_ON_MSG(_all_location_predictions[i].find(label) == _all_location_predictions[i].end(), "Could not find location predictions for label %d.", label);
+
+            const std::vector<NormalizedBBox> &label_loc_preds = _all_location_predictions[i].find(label)->second;
+
+            const int num_bboxes = _all_prior_bboxes.size();
+            ARM_COMPUTE_ERROR_ON(_all_prior_variances[i].size() != 4);
+
+            for(int j = 0; j < num_bboxes; ++j)
+            {
+                DecodeBBox(_all_prior_bboxes[j], _all_prior_variances[j], _info.code_type(), _info.variance_encoded_in_target(), clip_bbox, label_loc_preds[j], _all_decode_bboxes[i][label][j]);
+            }
+        }
+    }
+
+    int num_kept = 0;
+
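+    // For each image: run per-class NMS on the decoded boxes, then keep at most keep_top_k detections across all classes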
+    for(int i = 0; i < _num; ++i)
+    {
+        const LabelBBox &decode_bboxes = _all_decode_bboxes[i];
+        const std::map<int, std::vector<float>> &conf_scores = _all_confidence_scores[i];
+
+        std::map<int, std::vector<int>> indices;
+        int num_det = 0;
+        for(int c = 0; c < _info.num_classes(); ++c)
+        {
+            if(c == _info.background_label_id())
+            {
+                // Ignore background class
+                continue;
+            }
+            const int label = _info.share_location() ? -1 : c;
+            if(conf_scores.find(c) == conf_scores.end() || decode_bboxes.find(label) == decode_bboxes.end())
+            {
+                ARM_COMPUTE_ERROR("Could not find predictions for label %d.", label);
+            }
+            const std::vector<float>          &scores = conf_scores.find(c)->second;
+            const std::vector<NormalizedBBox> &bboxes = decode_bboxes.find(label)->second;
+
+            ApplyNMSFast(bboxes, scores, _info.confidence_threshold(), _info.nms_threshold(), _info.eta(), _info.top_k(), indices[c]);
+
+            num_det += indices[c].size();
+        }
+
+        int num_to_add = 0;
+        if(_info.keep_top_k() > -1 && num_det > _info.keep_top_k())
+        {
+            std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
+            for(auto it : indices)
+            {
+                const int               label         = it.first;
+                const std::vector<int> &label_indices = it.second;
+
+                if(conf_scores.find(label) == conf_scores.end())
+                {
+                    ARM_COMPUTE_ERROR("Could not find predictions for label %d.", label);
+                }
+
+                const std::vector<float> &scores = conf_scores.find(label)->second;
+                for(auto idx : label_indices)
+                {
+                    ARM_COMPUTE_ERROR_ON(idx > static_cast<int>(scores.size()));
+                    score_index_pairs.push_back(std::make_pair(scores[idx], std::make_pair(label, idx)));
+                }
+            }
+
+            // Keep top k results per image.
+            std::sort(score_index_pairs.begin(), score_index_pairs.end(), SortScorePairDescend<std::pair<int, int>>);
+            score_index_pairs.resize(_info.keep_top_k());
+
+            // Store the new indices.
+
+            std::map<int, std::vector<int>> new_indices;
+            for(auto score_index_pair : score_index_pairs)
+            {
+                int label = score_index_pair.second.first;
+                int idx   = score_index_pair.second.second;
+                new_indices[label].push_back(idx);
+            }
+            _all_indices[i] = new_indices;
+            num_to_add      = _info.keep_top_k();
+        }
+        else
+        {
+            _all_indices[i] = indices;
+            num_to_add      = num_det;
+        }
+        num_kept += num_to_add;
+    }
+
+    // Update the valid region of the output to mark the exact number of detections
+    _output->info()->set_valid_region(ValidRegion(Coordinates(0, 0), TensorShape(7, num_kept)));
+
+    int count = 0;
+    for(int i = 0; i < _num; ++i)
+    {
+        const std::map<int, std::vector<float>> &conf_scores = _all_confidence_scores[i];
+        const LabelBBox &decode_bboxes = _all_decode_bboxes[i];
+        for(auto &it : _all_indices[i])
+        {
+            const int                 label     = it.first;
+            const std::vector<float> &scores    = conf_scores.find(label)->second;
+            const int                 loc_label = _info.share_location() ? -1 : label;
+            if(conf_scores.find(label) == conf_scores.end() || decode_bboxes.find(loc_label) == decode_bboxes.end())
+            {
+                // Either if there are no confidence predictions
+                // or there are no location predictions for current label.
+                ARM_COMPUTE_ERROR("Could not find predictions for the label %d.", label);
+            }
+            const std::vector<NormalizedBBox> &bboxes  = decode_bboxes.find(loc_label)->second;
+            const std::vector<int>            &indices = it.second;
+
+            for(auto idx : indices)
+            {
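+                // Write one detection as a 7-element row: [image_id, label, confidence, xmin, ymin, xmax, ymax]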
+                *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7))))     = i;
+                *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7 + 1)))) = label;
+                *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7 + 2)))) = scores[idx];
+                *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7 + 3)))) = bboxes[idx][0];
+                *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7 + 4)))) = bboxes[idx][1];
+                *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7 + 5)))) = bboxes[idx][2];
+                *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7 + 6)))) = bboxes[idx][3];
+
+                ++count;
+            }
+        }
+    }
+}
+} // namespace arm_compute
diff --git a/src/runtime/CPP/functions/CPPTopKV.cpp b/src/runtime/CPP/functions/CPPTopKV.cpp
new file mode 100644
index 0000000..c4e1eab
--- /dev/null
+++ b/src/runtime/CPP/functions/CPPTopKV.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CPP/functions/CPPTopKV.h"
+
+#include "arm_compute/core/CPP/kernels/CPPTopKVKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CPPTopKV::configure(const ITensor *predictions, const ITensor *targets, ITensor *output, const unsigned int k)
+{
+    auto kernel = arm_compute::support::cpp14::make_unique<CPPTopKVKernel>();
+    kernel->configure(predictions, targets, output, k);
+    _kernel = std::move(kernel);
+}
+
+Status CPPTopKV::validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k)
+{
+    return CPPTopKVKernel::validate(predictions, targets, output, k);
+}
+} // namespace arm_compute
diff --git a/src/runtime/CPUUtils.cpp b/src/runtime/CPUUtils.cpp
index ac19d08..f3355a7 100644
--- a/src/runtime/CPUUtils.cpp
+++ b/src/runtime/CPUUtils.cpp
@@ -39,7 +39,8 @@
 #include <unistd.h>
 
 #ifndef BARE_METAL
-#include <regex>
+/* C++ std::regex takes up a lot of space in the standalone builds */
+#include <regex.h>
 #include <thread>
 #endif /* BARE_METAL */
 
@@ -94,6 +95,7 @@
             return false;
     }
 }
+
 /* Convert an MIDR register value to a CPUModel enum value. */
 CPUModel midr_to_model(const unsigned int midr)
 {
@@ -144,6 +146,19 @@
                 break;
         }
     }
+    else if(implementer == 0x48) // HiSilicon CPUs
+    {
+        // Only CPUs we have code paths for are detected.  All other CPUs can be safely classed as "GENERIC"
+        switch(cpunum)
+        {
+            case 0xd40: // A76 (Kirin 980)
+                model = CPUModel::GENERIC_FP16_DOT;
+                break;
+            default:
+                model = CPUModel::GENERIC;
+                break;
+        }
+    }
 
     return model;
 }
@@ -172,12 +187,27 @@
 
 void populate_models_cpuinfo(std::vector<CPUModel> &cpusv)
 {
+    regex_t proc_regex;
+    regex_t imp_regex;
+    regex_t var_regex;
+    regex_t part_regex;
+    regex_t rev_regex;
+
+    memset(&proc_regex, 0, sizeof(regex_t));
+    memset(&imp_regex, 0, sizeof(regex_t));
+    memset(&var_regex, 0, sizeof(regex_t));
+    memset(&part_regex, 0, sizeof(regex_t));
+    memset(&rev_regex, 0, sizeof(regex_t));
+
+    int ret_status = 0;
     // If "long-form" cpuinfo is present, parse that to populate models.
-    std::regex proc_regex(R"(^processor.*(\d+)$)");
-    std::regex imp_regex(R"(^CPU implementer.*0x(..)$)");
-    std::regex var_regex(R"(^CPU variant.*0x(.)$)");
-    std::regex part_regex(R"(^CPU part.*0x(...)$)");
-    std::regex rev_regex(R"(^CPU revision.*(\d+)$)");
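+    // POSIX extended regular expressions have no \d, so the patterns below use [[:digit:]] instead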
+    ret_status |= regcomp(&proc_regex, R"(^processor.*([[:digit:]]+)$)", REG_EXTENDED);
+    ret_status |= regcomp(&imp_regex, R"(^CPU implementer.*0x(..)$)", REG_EXTENDED);
+    ret_status |= regcomp(&var_regex, R"(^CPU variant.*0x(.)$)", REG_EXTENDED);
+    ret_status |= regcomp(&part_regex, R"(^CPU part.*0x(...)$)", REG_EXTENDED);
+    ret_status |= regcomp(&rev_regex, R"(^CPU revision.*([[:digit:]]+)$)", REG_EXTENDED);
+    ARM_COMPUTE_UNUSED(ret_status);
+    ARM_COMPUTE_ERROR_ON_MSG(ret_status != 0, "Regex compilation failed.");
 
     std::ifstream file;
     file.open("/proc/cpuinfo", std::ios::in);
@@ -190,11 +220,11 @@
 
         while(bool(getline(file, line)))
         {
-            std::smatch match;
-
-            if(std::regex_match(line, match, proc_regex))
+            regmatch_t match[2];
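+            // match[0] spans the whole line, match[1] the first capture group (the value of interest)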
+            ret_status = regexec(&proc_regex, line.c_str(), 2, match, 0);
+            if(ret_status == 0)
             {
-                std::string id     = match[1];
+                std::string id     = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
                 int         newcpu = support::cpp11::stoi(id, nullptr);
 
                 if(curcpu >= 0 && midr == 0)
@@ -214,32 +244,44 @@
                 continue;
             }
 
-            if(std::regex_match(line, match, imp_regex))
+            ret_status = regexec(&imp_regex, line.c_str(), 2, match, 0);
+            if(ret_status == 0)
             {
-                int impv = support::cpp11::stoi(match[1], nullptr, support::cpp11::NumericBase::BASE_16);
+                std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
+                int         impv   = support::cpp11::stoi(subexp, nullptr, support::cpp11::NumericBase::BASE_16);
                 midr |= (impv << 24);
+
                 continue;
             }
 
-            if(std::regex_match(line, match, var_regex))
+            ret_status = regexec(&var_regex, line.c_str(), 2, match, 0);
+            if(ret_status == 0)
             {
-                int varv = support::cpp11::stoi(match[1], nullptr, support::cpp11::NumericBase::BASE_16);
+                std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
+                int         varv   = support::cpp11::stoi(subexp, nullptr, support::cpp11::NumericBase::BASE_16);
                 midr |= (varv << 20);
+
                 continue;
             }
 
-            if(std::regex_match(line, match, part_regex))
+            ret_status = regexec(&part_regex, line.c_str(), 2, match, 0);
+            if(ret_status == 0)
             {
-                int partv = support::cpp11::stoi(match[1], nullptr, support::cpp11::NumericBase::BASE_16);
+                std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
+                int         partv  = support::cpp11::stoi(subexp, nullptr, support::cpp11::NumericBase::BASE_16);
                 midr |= (partv << 4);
+
                 continue;
             }
 
-            if(std::regex_match(line, match, rev_regex))
+            ret_status = regexec(&rev_regex, line.c_str(), 2, match, 0);
+            if(ret_status == 0)
             {
-                int regv = support::cpp11::stoi(match[1], nullptr);
+                std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
+                int         regv   = support::cpp11::stoi(subexp, nullptr);
                 midr |= (regv);
                 midr |= (0xf << 16);
+
                 continue;
             }
         }
@@ -249,6 +291,13 @@
             cpusv[curcpu] = midr_to_model(midr);
         }
     }
+
+    // Free allocated memory
+    regfree(&proc_regex);
+    regfree(&imp_regex);
+    regfree(&var_regex);
+    regfree(&part_regex);
+    regfree(&rev_regex);
 }
 
 int get_max_cpus()
@@ -364,8 +413,11 @@
     std::map<std::string, unsigned int> cpu_part_occurrence_map;
 
     // CPU part regex
-    std::regex  cpu_part_rgx(R"(.*CPU part.+?(?=:).+?(?=\w+)(\w+).*)");
-    std::smatch cpu_part_match;
+    regex_t cpu_part_rgx;
+    memset(&cpu_part_rgx, 0, sizeof(regex_t));
+    int ret_status = regcomp(&cpu_part_rgx, R"(.*CPU part.+/?\:[[:space:]]+([[:alnum:]]+).*)", REG_EXTENDED);
+    ARM_COMPUTE_UNUSED(ret_status);
+    ARM_COMPUTE_ERROR_ON_MSG(ret_status != 0, "Regex compilation failed.");
 
     // Read cpuinfo and get occurrence of each core
     std::ifstream cpuinfo;
@@ -375,9 +427,11 @@
         std::string line;
         while(bool(getline(cpuinfo, line)))
         {
-            if(std::regex_search(line.cbegin(), line.cend(), cpu_part_match, cpu_part_rgx))
+            regmatch_t match[2];
+            ret_status = regexec(&cpu_part_rgx, line.c_str(), 2, match, 0);
+            if(ret_status == 0)
             {
-                std::string cpu_part = cpu_part_match[1];
+                std::string cpu_part = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
                 if(cpu_part_occurrence_map.find(cpu_part) != cpu_part_occurrence_map.end())
                 {
                     cpu_part_occurrence_map[cpu_part]++;
@@ -389,6 +443,7 @@
             }
         }
     }
+    regfree(&cpu_part_rgx);
 
     // Get min number of threads
     auto min_common_cores = std::min_element(cpu_part_occurrence_map.begin(), cpu_part_occurrence_map.end(),
diff --git a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
index c58d184..a35a18a 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -170,7 +170,7 @@
     {
         BorderSize border_size = BorderSize(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left());
         input->info()->extend_padding(border_size);
-        _fill_border.configure(input, border_size, BorderMode::CONSTANT, PixelValue(0)); // for PAD of im2col fp16: consider it as border
+        _fill_border.configure(input, border_size, BorderMode::CONSTANT, PixelValue()); // for PAD of im2col fp16: consider it as border
     }
     // Configure im2col
     _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation);
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp
index 689d8be..aa937a6 100755
--- a/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -53,7 +53,7 @@
     for(unsigned int i = 0; i < _num_inputs; i++)
     {
         _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
-        _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
+        _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue());
 
         depth_offset += inputs_vector.at(i)->info()->dimension(2);
     }
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
index d9aa50d..ba05838 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,7 +43,7 @@
     _kernel = std::move(k);
 
     // Configure border handler
-    _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue(0));
+    _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue());
 
     _shift_handler.configure(input);
 
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp
index c0cf098..cb14b8a 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -68,7 +68,7 @@
         return;
     }
 
-    _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue(0));
+    _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue());
 
     _shift_handler.configure(input);
 }
diff --git a/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
index b2e69ee..2569365 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -48,7 +48,7 @@
     _norm_kernel.configure(input, &_squared_input, output, norm_info);
     _multiply_kernel.configure(input, input, &_squared_input, 1.0f);
     // Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel
-    _border_handler.configure(&_squared_input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
+    _border_handler.configure(&_squared_input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue());
 
     // Allocate intermediate buffers
     _squared_input.allocator()->allocate();
diff --git a/src/runtime/ISimpleLifetimeManager.cpp b/src/runtime/ISimpleLifetimeManager.cpp
index 7d928d6..97c20d1 100644
--- a/src/runtime/ISimpleLifetimeManager.cpp
+++ b/src/runtime/ISimpleLifetimeManager.cpp
@@ -59,7 +59,7 @@
     // Check if there is a free blob
     if(_free_blobs.empty())
     {
-        _occupied_blobs.emplace_front(Blob{ obj, 0, { obj } });
+        _occupied_blobs.emplace_front(Blob{ obj, 0, 0, { obj } });
     }
     else
     {
@@ -71,7 +71,7 @@
     _active_elements.insert(std::make_pair(obj, obj));
 }
 
-void ISimpleLifetimeManager::end_lifetime(void *obj, IMemory &obj_memory, size_t size)
+void ISimpleLifetimeManager::end_lifetime(void *obj, IMemory &obj_memory, size_t size, size_t alignment)
 {
     ARM_COMPUTE_ERROR_ON(obj == nullptr);
 
@@ -80,10 +80,11 @@
     ARM_COMPUTE_ERROR_ON(active_object_it == std::end(_active_elements));
 
     // Update object fields and mark object as complete
-    Element &el = active_object_it->second;
-    el.handle   = &obj_memory;
-    el.size     = size;
-    el.status   = true;
+    Element &el  = active_object_it->second;
+    el.handle    = &obj_memory;
+    el.size      = size;
+    el.alignment = alignment;
+    el.status    = true;
 
     // Find object in the occupied lists
     auto occupied_blob_it = std::find_if(std::begin(_occupied_blobs), std::end(_occupied_blobs), [&obj](const Blob & b)
@@ -94,8 +95,9 @@
 
     // Update occupied blob and return as free
     occupied_blob_it->bound_elements.insert(obj);
-    occupied_blob_it->max_size = std::max(occupied_blob_it->max_size, size);
-    occupied_blob_it->id       = nullptr;
+    occupied_blob_it->max_size      = std::max(occupied_blob_it->max_size, size);
+    occupied_blob_it->max_alignment = std::max(occupied_blob_it->max_alignment, alignment);
+    occupied_blob_it->id            = nullptr;
     _free_blobs.splice(std::begin(_free_blobs), _occupied_blobs, occupied_blob_it);
 
     // Check if all objects are finalized and reset the active group
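The aggregate initialiser Blob{ obj, 0, 0, { obj } } and the members used above suggest a blob record roughly like the sketch below. This is an assumption for illustration only; the real definition lives in ISimpleLifetimeManager.h and the exact field names, order and types may differ.

#include <cstddef>
#include <set>

struct Blob
{
    void            *id{ nullptr };      // object currently owning the blob (nullptr once returned to the free list)
    std::size_t      max_size{ 0 };      // largest size ever bound to this blob
    std::size_t      max_alignment{ 0 }; // new in this patch: largest alignment ever requested for this blob
    std::set<void *> bound_elements{};   // all objects that have shared this blob
};

// end_lifetime() now grows both maxima before splicing the blob back onto the free list:
//     blob.max_size      = std::max(blob.max_size, size);
//     blob.max_alignment = std::max(blob.max_alignment, alignment);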
diff --git a/src/runtime/MEMUtils.cpp b/src/runtime/MEMUtils.cpp
index ad00070..5ae1c2a 100644
--- a/src/runtime/MEMUtils.cpp
+++ b/src/runtime/MEMUtils.cpp
@@ -27,7 +27,7 @@
 
 #ifndef BARE_METAL
 #include <fstream>
-#include <regex>
+#include <iterator>
 #include <sstream>
 #endif // ifndef BARE_METAL
 
@@ -43,41 +43,33 @@
     size_t        memfree  = 0;
     std::ifstream meminfo_f;
     meminfo_f.open("/proc/meminfo", std::ios::in);
+
     if(meminfo_f.is_open())
     {
-        std::stringstream str_stream;
-        str_stream << meminfo_f.rdbuf();
-        const std::string str = str_stream.str();
-        try
+        std::string line;
+        while(bool(getline(meminfo_f, line)))
         {
-            std::smatch match;
-            if(std::regex_search(str, match, std::regex("MemTotal: (.*)kB")) && match.size() > 1)
+            std::istringstream       iss(line);
+            std::vector<std::string> tokens((std::istream_iterator<std::string>(iss)),
+                                            std::istream_iterator<std::string>());
+            if(tokens[0] == "MemTotal:")
             {
-                const std::string result = match.str(1);
-                total                    = std::stoul(result, nullptr, 0);
+                total = arm_compute::support::cpp11::stoul(tokens[1], nullptr);
             }
-            if(std::regex_search(str, match, std::regex("MemFree: (.*)kB")) && match.size() > 1)
+            else if(tokens[0] == "MemFree:")
             {
-                const std::string result = match.str(1);
-                memfree                  = std::stoul(result, nullptr, 0);
+                memfree = arm_compute::support::cpp11::stoul(tokens[1], nullptr);
             }
-            if(std::regex_search(str, match, std::regex("Buffers: (.*)kB")) && match.size() > 1)
+            else if(tokens[0] == "Buffers:")
             {
-                const std::string result = match.str(1);
-                buffer                   = std::stoul(result, nullptr, 0);
+                buffer = arm_compute::support::cpp11::stoul(tokens[1], nullptr);
             }
-            if(std::regex_search(str, match, std::regex("Cached: (.*)kB")) && match.size() > 1)
+            else if(tokens[0] == "Cached:")
             {
-                const std::string result = match.str(1);
-                memcache                 = std::stoul(result, nullptr, 0);
+                memcache = arm_compute::support::cpp11::stoul(tokens[1], nullptr);
             }
-            free = memfree + (buffer + memcache);
         }
-        catch(std::regex_error &e)
-        {
-            // failed parsing /proc/meminfo
-            // return 0s on all fields
-        }
+        free = memfree + (buffer + memcache);
     }
 #endif // ifndef BARE_METAL
 }
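A small self-contained sketch of the tokenisation approach the hunk above adopts for /proc/meminfo: split each line on whitespace with std::istream_iterator and convert the numeric field, instead of running std::regex over the whole file. std::stoul stands in here for arm_compute::support::cpp11::stoul, and the meminfo content is illustrative.

#include <cstddef>
#include <iostream>
#include <iterator>
#include <sstream>
#include <string>
#include <vector>

int main()
{
    const std::string meminfo = "MemTotal:  8048292 kB\nMemFree:   512340 kB\n"; // illustrative content

    std::istringstream file(meminfo);
    std::string        line;
    std::size_t        total = 0, memfree = 0;

    while(std::getline(file, line))
    {
        std::istringstream             iss(line);
        const std::vector<std::string> tokens((std::istream_iterator<std::string>(iss)),
                                              std::istream_iterator<std::string>());
        if(tokens.size() < 2)
        {
            continue; // skip blank/short lines (the patch indexes tokens[0] directly)
        }
        if(tokens[0] == "MemTotal:")
        {
            total = std::stoul(tokens[1]);
        }
        else if(tokens[0] == "MemFree:")
        {
            memfree = std::stoul(tokens[1]);
        }
    }

    std::cout << "total=" << total << " kB, free=" << memfree << " kB" << std::endl;
    return 0;
}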
diff --git a/src/runtime/NEON/INESimpleFunctionNoBorder.cpp b/src/runtime/NEON/INESimpleFunctionNoBorder.cpp
new file mode 100644
index 0000000..1287204
--- /dev/null
+++ b/src/runtime/NEON/INESimpleFunctionNoBorder.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+INESimpleFunctionNoBorder::INESimpleFunctionNoBorder() // NOLINT
+    : _kernel()
+{
+}
+
+void INESimpleFunctionNoBorder::run()
+{
+    NEScheduler::get().schedule(_kernel.get(), Window::DimY);
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
new file mode 100644
index 0000000..d33e134
--- /dev/null
+++ b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NEArgMinMaxLayer::NEArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _reduction_kernel(), _fill_border_kernel(), _run_fill_border(false)
+{
+}
+void NEArgMinMaxLayer::configure(ITensor *input, int axis, ITensor *output, const ReductionOperation &op)
+{
+    _reduction_kernel.configure(input, output, axis, op);
+
+    if(axis == 0)
+    {
+        _fill_border_kernel.configure(input, _reduction_kernel.border_size(), BorderMode::REPLICATE);
+        _run_fill_border = true;
+    }
+}
+
+Status NEArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid operation");
+    ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernel::validate(input, output, axis, op));
+    return Status{};
+}
+
+void NEArgMinMaxLayer::run()
+{
+    _memory_group.acquire();
+
+    if(_run_fill_border)
+    {
+        NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
+    }
+    NEScheduler::get().schedule(&_reduction_kernel, Window::DimY);
+
+    _memory_group.release();
+}
+
+} // namespace arm_compute
\ No newline at end of file
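A hedged usage sketch for the new NEArgMinMaxLayer, inferred only from the configure()/run() signatures above; the shapes, the U32 index type and the allocator calls are illustrative assumptions, not taken from the patch.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(1U, 4U), 1, DataType::U32)); // indices along the reduced axis (assumed U32)

    NEArgMinMaxLayer argminmax(nullptr); // no external memory manager
    argminmax.configure(&input, 0 /* axis */, &output, ReductionOperation::ARG_IDX_MAX);

    input.allocator()->allocate();
    output.allocator()->allocate();
    // ... fill input ...
    argminmax.run();
    return 0;
}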
diff --git a/src/runtime/NEON/functions/NEArithmeticAddition.cpp b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
index 677e9f6..b155077 100644
--- a/src/runtime/NEON/functions/NEArithmeticAddition.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -36,16 +36,6 @@
     auto k = arm_compute::support::cpp14::make_unique<NEArithmeticAdditionKernel>();
     k->configure(input1, input2, output, policy);
     _kernel = std::move(k);
-
-    if(output->info()->dimension(0) > 1)
-    {
-        ITensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
-
-        if(broadcasted_info->info()->dimension(0) == 1)
-        {
-            _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
-        }
-    }
 }
 Status NEArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
 {
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 931e5db..5059162 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -148,8 +148,7 @@
         return (*found).second;
     }
 
-    if(dilation != Size2D(1U, 1U) || Scheduler::get().cpu_info().get_cpu_model() == CPUModel::A53
-       || input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) <= 16)
+    if(dilation != Size2D(1U, 1U) || input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) <= 16)
     {
         return ConvolutionMethod::GEMM;
     }
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index 6887a0a..44d7197 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -145,6 +145,15 @@
     _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info);
     _scaled_output.allocator()->allocate();
 }
+Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &info)
+{
+    return NEDeconvolutionLayer::validate(input, weights, bias, output, info, 0, 0);
+}
+
+void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info)
+{
+    configure(input, weights, bias, output, info, 0, 0);
+}
 
 void NEDeconvolutionLayer::run()
 {
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index a2f0094..f0fd4cf 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -72,7 +72,7 @@
             accum_layout = DataLayout::NCHW;
         }
 
-        _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32, input->info()->quantization_info()));
+        _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32, output->info()->quantization_info()));
         _accumulator.info()->set_data_layout(accum_layout);
         zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
     }
@@ -271,7 +271,7 @@
     const unsigned int channel_idx = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
     ARM_COMPUTE_UNUSED(channel_idx);
 
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
     ARM_COMPUTE_ERROR_ON((input->info()->dimension(channel_idx) * depth_multiplier) != weights->info()->dimension(channel_idx));
 
diff --git a/src/runtime/NEON/functions/NEElementwiseOperators.cpp b/src/runtime/NEON/functions/NEElementwiseOperators.cpp
new file mode 100644
index 0000000..74c1957
--- /dev/null
+++ b/src/runtime/NEON/functions/NEElementwiseOperators.cpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEElementwiseOperations.h"
+#include "arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+void NEElementwiseMax::configure(ITensor *input1, ITensor *input2, ITensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEArithmeticOperationKernel>();
+    k->configure(ArithmeticOperation::MAX, input1, input2, output);
+    _kernel = std::move(k);
+}
+
+Status NEElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    return NEArithmeticOperationKernel::validate(ArithmeticOperation::MAX, input1, input2, output);
+}
+
+void NEElementwiseMin::configure(ITensor *input1, ITensor *input2, ITensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEArithmeticOperationKernel>();
+    k->configure(ArithmeticOperation::MIN, input1, input2, output);
+    _kernel = std::move(k);
+}
+
+Status NEElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    return NEArithmeticOperationKernel::validate(ArithmeticOperation::MIN, input1, input2, output);
+}
+
+void NEElementwiseSquaredDiff::configure(ITensor *input1, ITensor *input2, ITensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEArithmeticOperationKernel>();
+    k->configure(ArithmeticOperation::SQUARED_DIFF, input1, input2, output);
+    _kernel = std::move(k);
+}
+
+Status NEElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    return NEArithmeticOperationKernel::validate(ArithmeticOperation::SQUARED_DIFF, input1, input2, output);
+}
+
+void NEElementwiseDivision::configure(ITensor *input1, ITensor *input2, ITensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEDivisionOperationKernel>();
+    k->configure(input1, input2, output);
+    _kernel = std::move(k);
+}
+
+Status NEElementwiseDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    return NEDivisionOperationKernel::validate(input1, input2, output);
+}
+
+template <ComparisonOperation COP>
+void NEElementwiseComparisonStatic<COP>::configure(ITensor *input1, ITensor *input2, ITensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEComparisonOperationKernel>();
+    k->configure(COP, input1, input2, output);
+    _kernel = std::move(k);
+}
+
+template <ComparisonOperation COP>
+Status NEElementwiseComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    return NEComparisonOperationKernel::validate(COP, input1, input2, output);
+}
+
+void NEElementwiseComparison::configure(ITensor *input1, ITensor *input2, ITensor *output, ComparisonOperation op)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEComparisonOperationKernel>();
+    k->configure(op, input1, input2, output);
+    _kernel = std::move(k);
+}
+
+Status NEElementwiseComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op)
+{
+    return NEComparisonOperationKernel::validate(op, input1, input2, output);
+}
+
+// Supported Specializations
+template class NEElementwiseComparisonStatic<ComparisonOperation::Equal>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::NotEqual>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::Greater>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::GreaterEqual>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::Less>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::LessEqual>;
+} // namespace arm_compute
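A hedged usage sketch of the new element-wise wrappers; each configure() above just builds the matching kernel and hands it to the simple-function base class, so usage follows the usual configure/allocate/run pattern. Shapes and the U8 comparison output type are illustrative assumptions.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEElementwiseOperations.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    const TensorShape shape(32U, 8U);
    Tensor            a, b, max_out, cmp_out;
    a.allocator()->init(TensorInfo(shape, 1, DataType::F32));
    b.allocator()->init(TensorInfo(shape, 1, DataType::F32));
    max_out.allocator()->init(TensorInfo(shape, 1, DataType::F32));
    cmp_out.allocator()->init(TensorInfo(shape, 1, DataType::U8)); // comparisons produce a mask (assumed U8)

    NEElementwiseMax        max_op;
    NEElementwiseComparison cmp_op;
    max_op.configure(&a, &b, &max_out);
    cmp_op.configure(&a, &b, &cmp_out, ComparisonOperation::Greater);

    for(Tensor *t : { &a, &b, &max_out, &cmp_out })
    {
        t->allocator()->allocate();
    }
    // ... fill a and b ...
    max_op.run();
    cmp_op.run();
    return 0;
}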
diff --git a/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
new file mode 100644
index 0000000..10142c7
--- /dev/null
+++ b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h"
+
+#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernel.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+void NERsqrtLayer::configure(const ITensor *input, ITensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEElementwiseUnaryKernel>();
+    k->configure(ElementWiseUnary::RSQRT, input, output);
+    _kernel = std::move(k);
+}
+Status NERsqrtLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return NEElementwiseUnaryKernel::validate(ElementWiseUnary::RSQRT, input, output);
+}
+
+void NEExpLayer::configure(const ITensor *input, ITensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEElementwiseUnaryKernel>();
+    k->configure(ElementWiseUnary::EXP, input, output);
+    _kernel = std::move(k);
+}
+Status NEExpLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return NEElementwiseUnaryKernel::validate(ElementWiseUnary::EXP, input, output);
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp b/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp
new file mode 100644
index 0000000..dc48731
--- /dev/null
+++ b/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NEFuseBatchNormalization::NEFuseBatchNormalization()
+    : _fuse_bn_kernel()
+{
+}
+
+void NEFuseBatchNormalization::configure(const ITensor *conv_weights, const ITensor *bn_mean, const ITensor *bn_var,
+                                         ITensor *fused_weights, ITensor *fused_bias,
+                                         const ITensor *conv_bias, const ITensor *bn_beta, const ITensor *bn_gamma,
+                                         float epsilon)
+{
+    _fuse_bn_kernel.configure(conv_weights, bn_mean, bn_var, fused_weights, fused_bias, conv_bias, bn_beta, bn_gamma, epsilon);
+}
+
+Status NEFuseBatchNormalization::validate(const ITensorInfo *conv_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
+                                          const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
+                                          const ITensorInfo *conv_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
+                                          float epsilon)
+{
+    return NEFuseBatchNormalizationKernel::validate(conv_weights, bn_mean, bn_var, fused_weights, fused_bias, conv_bias, bn_beta, bn_gamma, epsilon);
+}
+
+void NEFuseBatchNormalization::run()
+{
+    NEScheduler::get().schedule(&_fuse_bn_kernel, Window::DimY);
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 72a3e80..914f088 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -91,8 +91,8 @@
             shape_tmp_b.set(0, b->info()->dimension(1) * transpose_w);
             shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / static_cast<float>(transpose_w)));
 
-            TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
-            TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
+            TensorInfo info_a = a->info()->clone()->set_tensor_shape(shape_tmp_a).set_is_resizable(true);
+            TensorInfo info_b = b->info()->clone()->set_tensor_shape(shape_tmp_b).set_is_resizable(true);
 
             _tmp_a.allocator()->init(info_a);
             _tmp_b.allocator()->init(info_b);
diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
index 922f757..470e922 100644
--- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,9 +24,6 @@
 #include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
 
 #include "arm_compute/core/CPP/Validate.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h"
 #include "arm_compute/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/NEON/functions/NESimpleAssemblyFunction.h"
@@ -38,14 +35,14 @@
 {
 namespace
 {
-std::unique_ptr<IFunction> create_function_all_types(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
+std::unique_ptr<IFunction> create_function_all_types(arm_gemm::KernelDescription gemm_kernel_info,
+                                                     const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
                                                      std::shared_ptr<IMemoryManager> memory_manager)
 
 {
     //Note: It's safe to not check for FP16 support because this was already checked in NEGEMMAssemblyDispatch::configure()
-    switch(method)
+    switch(gemm_kernel_info.method)
     {
-        case arm_gemm::GemmMethod::GEMM_INTERLEAVED_FP16:
         case arm_gemm::GemmMethod::GEMM_INTERLEAVED:
         {
             if(!pretranspose_hint)
@@ -56,99 +53,41 @@
             function->configure(a, b, d, alpha, beta, pretranspose_hint);
             return std::move(function);
         }
-        default:
-            return nullptr;
-    }
-}
-
-template <typename TypeInput, typename TypeOutput>
-std::unique_ptr<IFunction> create_function(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
-                                           std::shared_ptr<IMemoryManager> memory_manager)
-{
-    ARM_COMPUTE_UNUSED(method);
-    ARM_COMPUTE_UNUSED(a);
-    ARM_COMPUTE_UNUSED(b);
-    ARM_COMPUTE_UNUSED(d);
-    ARM_COMPUTE_UNUSED(alpha);
-    ARM_COMPUTE_UNUSED(beta);
-    ARM_COMPUTE_UNUSED(pretranspose_hint);
-    ARM_COMPUTE_UNUSED(memory_manager);
-    return nullptr;
-}
-
-#ifdef __aarch64__
-template <>
-std::unique_ptr<IFunction> create_function<int8_t, int32_t>(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
-                                                            std::shared_ptr<IMemoryManager> memory_manager)
-{
-    switch(method)
-    {
-        case arm_gemm::GemmMethod::GEMM_INTERLEAVED_DOT:
-        {
-            if(!pretranspose_hint)
-            {
-                return nullptr;
-            }
-            auto function = support::cpp14::make_unique<NEGEMMInterleavedWrapper>(memory_manager);
-            function->configure(a, b, d, alpha, beta, pretranspose_hint, true /* use_dot */);
-            return std::move(function);
-        }
-        default:
-            return nullptr;
-    }
-    return nullptr;
-}
-
-template <>
-std::unique_ptr<IFunction> create_function<uint8_t, uint32_t>(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
-                                                              std::shared_ptr<IMemoryManager> memory_manager)
-{
-    switch(method)
-    {
-        case arm_gemm::GemmMethod::GEMM_INTERLEAVED_DOT:
-        {
-            if(!pretranspose_hint)
-            {
-                return nullptr;
-            }
-            auto function = support::cpp14::make_unique<NEGEMMInterleavedWrapper>(memory_manager);
-            function->configure(a, b, d, alpha, beta, pretranspose_hint, true /* use_dot */);
-            return std::move(function);
-        }
-        default:
-            return nullptr;
-    }
-    return nullptr;
-}
-
-template <>
-std::unique_ptr<IFunction> create_function<float, float>(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
-                                                         std::shared_ptr<IMemoryManager> memory_manager)
-{
-    ARM_COMPUTE_UNUSED(pretranspose_hint);
-    ARM_COMPUTE_UNUSED(memory_manager);
-    switch(method)
-    {
+#if defined(__aarch64__)
         case arm_gemm::GemmMethod::GEMM_NATIVE:
         {
-            auto kernel = support::cpp14::make_unique<NEGEMMNativeWrapperKernel<float, float>>();
-            kernel->configure(a, b, d, alpha, beta);
-            auto function = support::cpp14::make_unique<NESimpleAssemblyFunction>();
-            function->configure(std::move(kernel));
-            return std::move(function);
+            if(gemm_kernel_info.name.find("sgemm_native_16x4") != std::string::npos)
+            {
+                auto kernel = support::cpp14::make_unique<NEGEMMNativeWrapperKernel<float, float>>();
+                kernel->configure(a, b, d, alpha, beta);
+                auto function = support::cpp14::make_unique<NESimpleAssemblyFunction>();
+                function->configure(std::move(kernel));
+                return std::move(function);
+            }
+            return nullptr;
         }
+#endif // defined(__aarch64__)
         default:
             return nullptr;
     }
 }
-#endif /* __aarch64__ */
 
 /** Fallback in case ACL doesn't have a function */
 template <typename TypeInput, typename TypeOutput>
 class Fallback : public NEGEMMAssemblyDispatch::IFallback
 {
 public:
-    void configure(const ITensor *a, const ITensor *b, ITensor *d, arm_gemm::GemmArgs<TypeOutput> &args, MemoryGroup &memory_group);
+    /** Initialise the function's input and output.
+     *
+     * @param[in]  a            Input tensor containing the Matrix A.
+     * @param[in]  b            Input tensor containing the Matrix B.
+     * @param[out] d            Output tensor to store the result of matrix multiplication.
+     * @param[in]  args         Matrix multiplication information.
+     * @param[in]  memory_group Memory group to be used by the function.
+     */
+    void configure(const ITensor *a, const ITensor *b, ITensor *d, arm_gemm::GemmArgs<TypeOutput> args, MemoryGroup &memory_group);
+
+    // Inherited methods overridden:
     void run() override;
     void prepare() override;
     bool is_configured() const override;
@@ -187,9 +126,16 @@
 };
 
 template <typename TypeInput, typename TypeOutput>
-void Fallback<TypeInput, TypeOutput>::configure(const ITensor *a, const ITensor *b, ITensor *d, arm_gemm::GemmArgs<TypeOutput> &args, MemoryGroup &memory_group)
+void Fallback<TypeInput, TypeOutput>::configure(const ITensor *a, const ITensor *b, ITensor *d, arm_gemm::GemmArgs<TypeOutput> args, MemoryGroup &memory_group)
 {
-    _gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeOutput>(args, nullptr);
+    arm_gemm::GemmConfig              gemm_cfg;
+    const arm_gemm::KernelDescription gemm_kernel_info = arm_gemm::get_gemm_method<TypeInput, TypeOutput>(args);
+    if(gemm_kernel_info.method != arm_gemm::GemmMethod::GEMV_BATCHED)
+    {
+        gemm_cfg.filter = gemm_kernel_info.name;
+        args._cfg       = &gemm_cfg;
+    }
+    _gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeOutput>(args);
     if(_gemm_kernel_asm == nullptr)
     {
         //configuration not supported: Leave function unconfigured:
@@ -199,7 +145,7 @@
     // arm_compute wrapper for the Gemm object (see above)
     std::unique_ptr<NEGEMMAssemblyWrapperKernel<TypeInput, TypeOutput>> acl_gemm_wrapper = support::cpp14::make_unique<NEGEMMAssemblyWrapperKernel<TypeInput, TypeOutput>>();
     ARM_COMPUTE_ERROR_ON(acl_gemm_wrapper == nullptr);
-    acl_gemm_wrapper->configure(_gemm_kernel_asm.get());
+    acl_gemm_wrapper->configure(_gemm_kernel_asm.get(), gemm_cfg.filter);
     const size_t workspace_size = _gemm_kernel_asm->get_working_size();
     if(workspace_size > 0)
     {
@@ -229,8 +175,6 @@
         const unsigned int alignment           = 128;
         const size_t       B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size();
         _pretranspose.allocator()->init(TensorInfo(TensorShape{ (B_pretranspose_size + alignment /* FIXME: remove alignment after COMPMID-1088 */) }, 1, DataType::S8), alignment);
-        _pretranspose.allocator()->allocate();
-        ARM_COMPUTE_ERROR_ON_NULLPTR(_pretranspose.buffer());
     }
 }
 
@@ -242,6 +186,7 @@
         // Pretranspose B if required
         if(_gemm_kernel_asm->B_pretranspose_required())
         {
+            _pretranspose.allocator()->allocate();
             ARM_COMPUTE_ERROR_ON(_pretranspose.buffer() == nullptr);
             const int  ldb            = _b->info()->strides_in_bytes().y() / sizeof(TypeInput);
             const auto in1_ptr        = reinterpret_cast<const TypeInput *>(_b->buffer() + _b->info()->offset_first_element_in_bytes());
@@ -335,12 +280,8 @@
     arm_gemm::GemmArgs<TypeOutput> args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, alpha, beta, num_threads, pretranspose_hint);
 
     //Try to create an ACL function:
-    acl_function = create_function_all_types(arm_gemm::get_gemm_method<TypeInput, TypeOutput>(args), a, b, d, alpha, beta, pretranspose_hint, memory_manager);
-    // If the type agnostic factory failed to create an ACL function, try the specialised one:
-    if(acl_function == nullptr)
-    {
-        acl_function = create_function<TypeInput, TypeOutput>(arm_gemm::get_gemm_method<TypeInput, TypeOutput>(args), a, b, d, alpha, beta, pretranspose_hint, memory_manager);
-    }
+    acl_function = create_function_all_types(arm_gemm::get_gemm_method<TypeInput, TypeOutput>(args), a, b, d, alpha, beta, pretranspose_hint, std::move(memory_manager));
+
     //If we still don't have an ACL function:
     if(acl_function == nullptr)
     {
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index 0232a83..be7cc2d 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -90,7 +90,7 @@
 }
 
 NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager)
-    : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _activationlayer_function(),
+    : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _activationlayer_function(),
       _add_bias_kernel(), _reshape_layer(), _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _data_layout(DataLayout::NCHW), _append_bias(false),
       _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false)
 {
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 4b02694..5286f11 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -97,9 +97,9 @@
         else
         {
             // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
-            TensorInfo info_a(compute_interleaved_shape(*a->info()), 1, a->info()->data_type());
+            TensorInfo info_a = a->info()->clone()->set_tensor_shape(compute_interleaved_shape(*a->info())).set_is_resizable(true);
             // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
-            TensorInfo info_b(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type());
+            TensorInfo info_b = b->info()->clone()->set_tensor_shape(compute_transpose1xW_shape(*b->info())).set_is_resizable(true);
             _tmp_a.allocator()->init(info_a);
             _tmp_b.allocator()->init(info_b);
             _memory_group.manage(&_tmp_a);
@@ -241,8 +241,8 @@
             shape_tmp_b.set(0, b->dimension(1) * 16);
             shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
 
-            TensorInfo info_a(shape_tmp_a, 1, a->data_type());
-            TensorInfo info_b(shape_tmp_b, 1, b->data_type());
+            TensorInfo info_a = a->clone()->set_tensor_shape(shape_tmp_a).set_is_resizable(true);
+            TensorInfo info_b = b->clone()->set_tensor_shape(shape_tmp_b).set_is_resizable(true);
 
             ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
             ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
diff --git a/src/runtime/NEON/functions/NEGather.cpp b/src/runtime/NEON/functions/NEGather.cpp
new file mode 100644
index 0000000..078bd5a
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGather.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGather.h"
+
+#include "arm_compute/core/NEON/kernels/NEGatherKernel.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+void NEGather::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEGatherKernel>();
+    k->configure(input, indices, output, axis);
+    _kernel = std::move(k);
+}
+
+Status NEGather::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
+{
+    return NEGatherKernel::validate(input, indices, output, axis);
+}
+
+} // namespace arm_compute
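A similar hedged usage sketch for the new NEGather function, based only on the configure()/validate() signatures above; the U32 index type and the shapes are illustrative assumptions.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEGather.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor input, indices, output;
    input.allocator()->init(TensorInfo(TensorShape(10U, 4U), 1, DataType::F32));
    indices.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::U32));    // pick 3 entries along axis 0
    output.allocator()->init(TensorInfo(TensorShape(3U, 4U), 1, DataType::F32));

    NEGather gather;
    gather.configure(&input, &indices, &output, 0 /* axis */);

    for(Tensor *t : { &input, &indices, &output })
    {
        t->allocator()->allocate();
    }
    // ... fill input and indices ...
    gather.run();
    return 0;
}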
diff --git a/src/runtime/NEON/functions/NEIntegralImage.cpp b/src/runtime/NEON/functions/NEIntegralImage.cpp
index fa8aaeb..8645b43 100644
--- a/src/runtime/NEON/functions/NEIntegralImage.cpp
+++ b/src/runtime/NEON/functions/NEIntegralImage.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -36,5 +36,5 @@
     auto k = arm_compute::support::cpp14::make_unique<NEIntegralImageKernel>();
     k->configure(input, output);
     _kernel = std::move(k);
-    _border_handler.configure(output, _kernel->border_size(), BorderMode::CONSTANT, static_cast<float>(0.f));
+    _border_handler.configure(output, _kernel->border_size(), BorderMode::CONSTANT, PixelValue());
 }
diff --git a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
index d0b80fb..56da966 100644
--- a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
+++ b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
@@ -26,8 +26,8 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 NEL2NormalizeLayer::NEL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq()
 {
@@ -57,8 +57,8 @@
 
     ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperation::validate(input, &sum_sq, axis, ReductionOperation::SUM_SQUARE));
 
-    // Reduce shape on axis (supported axis is 0)
-    shape.set(0, 1);
+    // Reduce shape on axis
+    shape.set(axis, 1);
     sum_sq.set_tensor_shape(shape);
 
     ARM_COMPUTE_RETURN_ON_ERROR(NEL2NormalizeLayerKernel::validate(input, &sum_sq, output, axis, epsilon));
@@ -75,3 +75,4 @@
 
     _memory_group.release();
 }
+} // namespace arm_compute
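The validate() fix above matters once axis != 0: the intermediate sum-of-squares tensor has to collapse the reduced axis rather than always dimension 0. A small shape-only illustration (assumed shapes, no NEON calls):

#include <cassert>

#include "arm_compute/core/TensorShape.h"

int main()
{
    arm_compute::TensorShape shape(16U, 8U, 4U); // (x, y, z)

    const unsigned int axis = 1; // reduce along y
    shape.set(axis, 1);          // what validate() now does instead of shape.set(0, 1)
    assert(shape[0] == 16 && shape[1] == 1 && shape[2] == 4);
    return 0;
}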
diff --git a/src/runtime/NEON/functions/NELSTMLayer.cpp b/src/runtime/NEON/functions/NELSTMLayer.cpp
index 7c7580a..9e7a713 100644
--- a/src/runtime/NEON/functions/NELSTMLayer.cpp
+++ b/src/runtime/NEON/functions/NELSTMLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -111,8 +111,8 @@
     _forget_gate_out2.allocator()->allocate();
     _memory_group.manage(&_forget_gate_out5);
     _accum_forget_gate1.configure(&_forget_gate_out1, &_forget_gate_out3, &_forget_gate_out5, ConvertPolicy::SATURATE);
+    _forget_gate_out1.allocator()->allocate();
     Tensor *forget_gate_out = &_forget_gate_out5;
-
     if(lstm_params.has_peephole_opt())
     {
         _forget_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
@@ -129,18 +129,18 @@
     {
         _forget_gate_out3.allocator()->allocate();
     }
-    _activation_forget_gate.configure(forget_gate_out, &_forget_gate_out1, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
-    forget_gate_out->allocator()->allocate();
+    _activation_forget_gate.configure(forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
 
     // Configure block that calculates the input gate
     // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
     // input_gate = 1 - forget_gate, with CIFG
     _input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    Tensor *input_gate_out = &_input_gate_out1;
     if(lstm_params.has_cifg_opt())
     {
         _memory_group.manage(&_input_gate_out1);
         _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
-        _subtract_input_gate.configure(&_ones, &_forget_gate_out1, &_input_gate_out1, ConvertPolicy::SATURATE);
+        _subtract_input_gate.configure(&_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE);
         _ones.allocator()->allocate();
         _run_cifg_opt = true;
     }
@@ -162,16 +162,22 @@
         _input_gate_out2.allocator()->allocate();
         _memory_group.manage(&_input_gate_out4);
         _accum_input_gate1.configure(&_input_gate_out1, &_input_gate_out3, &_input_gate_out4, ConvertPolicy::SATURATE);
+        _input_gate_out3.allocator()->allocate();
+        input_gate_out = &_input_gate_out4;
         if(_run_peephole_opt)
         {
             _memory_group.manage(&_input_gate_out5);
             _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
             _accum_input_gate2.configure(&_input_gate_out4, &_input_gate_out5, &_input_gate_out1, ConvertPolicy::SATURATE);
+            _input_gate_out4.allocator()->allocate();
             _input_gate_out5.allocator()->allocate();
+            input_gate_out = &_input_gate_out1;
         }
-        _input_gate_out3.allocator()->allocate();
-        _input_gate_out4.allocator()->allocate();
-        _activation_input_gate.configure(&_input_gate_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+        else
+        {
+            _input_gate_out1.allocator()->allocate();
+        }
+        _activation_input_gate.configure(input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
     }
 
     // Configure block that calculates the cell state
@@ -194,11 +200,9 @@
     _accum_cell_state1.configure(&_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
     _activation_cell_state.configure(&_cell_state_out4, nullptr, activation_info);
     _memory_group.manage(&_cell_state_out5);
-    _pixelwise_mul_cell_state1.configure(&_cell_state_out4, &_input_gate_out1, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
-    _input_gate_out1.allocator()->allocate();
+    _pixelwise_mul_cell_state1.configure(&_cell_state_out4, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
     _cell_state_out4.allocator()->allocate();
-    _pixelwise_mul_cell_state2.configure(&_forget_gate_out1, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
-    _forget_gate_out1.allocator()->allocate();
+    _pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
     _accum_cell_state2.configure(&_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
     _cell_state_out3.allocator()->allocate();
     _cell_state_out5.allocator()->allocate();
@@ -246,7 +250,6 @@
         _output1.allocator()->allocate();
     }
     _activation_output.configure(output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
-    output_gate_out->allocator()->allocate();
 
     // Configure block that calculates the output state
     /** lstm_res = PixelwiseMul(output, Activation(cell_state))
@@ -265,6 +268,7 @@
     _activation_output_state.configure(&_cell_state_out1, &_cell_state_activation, activation_info);
     _pixelwise_mul_output_state2.configure(&_cell_state_activation, output_gate_out, output_state_out_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
     _cell_state_activation.allocator()->allocate();
+    output_gate_out->allocator()->allocate();
 
     if(lstm_params.has_projection())
     {
@@ -281,19 +285,22 @@
 
     // Copy cell state and output
     _copy_cell_state.configure(&_cell_state_out1, cell_state_out);
-    _cell_state_out1.allocator()->allocate();
     _copy_output.configure(output_state_out, output);
 
     // Vector for holding the tensors to store in scratch buffer
     std::vector<ITensor *> scratch_inputs;
     if(!lstm_params.has_cifg_opt())
     {
-        scratch_inputs.emplace_back(&_input_gate_out1);
+        scratch_inputs.emplace_back(input_gate_out);
     }
     scratch_inputs.emplace_back(&_cell_state_out1);
     scratch_inputs.emplace_back(forget_gate_out);
     scratch_inputs.emplace_back(output_gate_out);
     _concat_scratch_buffer.configure(scratch_inputs, scratch_buffer);
+    input_gate_out->allocator()->allocate();
+    _cell_state_out1.allocator()->allocate();
+    forget_gate_out->allocator()->allocate();
+    output_gate_out->allocator()->allocate();
 }
 
 Status NELSTMLayer::validate(const ITensorInfo *input,
diff --git a/src/runtime/NEON/functions/NEPadLayer.cpp b/src/runtime/NEON/functions/NEPadLayer.cpp
new file mode 100644
index 0000000..f5c2718
--- /dev/null
+++ b/src/runtime/NEON/functions/NEPadLayer.cpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEPadLayer.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+namespace
+{
+TensorInfo get_expected_output_tensorinfo(const ITensorInfo &input, const PaddingList &paddings)
+{
+    const TensorShape expected_output_shape = arm_compute::misc::shape_calculator::compute_padded_shape(input.tensor_shape(), paddings);
+    const TensorInfo  expected_output_info  = input.clone()->set_tensor_shape(expected_output_shape);
+    return expected_output_info;
+}
+
+Status validate_arguments(const ITensorInfo &input, ITensorInfo &output, const PaddingList &paddings)
+{
+    const TensorInfo expected_output_info = get_expected_output_tensorinfo(input, paddings);
+    auto_init_if_empty(output, expected_output_info);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output, &expected_output_info);
+
+    return Status{};
+}
+
+Coordinates get_subtensor_coords(const PaddingList &paddings)
+{
+    Coordinates coords;
+    for(unsigned int i = 0; i < paddings.size(); ++i)
+    {
+        coords.set(i, paddings[i].first);
+    }
+
+    return coords;
+}
+} // namespace
+
+NEPadLayer::NEPadLayer()
+    : _memset_kernel(), _copy_kernel(), _output_subtensor()
+{
+}
+
+void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &padding, PixelValue constant_value)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_THROW_ON_ERROR(NEPadLayer::validate(input->info(), output->info(), padding, constant_value));
+
+    // Auto-init
+    auto_init_if_empty(*output->info(), get_expected_output_tensorinfo(*input->info(), padding));
+
+    // Create SubTensor (a sub-tensor can be used because the kernels to be executed do not require padding)
+    _output_subtensor = SubTensor(output, input->info()->tensor_shape(), get_subtensor_coords(padding), true);
+
+    // Fill the whole output with the specified constant value
+    _memset_kernel.configure(output, constant_value);
+
+    // Copy the input to the output
+    _copy_kernel.configure(input, &_output_subtensor);
+}
+
+Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value)
+{
+    ARM_COMPUTE_UNUSED(constant_value);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+    auto output_clone = output->clone();
+
+    SubTensorInfo output_subtensor_info(output_clone.get(), input->tensor_shape(), get_subtensor_coords(padding), true);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *output_clone, padding));
+    ARM_COMPUTE_RETURN_ON_ERROR(NECopyKernel::validate(input, &output_subtensor_info));
+
+    return Status{};
+}
+
+void NEPadLayer::run()
+{
+    NEScheduler::get().schedule(&_memset_kernel, Window::DimY);
+    NEScheduler::get().schedule(&_copy_kernel, Window::DimY);
+}
+} // namespace arm_compute
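
For reference, a minimal usage sketch of the new NEPadLayer function; the tensor shape, padding amounts and the default constant value are illustrative assumptions, not part of the patch.

    #include "arm_compute/runtime/NEON/functions/NEPadLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor input, output;
        // Assumed 4x4 FP32 input; pad one element on each side of X and Y -> 6x6 output
        input.allocator()->init(TensorInfo(TensorShape(4U, 4U), 1, DataType::F32));

        NEPadLayer pad;
        const PaddingList padding = { { 1, 1 }, { 1, 1 } };
        pad.configure(&input, &output, padding, PixelValue()); // output info is auto-initialized

        input.allocator()->allocate();
        output.allocator()->allocate();
        pad.run();
        return 0;
    }
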
diff --git a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
index 1f1400c..3aca4b7 100644
--- a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,14 +27,14 @@
 #include "arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 NEROIPoolingLayer::NEROIPoolingLayer()
     : _roi_kernel()
 {
 }
 
-void NEROIPoolingLayer::configure(const ITensor *input, const IROIArray *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info)
+void NEROIPoolingLayer::configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info)
 {
     _roi_kernel.configure(input, rois, output, pool_info);
 }
@@ -43,3 +43,4 @@
 {
     NEScheduler::get().schedule(&_roi_kernel, Window::DimX);
 }
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NERange.cpp b/src/runtime/NEON/functions/NERange.cpp
new file mode 100644
index 0000000..977d502
--- /dev/null
+++ b/src/runtime/NEON/functions/NERange.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NERange.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NERange::NERange()
+    : _kernel()
+{
+}
+
+void NERange::configure(ITensor *output, const float start, const float end, const float step)
+{
+    _kernel.configure(output, start, end, step);
+}
+
+Status NERange::validate(const ITensorInfo *output, const float start, const float end, const float step)
+{
+    return NERangeKernel::validate(output, start, end, step);
+}
+
+void NERange::run()
+{
+    NEScheduler::get().schedule(&_kernel, Window::DimX);
+}
+} // namespace arm_compute
\ No newline at end of file
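
A minimal sketch of driving the new NERange function; the caller is expected to size the output for ceil((end - start) / step) elements, and the concrete start/end/step values below are assumptions.

    #include "arm_compute/runtime/NEON/functions/NERange.h"
    #include "arm_compute/runtime/Tensor.h"

    #include <cmath>

    using namespace arm_compute;

    int main()
    {
        const float start = 0.f, end = 10.f, step = 2.f;
        const auto  num_elems = static_cast<unsigned int>(std::ceil((end - start) / step)); // 5

        Tensor output;
        output.allocator()->init(TensorInfo(TensorShape(num_elems), 1, DataType::F32));

        NERange range;
        range.configure(&output, start, end, step); // produces {0, 2, 4, 6, 8}
        output.allocator()->allocate();
        range.run();
        return 0;
    }
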
diff --git a/src/runtime/NEON/functions/NEReduceMean.cpp b/src/runtime/NEON/functions/NEReduceMean.cpp
index 0b022df..014895f 100644
--- a/src/runtime/NEON/functions/NEReduceMean.cpp
+++ b/src/runtime/NEON/functions/NEReduceMean.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -14,9 +14,9 @@
  * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INNEUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY NEAIM, DAMAGES OR OTHER
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
@@ -39,17 +39,38 @@
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
     ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
 
-    for(unsigned int i = 0; i < reduction_axis.num_dimensions(); ++i)
-    {
-        if(output->total_size() > 0)
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(reduction_axis[i]) != 1);
-            ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(reduction_axis[i]) > input->num_dimensions() - 1);
-        }
+    TensorShape        out_shape     = input->tensor_shape();
+    const unsigned int reduction_ops = reduction_axis.num_dimensions();
+    const int          input_dims    = input->num_dimensions();
+    Coordinates        axis_local    = reduction_axis;
 
-        ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernel::validate(input, output, reduction_axis[i], ReductionOperation::MEAN_SUM));
+    // Convert negative axis
+    for(unsigned int i = 0; i < reduction_ops; ++i)
+    {
+        axis_local[i] = wrap_around(axis_local[i], input_dims);
     }
 
+    std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
+    for(unsigned int i = 0; i < reduction_ops; ++i)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
+        ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1);
+        if(output->total_size() > 0 && keep_dims)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
+        }
+        if(keep_dims)
+        {
+            out_shape.set(axis_local[i], 1);
+        }
+        else
+        {
+            out_shape.remove_dimension(axis_local[i] - i);
+        }
+    }
+    const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
+
     return Status{};
 }
 
@@ -62,22 +83,32 @@
     _reduced_outs      = arm_compute::support::cpp14::make_unique<Tensor[]>(_reduction_ops - (keep_dims ? 1 : 0));
     _keep_dims         = keep_dims;
 
+    Coordinates        axis_local    = reduction_axis;
+    const int          input_dims    = input->info()->num_dimensions();
+    const unsigned int reduction_ops = reduction_axis.num_dimensions();
+
+    // Convert negative axis
+    for(unsigned int i = 0; i < reduction_ops; ++i)
+    {
+        axis_local[i] = wrap_around(axis_local[i], input_dims);
+    }
+
     // Perform reduction for every axis
     for(unsigned int i = 0; i < _reduction_ops; ++i)
     {
         TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (_reduced_outs.get() + i - 1)->info()->tensor_shape();
-        out_shape.set(reduction_axis[i], 1);
+        out_shape.set(axis_local[i], 1);
         auto in = (i == 0) ? input : (_reduced_outs.get() + i - 1);
 
         if(i == _reduction_ops - 1 && keep_dims)
         {
-            _reduction_kernels[i].configure(in, output, reduction_axis[i], ReductionOperation::MEAN_SUM);
+            _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::MEAN_SUM);
         }
         else
         {
-            _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type()));
+            _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type(), input->info()->quantization_info()));
             _memory_group.manage(_reduced_outs.get() + i);
-            _reduction_kernels[i].configure(in, _reduced_outs.get() + i, reduction_axis[i], ReductionOperation::MEAN_SUM);
+            _reduction_kernels[i].configure(in, _reduced_outs.get() + i, axis_local[i], ReductionOperation::MEAN_SUM);
         }
     }
 
@@ -91,9 +122,13 @@
     if(!keep_dims)
     {
         TensorShape out_shape = input->info()->tensor_shape();
+
+        // We have to sort the reduction axis vectors in order for remove_dimension
+        // to work properly
+        std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
         for(unsigned int i = 0; i < _reduction_ops; ++i)
         {
-            out_shape.remove_dimension(reduction_axis[i]);
+            out_shape.remove_dimension(axis_local[i] - i);
         }
         auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
         _reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output);
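
The "axis_local[i] - i" offset compensates for the dimensions already removed on previous iterations, which is why the axes must first be sorted in ascending order. A small stand-alone illustration of that indexing (plain std::vector arithmetic, not the arm_compute API):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main()
    {
        std::vector<int> shape = { 8, 16, 32, 4 };   // e.g. W, H, C, N
        std::vector<int> axes  = { 2, 0 };           // reduce over C and W, keep_dims = false

        std::sort(axes.begin(), axes.end());         // ascending order is required for the offset trick
        for(size_t i = 0; i < axes.size(); ++i)
        {
            // Each earlier erase shifted the remaining axes left by one, hence "- i"
            shape.erase(shape.begin() + (axes[i] - static_cast<int>(i)));
        }
        for(int d : shape)
        {
            std::printf("%d ", d);                   // prints: 16 4
        }
        return 0;
    }
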
diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp
index 188c2bb..9f81a40 100644
--- a/src/runtime/NEON/functions/NEReductionOperation.cpp
+++ b/src/runtime/NEON/functions/NEReductionOperation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,16 +50,6 @@
             ARM_COMPUTE_ERROR("Unsupported reduction axis");
     }
 }
-BorderMode reduction_operation_border_mode(ReductionOperation op)
-{
-    switch(op)
-    {
-        case ReductionOperation::SUM_SQUARE:
-            return BorderMode::CONSTANT;
-        default:
-            return BorderMode::CONSTANT;
-    }
-}
 } // namespace
 
 NEReductionOperation::NEReductionOperation()
@@ -86,9 +76,9 @@
     if(axis == 0)
     {
         // Configure fill border kernel
-        BorderSize fill_border_size = (axis == 0) ? _reduction_kernel.border_size() : BorderSize();
-        BorderMode fill_border_mode = reduction_operation_border_mode(op);
-        _fill_border_kernel.configure(input, fill_border_size, fill_border_mode, PixelValue(static_cast<float>(0.f)));
+        const BorderSize fill_border_size = _reduction_kernel.border_size();
+        const PixelValue pixelValue       = (op == ReductionOperation::PROD) ? PixelValue(1, input->info()->data_type(), input->info()->quantization_info()) : PixelValue(0, input->info()->data_type());
+        _fill_border_kernel.configure(input, fill_border_size, BorderMode::CONSTANT, pixelValue);
     }
 }
 
diff --git a/src/runtime/NEON/functions/NEReverse.cpp b/src/runtime/NEON/functions/NEReverse.cpp
new file mode 100644
index 0000000..139bd50
--- /dev/null
+++ b/src/runtime/NEON/functions/NEReverse.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEReverse.h"
+
+#include "arm_compute/core/NEON/kernels/NEReverseKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void NEReverse::configure(const ITensor *input, ITensor *output, const ITensor *axis)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEReverseKernel>();
+    k->configure(input, output, axis);
+    _kernel = std::move(k);
+}
+
+Status NEReverse::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis)
+{
+    return NEReverseKernel::validate(input, output, axis);
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index a9c85bd..483aa4c 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,6 +46,11 @@
 {
     ARM_COMPUTE_ERROR_ON(nullptr == offsets);
     ARM_COMPUTE_UNUSED(sampling_policy);
+    float sampling_offset = 0.0f;
+    if(sampling_policy == SamplingPolicy::CENTER)
+    {
+        sampling_offset = 0.5f;
+    }
 
     Window win;
     win.set(Window::DimX, Window::Dimension(0, offsets->info()->dimension(0), 1));
@@ -60,8 +65,8 @@
 
         execute_window_loop(win, [&](const Coordinates & id)
         {
-            const float in_x  = (id.x() + 0.5f) * wr - 0.5f;
-            const float in_y  = (id.y() + 0.5f) * hr - 0.5f;
+            const float in_x  = (id.x() + sampling_offset) * wr - sampling_offset;
+            const float in_y  = (id.y() + sampling_offset) * hr - sampling_offset;
             const int   in_xi = std::floor(in_x);
             const int   in_yi = std::floor(in_y);
 
@@ -167,14 +172,14 @@
             ARM_COMPUTE_ERROR("Unsupported interpolation mode");
     }
 
-    _border_handler.configure(input, _scale_kernel.border_size(), border_mode, PixelValue(constant_border_value));
+    _border_handler.configure(input, _scale_kernel.border_size(), border_mode, constant_border_value);
 }
 
 Status NEScale::validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy,
                          BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON(sampling_policy != SamplingPolicy::CENTER);
+    ARM_COMPUTE_RETURN_ERROR_ON(sampling_policy != SamplingPolicy::CENTER && sampling_policy != SamplingPolicy::TOP_LEFT);
     ARM_COMPUTE_UNUSED(border_mode, constant_border_value);
 
     ITensorInfo *offsets = nullptr;
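
With this change the source coordinate is computed as in = (out + offset) * ratio - offset, where offset is 0.5 for SamplingPolicy::CENTER and 0 for the newly accepted TOP_LEFT. A small stand-alone sketch of that mapping (the scale ratio is an assumed value):

    #include <cstdio>

    // Map an output pixel index to a (fractional) input coordinate
    float map_coord(int out_idx, float ratio, float sampling_offset)
    {
        return (static_cast<float>(out_idx) + sampling_offset) * ratio - sampling_offset;
    }

    int main()
    {
        const float wr = 2.0f; // input is twice as wide as the output (downscale by 2)
        std::printf("CENTER   : x=0 -> %.2f\n", map_coord(0, wr, 0.5f)); // 0.50
        std::printf("TOP_LEFT : x=0 -> %.2f\n", map_coord(0, wr, 0.0f)); // 0.00
        std::printf("CENTER   : x=3 -> %.2f\n", map_coord(3, wr, 0.5f)); // 6.50
        return 0;
    }
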
diff --git a/src/runtime/NEON/functions/NESelect.cpp b/src/runtime/NEON/functions/NESelect.cpp
new file mode 100644
index 0000000..509bbaa
--- /dev/null
+++ b/src/runtime/NEON/functions/NESelect.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NESelect.h"
+
+#include "arm_compute/core/NEON/kernels/NESelectKernel.h"
+#include "arm_compute/core/Types.h"
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+void NESelect::configure(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NESelectKernel>();
+    k->configure(c, x, y, output);
+    _kernel = std::move(k);
+}
+
+Status NESelect::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
+{
+    return NESelectKernel::validate(c, x, y, output);
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NESlice.cpp b/src/runtime/NEON/functions/NESlice.cpp
new file mode 100644
index 0000000..03c2053
--- /dev/null
+++ b/src/runtime/NEON/functions/NESlice.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NESlice.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEStridedSliceKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/helpers/tensor_transform.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void NESlice::configure(const ITensor *input, ITensor *output, const Coordinates &starts, const Coordinates &ends)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+    // Get absolute end coordinates
+    const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends);
+
+    auto k = arm_compute::support::cpp14::make_unique<NEStridedSliceKernel>();
+    k->configure(input, output, starts, ends, BiStrides(), 0, slice_end_mask, 0);
+    _kernel = std::move(k);
+}
+
+Status NESlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+
+    // Check start dimensions for being non-negative
+    ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i)
+    {
+        return i < 0;
+    }));
+
+    // Get absolute end coordinates
+    const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends);
+
+    return NEStridedSliceKernel::validate(input, output, starts, ends, BiStrides(), 0, slice_end_mask, 0);
+}
+} // namespace arm_compute
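
A minimal usage sketch of the new NESlice function; the shape and coordinates are illustrative assumptions. Non-negative ends are taken literally, while negative ends are resolved against the input extents through the slice end mask constructed above.

    #include "arm_compute/runtime/NEON/functions/NESlice.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor input, output;
        input.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));

        // Take rows/cols [2, 6) of the 8x8 input -> 4x4 output
        const Coordinates starts(2, 2);
        const Coordinates ends(6, 6);

        NESlice slice;
        slice.configure(&input, &output, starts, ends); // output info is auto-initialized

        input.allocator()->allocate();
        output.allocator()->allocate();
        slice.run();
        return 0;
    }
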
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index 9be9e68..36b7d47 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,54 +25,155 @@
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "utils/TypePrinter.h"
 
 #include <cfloat>
 
-using namespace arm_compute;
-
-NESoftmaxLayer::NESoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _max_kernel(), _softmax_kernel(), _fill_border_kernel(), _max(), _tmp()
+namespace arm_compute
 {
+NESoftmaxLayer::NESoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _max_kernel(), _softmax_kernel(), _flat_or_reshape_kernel_ptr(nullptr), _fill_border_kernel(), _reshape_kernel(), _max(), _tmp(), _input_flattened(),
+      _output_flattened(), _needs_flattening(false)
+{
+}
+
+void NESoftmaxLayer::configure_reshape_input_kernel(const ITensor *input, const ITensor *output, size_t axis)
+{
+    // Flatten the input
+    const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input->info(), axis);
+
+    // Initialize the flat input
+    _input_flattened.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten));
+
+    // If we need to flatten the input, we can use NEFlattenKernel or NEReshapeKernel
+    // If flattening on the third axis, we use NEFlattenKernel.
+    // In all other cases we have to use NEReshapeKernel
+    if(axis != 3)
+    {
+        auto reshape_kernel_ptr = support::cpp14::make_unique<NEReshapeLayerKernel>();
+        reshape_kernel_ptr->configure(input, &_input_flattened);
+        _flat_or_reshape_kernel_ptr = std::move(reshape_kernel_ptr);
+    }
+    else
+    {
+        auto flatten_kernel_ptr = support::cpp14::make_unique<NEFlattenLayerKernel>();
+        flatten_kernel_ptr->configure(input, &_input_flattened);
+        _flat_or_reshape_kernel_ptr = std::move(flatten_kernel_ptr);
+    }
+
+    // We need to init the output tensor here. Indeed, the reshape kernel expects
+    // both tensors to be already initialized
+    auto_init_if_empty(*output->info(), *input->info()->clone());
 }
 
 void NESoftmaxLayer::configure(ITensor *input, ITensor *output, float beta, size_t axis)
 {
+    // Perform validation step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_UNUSED(axis);
+    ARM_COMPUTE_ERROR_THROW_ON(NESoftmaxLayer::validate(input->info(), output->info(), beta, axis));
 
-    // Configure Kernels
-    _max_kernel.configure(input, &_max);
-    _fill_border_kernel.configure(input, _max_kernel.border_size(), BorderMode::REPLICATE);
-    _softmax_kernel.configure(input, &_max, output, beta, &_tmp);
+    // Flattening is not needed only when the input is 2D and the axis is 1
+    _needs_flattening = axis != 1;
+
+    // If we are dealing with a 4D tensor, we will:
+    // - Flatten the input, so that we end up with a [width*height*depth] * batches 2D tensor
+    // - Execute all the pipeline (reduction + normalization) on the flattened tensor
+    // - Reshape the flattened output into the real output
+    if(_needs_flattening)
+    {
+        // Add to the memory manager _input_flattened
+        _memory_group.manage(&_input_flattened);
+
+        // Configure the flatten/reshape kernel and _input_flattened
+        configure_reshape_input_kernel(input, output, axis);
+    }
+
+    // We want to deal with a 2D input. Either it is the flattened version of the original input (4D case)
+    // or it is the original input case (2D case)
+    ITensor *input_2D = (_needs_flattening ? &_input_flattened : input);
+
+    // Create intermediate tensors shapes
+    const TensorInfo input_info    = input_2D->info()->clone()->reset_padding().set_is_resizable(true);
+    DataType         tmp_data_type = is_data_type_quantized_asymmetric(input_2D->info()->data_type()) ? DataType::F32 : input_2D->info()->data_type();
+    TensorInfo       tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type));
 
     // Init intermediate tensors
-    _max.allocator()->init(*_max.info());
-    _tmp.allocator()->init(*_tmp.info());
+    TensorShape max_sum_shape = input_2D->info()->tensor_shape();
+    max_sum_shape.set(0, 1);
+    _max.allocator()->init(input_info.clone()->set_tensor_shape(max_sum_shape));
+    _tmp.allocator()->init(tensor_info_tmp);
 
     // Manage intermediate buffers
     _memory_group.manage(&_max);
     _memory_group.manage(&_tmp);
 
-    // Allocate intermediate tensors
+    // Configure Kernels
+    _max_kernel.configure(input_2D, &_max);
+    if(_needs_flattening)
+    {
+        // Add to the memory manager _output_flattened
+        _memory_group.manage(&_output_flattened);
+
+        // The normalization kernel stores the result in a flat output tensor
+        _softmax_kernel.configure(input_2D, &_max, &_output_flattened, beta, &_tmp);
+        _input_flattened.allocator()->allocate();
+
+        // Reshape the flat output into the requested (4D) output
+        _reshape_kernel.configure(&_output_flattened, output);
+
+        // Allocate the intermediate flat tensors
+        _output_flattened.allocator()->allocate();
+    }
+    else
+    {
+        // Softmax 2D case
+        _fill_border_kernel.configure(input_2D, _max_kernel.border_size(), BorderMode::REPLICATE);
+        _softmax_kernel.configure(input_2D, &_max, output, beta, &_tmp);
+    }
+
+    // Allocate intermediate buffers
     _max.allocator()->allocate();
     _tmp.allocator()->allocate();
 }
 
 Status NESoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, size_t axis)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis != 1, "Axis must be 1 for NEON");
-
     // Perform validation step
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 2, "Only 2D inputs are supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4, "Only up to 4 dimensions are supported");
+    ARM_COMPUTE_UNUSED(beta);
+    ARM_COMPUTE_RETURN_ERROR_ON(axis < 1 || input->num_dimensions() < axis);
 
-    const TensorShape max_shape           = TensorShape(input->tensor_shape()).set(0, 1);
-    const TensorInfo  tensor_info_max_sum = TensorInfo(*input).set_tensor_shape(max_shape).reset_padding();
-    const TensorInfo  dont_care;
+    // Create intermediate tensor info
+    DataType         tmp_data_type = input->data_type();
+    const TensorInfo tensor_info_tmp(input->clone()->set_data_type(tmp_data_type).set_is_resizable(true));
+
+    TensorShape max_sum_shape = input->tensor_shape();
+    max_sum_shape.set(0, 1);
+    const TensorInfo tensor_info_max_sum(input->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(input->quantization_info()).set_is_resizable(true));
+    const TensorInfo dont_care;
+
+    const bool needs_flattening = (axis != 1);
+
+    if(needs_flattening)
+    {
+        const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input, axis);
+        TensorInfo        tensor_info_flat(input->clone()->set_tensor_shape(shape_flatten).set_is_resizable(true));
+
+        if(axis != 3)
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(input, &tensor_info_flat));
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayerKernel::validate(input, &tensor_info_flat));
+        }
+    }
 
     ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DMaxKernel::validate(input, &tensor_info_max_sum));
-    ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DSoftmaxKernel::validate(input, &tensor_info_max_sum, output, beta, &dont_care));
+    ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DSoftmaxKernel::validate(&tensor_info_tmp, &tensor_info_max_sum, output, beta, &dont_care));
 
     return Status{};
 }
@@ -81,9 +182,20 @@
 {
     _memory_group.acquire();
 
+    if(_needs_flattening)
+    {
+        NEScheduler::get().schedule(_flat_or_reshape_kernel_ptr.get(), Window::DimY);
+    }
+
     NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
     NEScheduler::get().schedule(&_max_kernel, Window::DimY);
     NEScheduler::get().schedule(&_softmax_kernel, Window::DimY);
 
+    if(_needs_flattening)
+    {
+        NEScheduler::get().schedule(&_reshape_kernel, Window::DimY);
+    }
+
     _memory_group.release();
 }
+} // namespace arm_compute
\ No newline at end of file
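
A minimal usage sketch of the extended softmax interface; the 4D shape and the axis value are assumptions. With axis != 1 the function now takes the internal flatten/reshape path described in the comments above.

    #include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor input, output;
        // 4D input: width x height x channels x batches
        input.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U, 2U), 1, DataType::F32));
        output.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U, 2U), 1, DataType::F32));

        NESoftmaxLayer softmax;
        softmax.configure(&input, &output, /* beta */ 1.0f, /* axis */ 2); // triggers the flatten path

        input.allocator()->allocate();
        output.allocator()->allocate();
        softmax.run();
        return 0;
    }
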
diff --git a/src/runtime/NEON/functions/NESplit.cpp b/src/runtime/NEON/functions/NESplit.cpp
new file mode 100644
index 0000000..e947657
--- /dev/null
+++ b/src/runtime/NEON/functions/NESplit.cpp
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NESplit.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+NESplit::NESplit()
+    : _outputs_vector(), _slice_functions(), _num_outputs(0)
+{
+}
+
+void NESplit::configure(const ITensor *input, const std::vector<ITensor *> &outputs, unsigned int axis)
+{
+    // Create Slice functions
+    _num_outputs     = outputs.size();
+    _slice_functions = arm_compute::support::cpp14::make_unique<NESlice[]>(_num_outputs);
+
+    // Get output shape
+    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_split_shape(input->info(), axis, _num_outputs);
+
+    // Extract output tensor info
+    std::vector<ITensorInfo *> outputs_info;
+    for(auto &output : outputs)
+    {
+        ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+        outputs_info.emplace_back(output->info());
+    }
+
+    // Validate
+    ARM_COMPUTE_ERROR_THROW_ON(NESplit::validate(input->info(), outputs_info, axis));
+
+    const size_t axis_split_step = output_shape[axis];
+    unsigned int axis_offset     = 0;
+
+    // Start/End coordinates
+    Coordinates start_coords;
+    Coordinates end_coords;
+    for(unsigned int d = 0; d < output_shape.num_dimensions(); ++d)
+    {
+        end_coords.set(d, -1);
+    }
+
+    for(unsigned int i = 0; i < _num_outputs; i++)
+    {
+        // Update coordinate on axis
+        start_coords.set(axis, axis_offset);
+        end_coords.set(axis, axis_offset + axis_split_step);
+
+        // Configure slice function
+        _slice_functions[i].configure(input, outputs[i], start_coords, end_coords);
+
+        // Set valid region from shape
+        outputs[i]->info()->set_valid_region(ValidRegion(Coordinates(), output_shape));
+
+        // Update axis offset
+        axis_offset += axis_split_step;
+    }
+}
+
+Status NESplit::validate(const ITensorInfo *input, const std::vector<ITensorInfo *> &outputs, unsigned int axis)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+    ARM_COMPUTE_RETURN_ERROR_ON(axis >= input->num_dimensions());
+    ARM_COMPUTE_RETURN_ERROR_ON(outputs.size() < 2);
+
+    // Get output shape
+    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_split_shape(input, axis, outputs.size());
+    ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() == 0);
+
+    const size_t axis_split_step = output_shape[axis];
+    unsigned int axis_offset     = 0;
+
+    // Start/End coordinates
+    Coordinates start_coords;
+    Coordinates end_coords;
+    for(unsigned int d = 0; d < output_shape.num_dimensions(); ++d)
+    {
+        end_coords.set(d, -1);
+    }
+
+    // Validate output tensors
+    for(const auto &output : outputs)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+
+        // Output auto-initialization if not yet initialized
+        TensorInfo tmp_output_info = *output->clone();
+        auto_init_if_empty(tmp_output_info, input->clone()->set_is_resizable(true).set_tensor_shape(output_shape));
+
+        // Update coordinate on axis
+        start_coords.set(axis, axis_offset);
+        end_coords.set(axis, axis_offset + axis_split_step);
+
+        ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(input, output, start_coords, end_coords));
+        axis_offset += axis_split_step;
+    }
+
+    return Status{};
+}
+
+void NESplit::run()
+{
+    for(unsigned i = 0; i < _num_outputs; ++i)
+    {
+        _slice_functions[i].run();
+    }
+}
+} // namespace arm_compute
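
A minimal usage sketch of the new NESplit function, splitting an assumed 8-channel tensor into two 4-channel outputs along axis 2; the shapes are illustrative.

    #include "arm_compute/runtime/NEON/functions/NESplit.h"
    #include "arm_compute/runtime/Tensor.h"

    #include <vector>

    using namespace arm_compute;

    int main()
    {
        Tensor input, out0, out1;
        input.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));

        std::vector<ITensor *> outputs = { &out0, &out1 };

        NESplit split;
        split.configure(&input, outputs, /* axis */ 2); // each output becomes 16x16x4

        input.allocator()->allocate();
        out0.allocator()->allocate();
        out1.allocator()->allocate();
        split.run();
        return 0;
    }
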
diff --git a/src/runtime/NEON/functions/NEStackLayer.cpp b/src/runtime/NEON/functions/NEStackLayer.cpp
new file mode 100644
index 0000000..2f49c22
--- /dev/null
+++ b/src/runtime/NEON/functions/NEStackLayer.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEStackLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "support/ToolchainSupport.h"
+namespace arm_compute
+{
+NEStackLayer::NEStackLayer() // NOLINT
+    : _input(),
+      _stack_kernels(),
+      _num_inputs(0)
+{
+}
+
+void NEStackLayer::configure(const std::vector<ITensor *> &input, int axis, ITensor *output)
+{
+    _num_inputs    = input.size();
+    _stack_kernels = arm_compute::support::cpp14::make_unique<NEStackLayerKernel[]>(_num_inputs);
+
+    // Wrap around negative values
+    const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1));
+
+    for(unsigned int i = 0; i < _num_inputs; i++)
+    {
+        _stack_kernels[i].configure(input[i], axis_u, i, _num_inputs, output);
+    }
+}
+
+Status NEStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input.empty());
+
+    // Wrap around negative values
+    const size_t       rank   = input[0]->num_dimensions();
+    const unsigned int axis_u = wrap_around(axis, static_cast<int>(rank + 1));
+
+    const unsigned int num_inputs = input.size();
+
+    for(unsigned int i = 0; i < num_inputs; i++)
+    {
+        // All the tensors must have the same rank
+        ARM_COMPUTE_RETURN_ERROR_ON(input[i]->num_dimensions() != rank);
+        // Validate Kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(NEStackLayerKernel::validate(input[i], axis_u, i, num_inputs, output));
+    }
+
+    return Status{};
+}
+
+void NEStackLayer::run()
+{
+    for(unsigned i = 0; i < _num_inputs; i++)
+    {
+        NEScheduler::get().schedule(&_stack_kernels[i], Window::DimY);
+    }
+}
+} // namespace arm_compute
\ No newline at end of file
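
The stack axis may be negative, counting from the back, with rank + 1 valid positions since stacking adds a dimension. A stand-alone sketch of the wrap-around arithmetic used above (mirroring the helper's documented behaviour):

    #include <cstdio>

    // Map a possibly negative axis into [0, m)
    int wrap_around(int x, int m)
    {
        return x >= 0 ? x % m : (x % m + m) % m;
    }

    int main()
    {
        const int rank = 3;                             // e.g. stacking 3D tensors
        std::printf("%d\n", wrap_around(-1, rank + 1)); // 3: new outermost dimension
        std::printf("%d\n", wrap_around(0, rank + 1));  // 0: new innermost dimension
        return 0;
    }
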
diff --git a/src/runtime/NEON/functions/NEStridedSlice.cpp b/src/runtime/NEON/functions/NEStridedSlice.cpp
new file mode 100644
index 0000000..53eb2b0
--- /dev/null
+++ b/src/runtime/NEON/functions/NEStridedSlice.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEStridedSlice.h"
+
+#include "arm_compute/core/NEON/kernels/NEStridedSliceKernel.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void NEStridedSlice::configure(const ITensor *input, ITensor *output,
+                               const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+                               int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEStridedSliceKernel>();
+    k->configure(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+    _kernel = std::move(k);
+}
+
+Status NEStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+                                int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+    return NEStridedSliceKernel::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+}
+} // namespace arm_compute
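
A minimal usage sketch of the new NEStridedSlice function, extracting every other column of an assumed 8x8 tensor; the masks are left at 0 so the starts and ends are taken literally.

    #include "arm_compute/runtime/NEON/functions/NEStridedSlice.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor input, output;
        input.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));

        const Coordinates starts(0, 0);
        const Coordinates ends(8, 8);
        const BiStrides   strides(2, 1); // stride 2 along X -> 4x8 output

        NEStridedSlice strided_slice;
        strided_slice.configure(&input, &output, starts, ends, strides, 0, 0, 0);

        input.allocator()->allocate();
        output.allocator()->allocate();
        strided_slice.run();
        return 0;
    }
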
diff --git a/src/runtime/NEON/functions/NETile.cpp b/src/runtime/NEON/functions/NETile.cpp
new file mode 100644
index 0000000..0ca4413
--- /dev/null
+++ b/src/runtime/NEON/functions/NETile.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NETile.h"
+
+#include "arm_compute/core/NEON/kernels/NETileKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void NETile::configure(const ITensor *input, ITensor *output, const Multiples &multiples)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NETileKernel>();
+    k->configure(input, output, multiples);
+    _kernel = std::move(k);
+}
+
+Status NETile::validate(const ITensorInfo *input, const ITensorInfo *output, const Multiples &multiples)
+{
+    return NETileKernel::validate(input, output, multiples);
+}
+} // namespace arm_compute
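
A minimal usage sketch of the new NETile function; the shape and repetition factors are assumptions.

    #include "arm_compute/runtime/NEON/functions/NETile.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor input, output;
        input.allocator()->init(TensorInfo(TensorShape(2U, 3U), 1, DataType::F32));

        NETile tile;
        const Multiples multiples = { 2, 1 }; // repeat twice along X -> 4x3 output
        tile.configure(&input, &output, multiples);

        input.allocator()->allocate();
        output.allocator()->allocate();
        tile.run();
        return 0;
    }
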
diff --git a/src/runtime/NEON/functions/NEUnstack.cpp b/src/runtime/NEON/functions/NEUnstack.cpp
new file mode 100644
index 0000000..7532020
--- /dev/null
+++ b/src/runtime/NEON/functions/NEUnstack.cpp
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEUnstack.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+namespace arm_compute
+{
+namespace
+{
+inline unsigned int wrap_axis(int axis, const ITensorInfo *const tensor)
+{
+    return wrap_around(axis, static_cast<int>(tensor->num_dimensions()));
+}
+
+inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &slice_end_mask, const unsigned int input_num_dimensions)
+{
+    // Sets up coordinates to slice the input tensor: start coordinates are all 0s, and the unstacking axis of both start/end is set so that just one 2D tensor is sliced at a time.
+    Coordinates slice_end;
+    slice_start.set_num_dimensions(input_num_dimensions);
+    slice_end.set_num_dimensions(input_num_dimensions);
+    for(size_t k = 0; k < input_num_dimensions; ++k)
+    {
+        slice_start.set(k, 0);
+        slice_end.set(k, -1);
+    }
+    slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(slice_end);
+}
+} // namespace
+
+NEUnstack::NEUnstack() // NOLINT
+    : _num_slices(0),
+      _strided_slice_vector()
+{
+}
+
+void NEUnstack::configure(const ITensor *input, const std::vector<ITensor *> &output_vector, int axis)
+{
+    std::vector<ITensorInfo *> outputs_vector_info(output_vector.size());
+    std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), [](ITensor * t)
+    {
+        ARM_COMPUTE_ERROR_ON_NULLPTR(t);
+        return t->info();
+    });
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+    ARM_COMPUTE_ERROR_THROW_ON(NEUnstack::validate(input->info(), outputs_vector_info, axis));
+
+    // Wrap around negative values
+    const unsigned int axis_u = wrap_axis(axis, input->info());
+    _num_slices               = std::min(outputs_vector_info.size(), input->info()->dimension(axis_u));
+    _strided_slice_vector     = arm_compute::support::cpp14::make_unique<NEStridedSlice[]>(_num_slices);
+
+    Coordinates slice_start;
+    int32_t     slice_end_mask;
+    setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->info()->tensor_shape().num_dimensions());
+    for(unsigned int slice = 0; slice < _num_slices; ++slice)
+    {
+        // Adjust the start coordinate on the unstack axis to take one 2D slice at a time
+        slice_start.set(axis_u, slice);
+        _strided_slice_vector[slice].configure(input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u));
+    }
+}
+
+Status NEUnstack::validate(const ITensorInfo *input, const std::vector<ITensorInfo *> &output_vector, int axis)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+    ARM_COMPUTE_RETURN_ERROR_ON(output_vector.empty());
+    ARM_COMPUTE_RETURN_ERROR_ON(axis < (-static_cast<int>(input->tensor_shape().num_dimensions())));
+    ARM_COMPUTE_RETURN_ERROR_ON(axis >= static_cast<int>(input->tensor_shape().num_dimensions()));
+
+    const unsigned int num_slices = std::min(output_vector.size(), input->dimension(wrap_axis(axis, input)));
+    ARM_COMPUTE_RETURN_ERROR_ON(num_slices > input->dimension(wrap_axis(axis, input)));
+    ARM_COMPUTE_RETURN_ERROR_ON(num_slices > output_vector.size());
+
+    Coordinates slice_start;
+    int32_t     slice_end_mask;
+    for(size_t k = 0; k < num_slices; ++k)
+    {
+        slice_start.set(wrap_axis(axis, input), k);
+        setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->tensor_shape().num_dimensions());
+        ARM_COMPUTE_RETURN_ON_ERROR(NEStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << wrap_axis(axis, input))));
+    }
+    return Status{};
+}
+
+void NEUnstack::run()
+{
+    for(unsigned i = 0; i < _num_slices; ++i)
+    {
+        _strided_slice_vector[i].run();
+    }
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp b/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp
index 097605c..7e435c3 100644
--- a/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp
+++ b/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -66,7 +66,7 @@
     _num_inputs = inputs_vector.size();
 
     std::vector<ITensorInfo *> inputs_vector_info;
-    for(unsigned int i = 0; i < _num_inputs; i++)
+    for(unsigned int i = 0; i < _num_inputs; ++i)
     {
         inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
     }
@@ -80,7 +80,7 @@
 
     _concat_kernels_vector = arm_compute::support::cpp14::make_unique<NEWidthConcatenateLayerKernel[]>(_num_inputs);
 
-    for(unsigned int i = 0; i < _num_inputs; i++)
+    for(unsigned int i = 0; i < _num_inputs; ++i)
     {
         _concat_kernels_vector[i].configure(inputs_vector.at(i), width_offset, output);
         width_offset += inputs_vector.at(i)->info()->dimension(0);
@@ -89,7 +89,7 @@
 
 void NEWidthConcatenateLayer::run()
 {
-    for(unsigned i = 0; i < _num_inputs; i++)
+    for(unsigned i = 0; i < _num_inputs; ++i)
     {
         NEScheduler::get().schedule(_concat_kernels_vector.get() + i, Window::DimY);
     }
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
index c8e3b3b..e37f8ab 100644
--- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -464,6 +464,7 @@
         transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels);
 
         //The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method
+        _memory_group.manage(&_output_nhwc);
         transform_output_kernel->configure(biases, &_output_workspace,
                                            output_matrix_stride, &_output_nhwc,
                                            in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
@@ -483,16 +484,16 @@
                                            in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
     }
 
-    _weights_hwio.allocator()->allocate();
     _gemm_function.configure(&_input_workspace, &_kernel_storage, nullptr, &_output_workspace, 1.0f, 0.f);
     _input_workspace.allocator()->allocate();
-    _kernel_storage.allocator()->allocate();
     _output_workspace.allocator()->allocate();
 
     // Reorder the convoluted output to ACL's ordering NCHW
-    _permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U));
-
-    _output_nhwc.allocator()->allocate();
+    if(data_layout == DataLayout::NCHW)
+    {
+        _permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U));
+        _output_nhwc.allocator()->allocate();
+    }
 
     _transform_input_kernel   = std::move(transform_input_kernel);
     _transform_weights_kernel = std::move(transform_weights_kernel);
@@ -656,10 +657,12 @@
     if(!_is_prepared)
     {
         // Permute weights
+        _weights_hwio.allocator()->allocate();
         _permute_weights.run();
         _weights->mark_as_unused();
 
         // Transform weights
+        _kernel_storage.allocator()->allocate();
         NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX);
 
         _weights_hwio.allocator()->free();
diff --git a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
index c87e82a..34aaea0 100644
--- a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
+++ b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,18 +26,159 @@
 
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/kernels/assembly/Helpers.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
+#include "src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h"
+
+#include <atomic>
+#include <condition_variable>
+#include <mutex>
+
 namespace arm_compute
 {
+#ifndef NO_MULTI_THREADING
+class BufferManagerMultipleThreads final : public IBufferManager
+{
+public:
+    /** Number of buffers to ping pong between */
+    static constexpr unsigned int NUM_BUFFERS = 3;
+
+    explicit BufferManagerMultipleThreads(unsigned int max_num_users)
+        : _max_num_users(max_num_users)
+    {
+    }
+    unsigned int num_buffers() const override
+    {
+        return NUM_BUFFERS;
+    }
+    /* - Lock the requested index if it's free and return true if it needs reshaping.
+     * - Return false without acquiring the lock if the buffer at the index is already reshaped / being reshaped.
+     * - Block if the corresponding buffer for the given index is still being used by a different index.
+     */
+    bool lock_to_reshape_if_needed(unsigned int index) override
+    {
+        Buffer &buf = get_buffer_from_index(index);
+        while(true)
+        {
+            if(buf.index == index && buf.state != State::FREE)
+            {
+                // Another thread is already reshaping / has already reshaped this block: nothing to do
+                return false;
+            }
+            else
+            {
+                std::unique_lock<std::mutex> lock(buf.mutex);
+                //If the buffer is free then lock it for reshaping:
+                if(buf.state == State::FREE)
+                {
+                    buf.index = index;
+                    buf.state = State::BEING_RESHAPED;
+                    return true;
+                }
+                // Check again just in case it changed while we were acquiring the lock:
+                if(buf.index == index)
+                {
+                    //Another thread is reshaping this block already, nothing to do
+                    return false;
+                }
+                // buf.index != index: Buffer still being used by another block, need to wait
+                buf.sem.wait(lock);
+            }
+        }
+    }
+    /* Mark the buffer at the given index as reshaped and release the lock acquired via lock_to_reshape_if_needed() */
+    void mark_as_reshaped(unsigned int index) override
+    {
+        Buffer &buf = get_buffer_from_index(index);
+        {
+            std::lock_guard<std::mutex> lock(buf.mutex);
+            buf.users = _max_num_users;
+            buf.state = State::IN_USE;
+        }
+        buf.sem.notify_all();
+    }
+
+    /* Block until the buffer at the given index is reshaped */
+    void wait_for_reshaping(unsigned int index) override
+    {
+        Buffer &buf = get_buffer_from_index(index);
+        ARM_COMPUTE_ERROR_ON(buf.index != index); // Should have blocked in lock_to_reshape_if_needed()
+        // Check if it's already ready to use:
+        if(buf.state == State::IN_USE)
+            return;
+        std::unique_lock<std::mutex> lock(buf.mutex);
+        //Double check it didn't change while we were acquiring the lock:
+        if(buf.state == State::IN_USE)
+            return;
+        buf.sem.wait(lock);
+    }
+    /* Mark the buffer at the given index as not used by this thread anymore.
+     * Once all the threads have called this method then the buffer is marked as free again.
+     */
+    void mark_as_unused(unsigned int index) override
+    {
+        Buffer &buf = get_buffer_from_index(index);
+        ARM_COMPUTE_ERROR_ON(buf.index != index); // Should have blocked in lock_to_reshape_if_needed()
+        if(--buf.users == 0)
+        {
+            std::unique_lock<std::mutex> lock(buf.mutex);
+            buf.state = State::FREE;
+            lock.unlock();
+            buf.sem.notify_all();
+        }
+    }
+
+private:
+    enum class State
+    {
+        FREE,
+        BEING_RESHAPED,
+        IN_USE
+    };
+    struct Buffer
+    {
+        unsigned int            index{};
+        std::atomic_uint        users{};
+        State                   state{ State::FREE };
+        std::mutex              mutex{};
+        std::condition_variable sem{};
+    } _buffers[NUM_BUFFERS];
+    Buffer &get_buffer_from_index(unsigned int index)
+    {
+        return _buffers[index % NUM_BUFFERS];
+    }
+    unsigned int _max_num_users;
+};
+#endif /* NO_MULTI_THREADING */
+
+class BufferManagerSingleThread : public IBufferManager
+{
+public:
+    unsigned int num_buffers() const override
+    {
+        return 1;
+    }
+    bool lock_to_reshape_if_needed(unsigned int index) override
+    {
+        return true;
+    }
+    void mark_as_reshaped(unsigned int index) override
+    {
+    }
+    void wait_for_reshaping(unsigned int index) override
+    {
+    }
+    void mark_as_unused(unsigned int index) override
+    {
+    }
+};
+
 NEGEMMInterleavedWrapper::NEGEMMInterleavedWrapper(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager))
 {
 }
+
 void NEGEMMInterleavedWrapper::run()
 {
     prepare();
@@ -53,6 +194,7 @@
     {
         if(_pretranspose_b)
         {
+            _transformed_b.allocator()->allocate();
             NEScheduler::get().schedule(_prepare_b.get(), Window::DimX);
             _b->mark_as_unused();
         }
@@ -65,12 +207,13 @@
 
         //Maximum number of workloads to create:
         const unsigned int num_threads    = NEScheduler::get().num_threads();
-        const unsigned int max_iterations = num_threads == 1 ? 1 : num_threads;
+        const unsigned int max_iterations = std::max(num_threads, _num_windows);
         //Maximum number of iterations the parameters allow:
         const unsigned int num_iterations = _batch_window.num_iterations_total();
         // Keep the smallest of the two:
         const unsigned int num_windows  = std::min(num_iterations, max_iterations);
         const TensorShape  window_shape = _batch_window.shape();
+        const unsigned int num_x_blocks = _block_walker.num_iterations(Window::DimX);
 
         // Create a 1D window to dynamically split the batch window:
         Window win_1D;
@@ -79,66 +222,119 @@
         // Create one workload for each sub-window:
         for(unsigned int w = 0; w < num_windows; w++)
         {
-            Window             win          = win_1D.split_window(0, w, num_windows);
-            const Coordinates  start_offset = index2coords(window_shape, win.x().start());
-            const Coordinates  end_offset   = index2coords(window_shape, win.x().end() - 1);
-            const unsigned int num_x_blocks = _block_walker.num_iterations(Window::DimX);
+            Window            win          = win_1D.split_window(0, w, num_windows);
+            const Coordinates start_offset = index2coords(window_shape, win.x().start());
+            const Coordinates end_offset   = index2coords(window_shape, win.x().end() - 1);
 
-            auto workload = [start_offset, end_offset, num_x_blocks, this](const ThreadInfo & info)
+            if(_pretranspose_b)
             {
-                //For each block of rows in "M"
-                auto workload_mm = this->_mm_workloads.begin();
-                for(auto workload_a = this->_a_workloads.begin(); workload_a != this->_a_workloads.end(); workload_a++)
+                auto workload = [start_offset, end_offset, num_x_blocks, this](const ThreadInfo & info)
                 {
-                    // Transform one k_block from A:
-                    this->_transform_a->transform(*workload_a, info, this->_batch_window, start_offset, end_offset);
-                    // Then perform the matrix multiplication for each x block along N:
-                    for(unsigned int i = 0; i < num_x_blocks; i++)
+                    // For each block of rows in "M"
+                    auto workload_mm = this->_mm_workloads.begin();
+                    for(auto workload_a = this->_a_workloads.begin(); workload_a != this->_a_workloads.end(); workload_a++)
                     {
-                        ARM_COMPUTE_ERROR_ON(workload_mm == this->_mm_workloads.end());
-                        this->_matrix_multiply->transform(*workload_mm++, info, this->_batch_window, start_offset, end_offset);
+                        // Transform one k_block from A:
+                        this->_transform_a->transform(*workload_a, info, this->_batch_window, start_offset, end_offset);
+                        // Then perform the matrix multiplication for each x block along N:
+                        for(unsigned int i = 0; i < num_x_blocks; i++)
+                        {
+                            ARM_COMPUTE_ERROR_ON(workload_mm == this->_mm_workloads.end());
+                            this->_matrix_multiply->transform(*workload_mm++, info, this->_batch_window, start_offset, end_offset);
+                        }
                     }
-                }
-            };
-            _workloads.push_back(workload);
+                };
+                _workloads.push_back(workload);
+            }
+            else
+            {
+                auto workload = [num_threads, start_offset, end_offset, num_x_blocks, this](const ThreadInfo & info)
+                {
+                    // For each block of rows in "M"
+                    auto         workload_mm = this->_mm_workloads.begin();
+                    unsigned int workload_b  = 0;
+                    // If there is only one thread then only reshape the B blocks as they are needed:
+                    unsigned int workload_b_next = num_threads == 1 ? this->_b_workloads.size() : 1;
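+                    // Look one block ahead so the next B block is reshaped before it is needed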
+
+                    for(auto workload_a = this->_a_workloads.begin(); workload_a != this->_a_workloads.end(); workload_a++)
+                    {
+                        // Transform one k_block from A:
+                        this->_transform_a->transform(*workload_a, info, this->_batch_window, start_offset, end_offset);
+                        // Then perform the matrix multiplication for each x block along N:
+                        for(unsigned int i = 0; i < num_x_blocks; i++)
+                        {
+                            ARM_COMPUTE_ERROR_ON(workload_mm == this->_mm_workloads.end());
+                            if(workload_b_next < this->_b_workloads.size())
+                            {
+                                // Check with the buffer manager whether this thread should reshape the next B block:
+                                if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b_next))
+                                {
+                                    this->_prepare_b->transform(this->_b_workloads[workload_b_next], info);
+                                    this->_buffer_manager->mark_as_reshaped(workload_b_next);
+                                }
+                                workload_b_next++;
+                            }
+                            ARM_COMPUTE_ERROR_ON(workload_b >= this->_b_workloads.size());
+                            // Reshape the current block if no other thread has done it yet, then wait until it is ready to use:
+                            if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b))
+                            {
+                                this->_prepare_b->transform(this->_b_workloads[workload_b], info);
+                                this->_buffer_manager->mark_as_reshaped(workload_b);
+                            }
+                            this->_buffer_manager->wait_for_reshaping(workload_b);
+                            this->_matrix_multiply->transform(*workload_mm++, info, this->_batch_window, start_offset, end_offset);
+                            this->_buffer_manager->mark_as_unused(workload_b);
+                            workload_b++;
+                        }
+                    }
+                };
+                _workloads.push_back(workload);
+            }
+        }
+        if(!_pretranspose_b && num_windows > 1 && num_windows % num_threads != 0)
+        {
+            // Make sure the number of workloads is a multiple of the number of threads to avoid deadlocks:
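+            // These filler workloads only take part in reshaping B (they perform no matrix multiply), so every thread keeps servicing the buffer manager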
+            for(unsigned int leftover = num_windows % num_threads; leftover != num_threads; leftover++)
+            {
+                auto workload = [this](const ThreadInfo & info)
+                {
+                    unsigned int workload_b = 0;
+                    // Always look one block ahead (these filler workloads only run when there are multiple threads):
+                    unsigned int workload_b_next = 1;
+
+                    for(unsigned int iteration = 0; iteration < this->_mm_workloads.size(); iteration++)
+                    {
+                        if(workload_b_next < this->_b_workloads.size())
+                        {
+                            // Check with the buffer manager whether this thread should reshape the next B block:
+                            if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b_next))
+                            {
+                                this->_prepare_b->transform(this->_b_workloads[workload_b_next], info);
+                                this->_buffer_manager->mark_as_reshaped(workload_b_next);
+                            }
+                            workload_b_next++;
+                        }
+                        ARM_COMPUTE_ERROR_ON(workload_b >= this->_b_workloads.size());
+                        // Reshape the current block if no other thread has done it yet, then wait until it is ready:
+                        if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b))
+                        {
+                            this->_prepare_b->transform(this->_b_workloads[workload_b], info);
+                            this->_buffer_manager->mark_as_reshaped(workload_b);
+                        }
+                        this->_buffer_manager->wait_for_reshaping(workload_b);
+                        this->_buffer_manager->mark_as_unused(workload_b);
+                        workload_b++;
+                    }
+                };
+                _workloads.push_back(workload);
+            }
         }
 
         _is_prepared = true;
     }
 }
 
-namespace
-{
-// Factory to instantiate NEGEMMInterleavedPrepareBWrapperKernel:
-template <typename InputType, bool use_dot = false>
-std::unique_ptr<NEGEMMInterleavedPrepareBWrapperKernel> instantiate_prepareB(const ITensor *b, ITensor *transformed_b, const INEGEMMWrapperKernel::Params &params)
-{
-    auto prepare_b = support::cpp14::make_unique<NEGEMMInterleavedPrepareBWrapperKernelTemplate<InputType, use_dot>>();
-    prepare_b->configure(b, transformed_b, false, NEScheduler::get().cpu_info(), params);
-    return std::move(prepare_b);
-}
-
-// Factory to instantiate NEGEMMInterleavedTransformAWrapperTemplate:
-template <typename InputType, bool use_dot = false>
-std::unique_ptr<NEGEMMInterleavedTransformAWrapper> instantiate_transformA(const ITensor *a, ITensor *transformed_a, const Window &block_walker, const INEGEMMWrapperKernel::Params &params)
-{
-    auto transform_a = support::cpp14::make_unique<NEGEMMInterleavedTransformAWrapperTemplate<InputType, use_dot>>();
-    transform_a->configure(a, transformed_a, false, block_walker, params);
-    return std::move(transform_a);
-}
-
-// Factory to instantiate NEGEMMInterleavedTransformAWrapperTemplate:
-template <typename InputType, typename OutputType, bool use_dot = false>
-std::unique_ptr<NEGEMMInterleavedMatrixMultiplyWrapper> instantiate_matrix_multiply(const ITensor *transformed_a, const ITensor *transformed_b, ITensor *tmp_c, ITensor *c, const Window &block_walker,
-                                                                                    const BlockSizes &block_sizes, const INEGEMMWrapperKernel::Params &params, bool pretranspose_b, float alpha, float beta)
-{
-    auto matrix_multiply = support::cpp14::make_unique<NEGEMMInterleavedMatrixMultiplyWrapperTemplate<InputType, OutputType, use_dot>>();
-    matrix_multiply->configure(transformed_a, transformed_b, tmp_c, c, block_walker, block_sizes, params, pretranspose_b, alpha, beta, NEScheduler::get().num_threads());
-    return std::move(matrix_multiply);
-}
-} // namespace
-
-void NEGEMMInterleavedWrapper::configure(const ITensor *a, const ITensor *b, ITensor *c, float alpha, float beta, bool pretranspose_b, bool use_dot)
+void NEGEMMInterleavedWrapper::configure(const ITensor *a, const ITensor *b, ITensor *c, float alpha, float beta, bool pretranspose_b)
 {
     _params         = INEGEMMWrapperKernel::extract_parameters(a, b, c);
     _a              = a;
@@ -146,124 +342,80 @@
     _c              = c;
     _pretranspose_b = pretranspose_b;
 
-    DataType input_type = a->info()->data_type();
+    const DataType     input_type  = a->info()->data_type();
+    const CPUInfo     &ci          = NEScheduler::get().cpu_info();
+    const unsigned int num_threads = NEScheduler::get().num_threads();
+
+    const arm_gemm::KernelDescription gemm_kernel_info = get_gemm_info(input_type, ci, num_threads, _params, alpha, beta, pretranspose_b);
+    ARM_COMPUTE_ERROR_ON(gemm_kernel_info.method != arm_gemm::GemmMethod::GEMM_INTERLEAVED);
 
     // Forcing 128-byte alignment (required by 32-bit kernels)
     const unsigned int alignment = 128;
     _transformed_b.allocator()->init(TensorInfo{}, alignment);
     _tmp_c.allocator()->init(TensorInfo{}, alignment);
-    _tag = "NEGEMMInterleaved_";
-    _tag += get_strategy_name(input_type, use_dot);
+    _tag = "NEGEMMInterleaved_" + gemm_kernel_info.name;
+
+    // Get strategy
+    std::unique_ptr<detail::IInterleavedStrategy> strategy = detail::create_strategy(gemm_kernel_info.name);
+    ARM_COMPUTE_ERROR_ON(strategy == nullptr);
+    _num_windows                                           = iceildiv(_params.M, strategy->out_height()) * _params.batches;
 
     if(!_pretranspose_b)
     {
+        _block_sizes = strategy->calculate_block_sizes_for_strategy(ci, _params);
+        _batch_window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_block_sizes.m_round, _block_sizes.strategy_out_height), _block_sizes.strategy_out_height));
+        _batch_window.set(Window::DimY, Window::Dimension(0, _params.batches));
+        // If the execution is single-threaded or there is only one window, the buffer manager only needs one buffer; otherwise we use NUM_BUFFERS buffers and ping-pong between them:
+        const unsigned int num_iterations = _batch_window.num_iterations_total();
+        if(NEScheduler::get().num_threads() == 1 || num_iterations == 1)
+        {
+            _buffer_manager = support::cpp14::make_unique<BufferManagerSingleThread>();
+        }
+        else
+        {
+#ifdef NO_MULTI_THREADING
+            ARM_COMPUTE_ERROR("Can't have more than 1 buffer without multiple threads");
+#else  /* NO_MULTI_THREADING */
+            _buffer_manager = support::cpp14::make_unique<BufferManagerMultipleThreads>(NEScheduler::get().num_threads());
+#endif /* NO_MULTI_THREADING */
+        }
         // If B is transposed at every iteration then transformed_B can be managed:
         _memory_group.manage(&_transformed_b);
-        _block_sizes = calculate_block_sizes_from_data_type(NEScheduler::get().cpu_info(), _params.M, _params.N, _params.K, input_type, use_dot);
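+        // When B is reshaped on the fly, transformed_b only needs to hold one B block per buffer: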
+        auto_init_if_empty(*_transformed_b.info(), _b->info()->clone()->set_tensor_shape(TensorShape(_block_sizes.x_block * _block_sizes.k_block, _buffer_manager->num_buffers())));
     }
     else
     {
         _tag += "_preB";
-        switch(input_type)
-        {
-            case DataType::F32:
-                _prepare_b = instantiate_prepareB<float>(_b, &_transformed_b, _params);
-                break;
-#ifdef __aarch64__
-            case DataType::U8:
-            case DataType::QASYMM8:
-                if(use_dot)
-                {
-                    _prepare_b = instantiate_prepareB<uint8_t, true>(_b, &_transformed_b, _params);
-                }
-                else
-                {
-                    _prepare_b = instantiate_prepareB<uint8_t, false>(_b, &_transformed_b, _params);
-                }
-                break;
-            case DataType::S8:
-                if(use_dot)
-                {
-                    _prepare_b = instantiate_prepareB<int8_t, true>(_b, &_transformed_b, _params);
-                }
-                else
-                {
-                    _prepare_b = instantiate_prepareB<int8_t, false>(_b, &_transformed_b, _params);
-                }
-                break;
-#endif /* __aarch64__ */
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-            case DataType::F16:
-                _prepare_b = instantiate_prepareB<__fp16>(_b, &_transformed_b, _params);
-                break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-            default:
-                ARM_COMPUTE_ERROR("DataType not supported");
-                break;
-        }
-        ARM_COMPUTE_ERROR_ON(_prepare_b == nullptr);
+    }
 
+    _prepare_b = strategy->instantiate_prepareB(b, &_transformed_b, _params, ci);
+    ARM_COMPUTE_ERROR_ON(_prepare_b == nullptr);
+
+    if(_pretranspose_b)
+    {
         _block_sizes = _prepare_b->block_sizes();
+        _batch_window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_block_sizes.m_round, _block_sizes.strategy_out_height), _block_sizes.strategy_out_height));
+        _batch_window.set(Window::DimY, Window::Dimension(0, _params.batches));
     }
 
     _block_walker.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_params.N, _block_sizes.x_block), _block_sizes.x_block));
     _block_walker.set(Window::DimY, Window::Dimension(0, ceil_to_multiple(_params.K, _block_sizes.k_block), _block_sizes.k_block));
     _block_walker.set(Window::DimZ, Window::Dimension(0, _params.multis));
 
-    _batch_window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_block_sizes.m_round, _block_sizes.strategy_out_height), _block_sizes.strategy_out_height));
-    _batch_window.set(Window::DimY, Window::Dimension(0, _params.batches));
-
     _transformed_a.allocator()->init(TensorInfo(TensorShape{ _block_sizes.k_block, _block_sizes.m_round, _params.batches }, 1, input_type), alignment);
     _memory_group.manage(&_transformed_a);
     _memory_group.manage(&_tmp_c);
 
-    switch(input_type)
-    {
-        case DataType::F32:
-            _transform_a     = instantiate_transformA<float>(_a, &_transformed_a, _block_walker, _params);
-            _matrix_multiply = instantiate_matrix_multiply<float, float>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
-            break;
-#ifdef __aarch64__
-        case DataType::U8:
-        case DataType::QASYMM8:
-            if(use_dot)
-            {
-                _transform_a     = instantiate_transformA<uint8_t, true>(_a, &_transformed_a, _block_walker, _params);
-                _matrix_multiply = instantiate_matrix_multiply<uint8_t, uint32_t, true>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
-            }
-            else
-            {
-                _transform_a     = instantiate_transformA<uint8_t, false>(_a, &_transformed_a, _block_walker, _params);
-                _matrix_multiply = instantiate_matrix_multiply<uint8_t, uint32_t, false>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
-            }
-            break;
-        case DataType::S8:
-            if(use_dot)
-            {
-                _transform_a     = instantiate_transformA<int8_t, true>(_a, &_transformed_a, _block_walker, _params);
-                _matrix_multiply = instantiate_matrix_multiply<int8_t, int32_t, true>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
-            }
-            else
-            {
-                _transform_a     = instantiate_transformA<int8_t, false>(_a, &_transformed_a, _block_walker, _params);
-                _matrix_multiply = instantiate_matrix_multiply<int8_t, int32_t, false>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
-            }
-            break;
-#endif /* __aarch64__ */
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        case DataType::F16:
-            _transform_a     = instantiate_transformA<__fp16>(_a, &_transformed_a, _block_walker, _params);
-            _matrix_multiply = instantiate_matrix_multiply<__fp16, __fp16>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
-            break;
-            break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-        default:
-            break;
-    }
+    _transform_a     = strategy->instantiate_transformA(_a, &_transformed_a, _block_walker, _params);
+    _matrix_multiply = strategy->instantiate_matrix_multiply(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, alpha, beta, pretranspose_b, num_threads);
     ARM_COMPUTE_ERROR_ON(_transform_a == nullptr);
     ARM_COMPUTE_ERROR_ON(_matrix_multiply == nullptr);
+
     _transformed_a.allocator()->allocate();
     _tmp_c.allocator()->allocate();
-    _transformed_b.allocator()->allocate();
+    if(!_pretranspose_b)
+    {
+        _transformed_b.allocator()->allocate();
+    }
 }
 } // namespace arm_compute
diff --git a/src/runtime/OffsetLifetimeManager.cpp b/src/runtime/OffsetLifetimeManager.cpp
index d0b3bde..ad23220 100644
--- a/src/runtime/OffsetLifetimeManager.cpp
+++ b/src/runtime/OffsetLifetimeManager.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,8 +34,16 @@
 #include <map>
 #include <vector>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
+namespace
+{
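+// Round 'offset' up to the next multiple of 'alignment' (an alignment of 0 means no constraint)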
+size_t align_offset(size_t offset, size_t alignment)
+{
+    const size_t remainder = (alignment != 0U) ? offset % alignment : 0U;
+    return (remainder != 0U) ? offset + (alignment - remainder) : offset;
+}
+} // namespace
 OffsetLifetimeManager::OffsetLifetimeManager()
     : _blob(0)
 {
@@ -58,11 +66,15 @@
     ARM_COMPUTE_ERROR_ON(_active_group == nullptr);
 
     // Update blob size
-    size_t max_group_size = std::accumulate(std::begin(_free_blobs), std::end(_free_blobs), static_cast<size_t>(0), [](size_t s, const Blob & b)
+    size_t max_aggregated_size = 0;
+    std::for_each(std::begin(_free_blobs), std::end(_free_blobs), [&](const Blob & b)
     {
-        return s + b.max_size;
+        max_aggregated_size += b.max_size;
+        _blob.alignment = std::max(_blob.alignment, b.max_alignment);
     });
-    _blob = std::max(_blob, max_group_size);
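+    // Account for worst-case alignment padding (up to 'alignment' bytes per free blob)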
+    max_aggregated_size += _free_blobs.size() * _blob.alignment;
+    _blob.owners = std::max(_blob.owners, _free_blobs.size());
+    _blob.size   = std::max(_blob.size, max_aggregated_size);
 
     // Calculate group mappings
     auto &group_mappings = _active_group->mappings();
@@ -76,6 +88,8 @@
             group_mappings[bound_element.handle] = offset;
         }
         offset += free_blob.max_size;
-        ARM_COMPUTE_ERROR_ON(offset > _blob);
+        offset = align_offset(offset, _blob.alignment);
+        ARM_COMPUTE_ERROR_ON(offset > _blob.size);
     }
 }
+} // namespace arm_compute
diff --git a/src/runtime/OffsetMemoryPool.cpp b/src/runtime/OffsetMemoryPool.cpp
index 36eaf0b..70cbe90 100644
--- a/src/runtime/OffsetMemoryPool.cpp
+++ b/src/runtime/OffsetMemoryPool.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,11 +34,11 @@
 
 using namespace arm_compute;
 
-OffsetMemoryPool::OffsetMemoryPool(IAllocator *allocator, size_t blob_size)
-    : _allocator(allocator), _blob(), _blob_size(blob_size)
+OffsetMemoryPool::OffsetMemoryPool(IAllocator *allocator, BlobInfo blob_info)
+    : _allocator(allocator), _blob(), _blob_info(blob_info)
 {
     ARM_COMPUTE_ERROR_ON(!allocator);
-    _blob = _allocator->make_region(blob_size, 0);
+    _blob = _allocator->make_region(blob_info.size, blob_info.alignment);
 }
 
 void OffsetMemoryPool::acquire(MemoryMappings &handles)
@@ -49,7 +49,7 @@
     for(auto &handle : handles)
     {
         ARM_COMPUTE_ERROR_ON(handle.first == nullptr);
-        handle.first->set_owned_region(_blob->extract_subregion(handle.second, _blob_size - handle.second));
+        handle.first->set_owned_region(_blob->extract_subregion(handle.second, _blob_info.size - handle.second));
     }
 }
 
@@ -70,5 +70,5 @@
 std::unique_ptr<IMemoryPool> OffsetMemoryPool::duplicate()
 {
     ARM_COMPUTE_ERROR_ON(!_allocator);
-    return support::cpp14::make_unique<OffsetMemoryPool>(_allocator, _blob_size);
+    return support::cpp14::make_unique<OffsetMemoryPool>(_allocator, _blob_info);
 }
\ No newline at end of file
diff --git a/src/runtime/TensorAllocator.cpp b/src/runtime/TensorAllocator.cpp
index 5fa51d7..38edb8b 100644
--- a/src/runtime/TensorAllocator.cpp
+++ b/src/runtime/TensorAllocator.cpp
@@ -138,7 +138,7 @@
     }
     else
     {
-        _associated_memory_group->finalize_memory(_owner, _memory, info().total_size());
+        _associated_memory_group->finalize_memory(_owner, _memory, info().total_size(), alignment());
     }
     info().set_is_resizable(false);
 }